diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..1c81f286 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-08-31T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.16911v1","updated":"2023-08-31T17:59:46Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have created a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, thereby enabling LLMs to understand point clouds and offering\na new avenue beyond 2D visual data. PointLLM processes colored object point\nclouds with human instructions and generates contextually appropriate\nresponses, illustrating its grasp of point clouds and common sense.\nSpecifically, it leverages a point cloud encoder with a powerful LLM to\neffectively fuse geometric, appearance, and linguistic information. We collect\na novel dataset comprising 660K simple and 70K complex point-text instruction\npairs to enable a two-stage training strategy: initially aligning latent spaces\nand subsequently instruction-tuning the unified model. To rigorously evaluate\nour model's perceptual abilities and its generalization capabilities, we\nestablish two benchmarks: Generative 3D Object Classification and 3D Object\nCaptioning, assessed through three different methods, including human\nevaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment\nresults show that PointLLM demonstrates superior performance over existing 2D\nbaselines. Remarkably, in human-evaluated object captioning tasks, PointLLM\noutperforms human annotators in over 50% of the samples. Codes, datasets, and\nbenchmarks are available at https://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v1.pdf","comment":"19 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2308.16898v1","updated":"2023-08-31T17:57:50Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. 
We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. (3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16890v1","updated":"2023-08-31T17:52:04Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v1.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2308.16884v1","updated":"2023-08-31T17:43:08Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. 
Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v1.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.16871v1","updated":"2023-08-31T17:20:50Z","published":"2023-08-31T17:20:50Z","title":"The Gender-GAP Pipeline: A Gender-Aware Polyglot Pipeline for Gender\n Characterisation in 55 Languages","summary":" Gender biases in language generation systems are challenging to mitigate. One\npossible source for these biases is gender representation disparities in the\ntraining and evaluation data. Despite recent progress in documenting this\nproblem and many attempts at mitigating it, we still lack shared methodology\nand tooling to report gender representation in large datasets. Such\nquantitative reporting will enable further mitigation, e.g., via data\naugmentation. This paper describes the Gender-GAP Pipeline (for Gender-Aware\nPolyglot Pipeline), an automatic pipeline to characterize gender representation\nin large-scale datasets for 55 languages. The pipeline uses a multilingual\nlexicon of gendered person-nouns to quantify the gender representation in text.\nWe showcase it to report gender representation in WMT training data and\ndevelopment data for the News task, confirming that current data is skewed\ntowards masculine representation. Having unbalanced datasets may indirectly\noptimize our systems towards outperforming one gender over the others. We\nsuggest introducing our gender quantification pipeline in current datasets and,\nideally, modifying them toward a balanced representation.\n","authors":["Benjamin Muller","Belen Alastruey","Prangthip Hansanti","Elahe Kalbassi","Christophe Ropers","Eric Michael Smith","Adina Williams","Luke Zettlemoyer","Pierre Andrews","Marta R. 
Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2308.16871v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2307.11764v2","updated":"2023-08-31T17:09:23Z","published":"2023-07-14T17:24:15Z","title":"Sensi-BERT: Towards Sensitivity Driven Fine-Tuning for\n Parameter-Efficient BERT","summary":" Large pre-trained language models have recently gained significant traction\ndue to their improved performance on various down-stream tasks like text\nclassification and question answering, requiring only few epochs of\nfine-tuning. However, their large model sizes often prohibit their applications\non resource-constrained edge devices. Existing solutions of yielding\nparameter-efficient BERT models largely rely on compute-exhaustive training and\nfine-tuning. Moreover, they often rely on additional compute heavy models to\nmitigate the performance gap. In this paper, we present Sensi-BERT, a\nsensitivity driven efficient fine-tuning of BERT models that can take an\noff-the-shelf pre-trained BERT model and yield highly parameter-efficient\nmodels for downstream tasks. In particular, we perform sensitivity analysis to\nrank each individual parameter tensor, that then is used to trim them\naccordingly during fine-tuning for a given parameter or FLOPs budget. Our\nexperiments show the efficacy of Sensi-BERT across different downstream tasks\nincluding MNLI, QQP, QNLI, SST-2 and SQuAD, showing better performance at\nsimilar or smaller parameter budget compared to various alternatives.\n","authors":["Souvik Kundu","Sharath Nittur Sridhar","Maciej Szankin","Sairam Sundaresan"],"pdf_url":"https://arxiv.org/pdf/2307.11764v2.pdf","comment":"6 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.16824v1","updated":"2023-08-31T15:53:51Z","published":"2023-08-31T15:53:51Z","title":"Can Programming Languages Boost Each Other via Instruction Tuning?","summary":" When human programmers have mastered a programming language, it would be\neasier when they learn a new programming language. In this report, we focus on\nexploring whether programming languages can boost each other during the\ninstruction fine-tuning phase of code large language models. We conduct\nextensive experiments of 8 popular programming languages (Python, JavaScript,\nTypeScript, C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that\nprogramming languages can significantly improve each other. For example,\nCodeM-Python 15B trained on Python is able to increase Java by an absolute\n17.95% pass@1 on HumanEval-X. More surprisingly, we found that CodeM-HTML 7B\ntrained on the HTML corpus can improve Java by an absolute 15.24% pass@1. Our\ntraining data is released at https://github.com/NL2Code/CodeM.\n","authors":["Daoguang Zan","Ailun Yu","Bo Shen","Jiaxin Zhang","Taihong Chen","Bing Geng","Bei Chen","Jichuan Ji","Yafen Yao","Yongji Wang","Qianxiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16824v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2211.11483v3","updated":"2023-08-31T15:43:56Z","published":"2022-11-21T14:18:25Z","title":"Deanthropomorphising NLP: Can a Language Model Be Conscious?","summary":" This work is intended as a voice in the discussion over previous claims that\na pretrained large language model (LLM) based on the Transformer model\narchitecture can be sentient. Such claims have been made concerning the LaMDA\nmodel and also concerning the current wave of LLM-powered chatbots, such as\nChatGPT. 
This claim, if confirmed, would have serious ramifications in the\nNatural Language Processing (NLP) community due to wide-spread use of similar\nmodels. However, here we take the position that such a large language model\ncannot be sentient, or conscious, and that LaMDA in particular exhibits no\nadvances over other similar models that would qualify it. We justify this by\nanalysing the Transformer architecture through Integrated Information Theory of\nconsciousness. We see the claims of sentience as part of a wider tendency to\nuse anthropomorphic language in NLP reporting. Regardless of the veracity of\nthe claims, we consider this an opportune moment to take stock of progress in\nlanguage modelling and consider the ethical implications of the task. In order\nto make this work helpful for readers outside the NLP community, we also\npresent the necessary background in language modelling.\n","authors":["Matthew Shardlow","Piotr Przybyła"],"pdf_url":"https://arxiv.org/pdf/2211.11483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16797v1","updated":"2023-08-31T15:19:28Z","published":"2023-08-31T15:19:28Z","title":"Simple LLM Prompting is State-of-the-Art for Robust and Multilingual\n Dialogue Evaluation","summary":" Despite significant research effort in the development of automatic dialogue\nevaluation metrics, little thought is given to evaluating dialogues other than\nin English. At the same time, ensuring metrics are invariant to semantically\nsimilar responses is also an overlooked topic. In order to achieve the desired\nproperties of robustness and multilinguality for dialogue evaluation metrics,\nwe propose a novel framework that takes advantage of the strengths of current\nevaluation models with the newly-established paradigm of prompting Large\nLanguage Models (LLMs). 
Empirical results show our framework achieves state of\nthe art results in terms of mean Spearman correlation scores across several\nbenchmarks and ranks first place on both the Robust and Multilingual tasks of\nthe DSTC11 Track 4 \"Automatic Evaluation Metrics for Open-Domain Dialogue\nSystems\", proving the evaluation capabilities of prompted LLMs.\n","authors":["John Mendonça","Patrícia Pereira","João Paulo Carvalho","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2308.16797v1.pdf","comment":"DSTC11 best paper for Track 4"},{"id":"http://arxiv.org/abs/2308.16795v1","updated":"2023-08-31T15:15:26Z","published":"2023-08-31T15:15:26Z","title":"Towards Multilingual Automatic Dialogue Evaluation","summary":" The main limiting factor in the development of robust multilingual dialogue\nevaluation metrics is the lack of multilingual data and the limited\navailability of open sourced multilingual dialogue systems. In this work, we\npropose a workaround for this lack of data by leveraging a strong multilingual\npretrained LLM and augmenting existing English dialogue data using Machine\nTranslation. We empirically show that the naive approach of finetuning a\npretrained multilingual encoder model with translated data is insufficient to\noutperform the strong baseline of finetuning a multilingual model with only\nsource data. Instead, the best approach consists in the careful curation of\ntranslated data using MT Quality Estimation metrics, excluding low quality\ntranslations that hinder its performance.\n","authors":["John Mendonça","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2308.16795v1.pdf","comment":"SIGDIAL23"},{"id":"http://arxiv.org/abs/2308.16770v1","updated":"2023-08-31T14:47:00Z","published":"2023-08-31T14:47:00Z","title":"Enhancing PLM Performance on Labour Market Tasks via Instruction-based\n Finetuning and Prompt-tuning with Rules","summary":" The increased digitization of the labour market has given researchers,\neducators, and companies the means to analyze and better understand the labour\nmarket. However, labour market resources, although available in high volumes,\ntend to be unstructured, and as such, research towards methodologies for the\nidentification, linking, and extraction of entities becomes more and more\nimportant. Against the backdrop of this quest for better labour market\nrepresentations, resource constraints and the unavailability of large-scale\nannotated data cause a reliance on human domain experts. We demonstrate the\neffectiveness of prompt-based tuning of pre-trained language models (PLM) in\nlabour market specific applications. 
Our results indicate that cost-efficient\nmethods such as PTR and instruction tuning without exemplars can significantly\nincrease the performance of PLMs on downstream labour market applications\nwithout introducing additional model layers, manual annotations, and data\naugmentation.\n","authors":["Jarno Vrolijk","David Graus"],"pdf_url":"https://arxiv.org/pdf/2308.16770v1.pdf","comment":"accepted for publication at RecSys in HR 2023"},{"id":"http://arxiv.org/abs/2308.16763v1","updated":"2023-08-31T14:31:48Z","published":"2023-08-31T14:31:48Z","title":"Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection","summary":" Chain-of-Thought Prompting (CoT) reinforces the reasoning capabilities of\nLarge Language Models (LLMs) through the generation of intermediate rationales.\nHowever, these enhancements predominantly benefit large-scale models, leaving\nsmall LMs without significant performance improvements when directly applying\nCoT. Despite the advanced reasoning capabilities of LLMs, CoT relies primarily\non their pre-trained internal knowledge. The external knowledge that is\npreviously unknown to the model remains unexploited. This omission becomes\npronounced in tasks such as stance detection, where the external background\nknowledge plays a pivotal role. Additionally, the large-scale architecture of\nLLMs inevitably present efficiency challenges during deployment. To address\nthese challenges, we introduce the Ladder-of-Thought (LoT) for stance\ndetection. Grounded in a dual-phase Cascaded Optimization framework, LoT\ndirects the model to incorporate high-quality external knowledge, enhancing the\nintermediate rationales it generates. These bolstered rationales subsequently\nserve as the foundation for more precise predictions - akin to how a ladder\nfacilitates reaching elevated goals. LoT achieves a balance between efficiency\nand accuracy, making it an adaptable and efficient framework for stance\ndetection. Our empirical evaluations underscore LoT's effectiveness, marking a\n16% improvement over ChatGPT and a 10% enhancement compared to ChatGPT with\nCoT.\n","authors":["Kairui Hu","Ming Yan","Joey Tianyi Zhou","Ivor W. Tsang","Wen Haw Chong","Yong Keong Yap"],"pdf_url":"https://arxiv.org/pdf/2308.16763v1.pdf","comment":"5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.10811v2","updated":"2023-08-31T14:13:31Z","published":"2023-07-20T16:55:25Z","title":"\"It Felt Like Having a Second Mind\": Investigating Human-AI\n Co-creativity in Prewriting with Large Language Models","summary":" Prewriting is the process of discovering and developing ideas before a first\ndraft, which requires divergent thinking and often implies unstructured\nstrategies such as diagramming, outlining, free-writing, etc. Although large\nlanguage models (LLMs) have been demonstrated to be useful for a variety of\ntasks including creative writing, little is known about how users would\ncollaborate with LLMs to support prewriting. The preferred collaborative role\nand initiative of LLMs during such a creativity process is also unclear. To\ninvestigate human-LLM collaboration patterns and dynamics during prewriting, we\nconducted a three-session qualitative study with 15 participants in two\ncreative tasks: story writing and slogan writing. The findings indicated that\nduring collaborative prewriting, there appears to be a three-stage iterative\nHuman-AI Co-creativity process that includes Ideation, Illumination, and\nImplementation stages. 
This collaborative process champions the human in a\ndominant role, in addition to mixed and shifting levels of initiative that\nexist between humans and LLMs. This research also reports on collaboration\nbreakdowns that occur during this process, user perceptions of using existing\nLLMs during Human-AI Co-creativity, and discusses design implications to\nsupport this co-creativity process.\n","authors":["Qian Wan","Siying Hu","Yu Zhang","Piaohong Wang","Bo Wen","Zhicong Lu"],"pdf_url":"https://arxiv.org/pdf/2307.10811v2.pdf","comment":"Under Review; 25 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.06566v4","updated":"2023-08-31T13:43:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source of LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16705v1","updated":"2023-08-31T13:14:47Z","published":"2023-08-31T13:14:47Z","title":"CReHate: Cross-cultural Re-annotation of English Hate Speech Dataset","summary":" English datasets predominantly reflect the perspectives of certain\nnationalities, which can lead to cultural biases in models and datasets. This\nis particularly problematic in tasks heavily influenced by subjectivity, such\nas hate speech detection. To delve into how individuals from different\ncountries perceive hate speech, we introduce CReHate, a cross-cultural\nre-annotation of the sampled SBIC dataset. This dataset includes annotations\nfrom five distinct countries: Australia, Singapore, South Africa, the United\nKingdom, and the United States. Our thorough statistical analysis highlights\nsignificant differences based on nationality, with only 59.4% of the samples\nachieving consensus among all countries. We also introduce a culturally\nsensitive hate speech classifier via transfer learning, adept at capturing\nperspectives of different nationalities. 
These findings underscore the need to\nre-evaluate certain aspects of NLP research, especially with regard to the\nnuanced nature of hate speech in the English language.\n","authors":["Nayeon Lee","Chani Jung","Junho Myung","Jiho Jin","Juho Kim","Alice Oh"],"pdf_url":"https://arxiv.org/pdf/2308.16705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16692v1","updated":"2023-08-31T12:53:09Z","published":"2023-08-31T12:53:09Z","title":"SpeechTokenizer: Unified Speech Tokenizer for Speech Large Language\n Models","summary":" Current speech large language models build upon discrete speech\nrepresentations, which can be categorized into semantic tokens and acoustic\ntokens. However, existing speech tokens are not specifically designed for\nspeech language modeling. To assess the suitability of speech tokens for\nbuilding speech language models, we established the first benchmark,\nSLMTokBench. Our results indicate that neither semantic nor acoustic tokens are\nideal for this purpose. Therefore, we propose SpeechTokenizer, a unified speech\ntokenizer for speech large language models. SpeechTokenizer adopts the\nEncoder-Decoder architecture with residual vector quantization (RVQ). Unifying\nsemantic and acoustic tokens, SpeechTokenizer disentangles different aspects of\nspeech information hierarchically across different RVQ layers. Furthermore, We\nconstruct a Unified Speech Language Model (USLM) leveraging SpeechTokenizer.\nExperiments show that SpeechTokenizer performs comparably to EnCodec in speech\nreconstruction and demonstrates strong performance on the SLMTokBench\nbenchmark. Also, USLM outperforms VALL-E in zero-shot Text-to-Speech tasks.\nCode and models are available at\nhttps://github.com/ZhangXInFD/SpeechTokenizer/.\n","authors":["Xin Zhang","Dong Zhang","Shimin Li","Yaqian Zhou","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.16692v1.pdf","comment":"SpeechTokenizer project page is\n https://0nutation.github.io/SpeechTokenizer.github.io/"},{"id":"http://arxiv.org/abs/2308.16688v1","updated":"2023-08-31T12:45:53Z","published":"2023-08-31T12:45:53Z","title":"Using Large Language Models to Automate Category and Trend Analysis of\n Scientific Articles: An Application in Ophthalmology","summary":" Purpose: In this paper, we present an automated method for article\nclassification, leveraging the power of Large Language Models (LLM). The\nprimary focus is on the field of ophthalmology, but the model is extendable to\nother fields. Methods: We have developed a model based on Natural Language\nProcessing (NLP) techniques, including advanced LLMs, to process and analyze\nthe textual content of scientific papers. Specifically, we have employed\nzero-shot learning (ZSL) LLM models and compared against Bidirectional and\nAuto-Regressive Transformers (BART) and its variants, and Bidirectional Encoder\nRepresentations from Transformers (BERT), and its variant such as distilBERT,\nSciBERT, PubmedBERT, BioBERT. Results: The classification results demonstrate\nthe effectiveness of LLMs in categorizing large number of ophthalmology papers\nwithout human intervention. Results: To evalute the LLMs, we compiled a dataset\n(RenD) of 1000 ocular disease-related articles, which were expertly annotated\nby a panel of six specialists into 15 distinct categories. The model achieved\nmean accuracy of 0.86 and mean F1 of 0.85 based on the RenD dataset.\nConclusion: The proposed framework achieves notable improvements in both\naccuracy and efficiency. 
Its application in the domain of ophthalmology\nshowcases its potential for knowledge organization and retrieval in other\ndomains too. We performed trend analysis that enables the researchers and\nclinicians to easily categorize and retrieve relevant papers, saving time and\neffort in literature review and information gathering as well as identification\nof emerging scientific trends within different disciplines. Moreover, the\nextendibility of the model to other scientific fields broadens its impact in\nfacilitating research and trend analysis across diverse disciplines.\n","authors":["Hina Raja","Asim Munawar","Mohammad Delsoz","Mohammad Elahi","Yeganeh Madadi","Amr Hassan","Hashem Abu Serhan","Onur Inam","Luis Hermandez","Sang Tran","Wuqas Munir","Alaa Abd-Alrazaq","Hao Chen"," SiamakYousefi"],"pdf_url":"https://arxiv.org/pdf/2308.16688v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16687v1","updated":"2023-08-31T12:43:18Z","published":"2023-08-31T12:43:18Z","title":"DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew","summary":" We present DictaBERT, a new state-of-the-art pre-trained BERT model for\nmodern Hebrew, outperforming existing models on most benchmarks. Additionally,\nwe release two fine-tuned versions of the model, designed to perform two\nspecific foundational tasks in the analysis of Hebrew texts: prefix\nsegmentation and morphological tagging. These fine-tuned models allow any\ndeveloper to perform prefix segmentation and morphological tagging of a Hebrew\nsentence with a single call to a HuggingFace model, without the need to\nintegrate any additional libraries or code. In this paper we describe the\ndetails of the training as well and the results on the different benchmarks. We\nrelease the models to the community, along with sample code demonstrating their\nuse. We release these models as part of our goal to help further research and\ndevelopment in Hebrew NLP.\n","authors":["Shaltiel Shmidman","Avi Shmidman","Moshe Koppel"],"pdf_url":"https://arxiv.org/pdf/2308.16687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07462v2","updated":"2023-08-31T11:09:16Z","published":"2023-08-14T21:19:44Z","title":"Playing with Words: Comparing the Vocabulary and Lexical Richness of\n ChatGPT and Humans","summary":" The introduction of Artificial Intelligence (AI) generative language models\nsuch as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has\ntriggered a revolution that can transform how text is generated. This has many\nimplications, for example, as AI-generated text becomes a significant fraction\nof the text, would this have an effect on the language capabilities of readers\nand also on the training of newer AI tools? Would it affect the evolution of\nlanguages? Focusing on one specific aspect of the language: words; will the use\nof tools such as ChatGPT increase or reduce the vocabulary used or the lexical\nrichness? This has implications for words, as those not included in\nAI-generated content will tend to be less and less popular and may eventually\nbe lost. In this work, we perform an initial comparison of the vocabulary and\nlexical richness of ChatGPT and humans when performing the same tasks. In more\ndetail, two datasets containing the answers to different types of questions\nanswered by ChatGPT and humans, and a third dataset in which ChatGPT\nparaphrases sentences and questions are used. The analysis shows that ChatGPT\ntends to use fewer distinct words and lower lexical richness than humans. 
These\nresults are very preliminary and additional datasets and ChatGPT configurations\nhave to be evaluated to extract more general conclusions. Therefore, further\nresearch is needed to understand how the use of ChatGPT and more broadly\ngenerative AI tools will affect the vocabulary and lexical richness in\ndifferent types of text and languages.\n","authors":["Pedro Reviriego","Javier Conde","Elena Merino-Gómez","Gonzalo Martínez","José Alberto Hernández"],"pdf_url":"https://arxiv.org/pdf/2308.07462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16622v1","updated":"2023-08-31T10:31:19Z","published":"2023-08-31T10:31:19Z","title":"Developing a Scalable Benchmark for Assessing Large Language Models in\n Knowledge Graph Engineering","summary":" As the field of Large Language Models (LLMs) evolves at an accelerated pace,\nthe critical need to assess and monitor their performance emerges. We introduce\na benchmarking framework focused on knowledge graph engineering (KGE)\naccompanied by three challenges addressing syntax and error correction, facts\nextraction and dataset generation. We show that while being a useful tool, LLMs\nare yet unfit to assist in knowledge graph generation with zero-shot prompting.\nConsequently, our LLM-KG-Bench framework provides automatic evaluation and\nstorage of LLM responses as well as statistical data and visualization tools to\nsupport tracking of prompt engineering and model performance.\n","authors":["Lars-Peter Meyer","Johannes Frey","Kurt Junghanns","Felix Brei","Kirill Bulert","Sabine Gründer-Fahrer","Michael Martin"],"pdf_url":"https://arxiv.org/pdf/2308.16622v1.pdf","comment":"To be published in SEMANTICS 2023 poster track proceedings. SEMANTICS\n 2023 EU: 19th International Conference on Semantic Systems, September 20-22,\n 2023, Leipzig, Germany"},{"id":"http://arxiv.org/abs/2308.16593v1","updated":"2023-08-31T09:50:33Z","published":"2023-08-31T09:50:33Z","title":"Towards Spontaneous Style Modeling with Semi-supervised Pre-training for\n Conversational Text-to-Speech Synthesis","summary":" The spontaneous behavior that often occurs in conversations makes speech more\nhuman-like compared to reading-style. However, synthesizing spontaneous-style\nspeech is challenging due to the lack of high-quality spontaneous datasets and\nthe high cost of labeling spontaneous behavior. In this paper, we propose a\nsemi-supervised pre-training method to increase the amount of spontaneous-style\nspeech and spontaneous behavioral labels. In the process of semi-supervised\nlearning, both text and speech information are considered for detecting\nspontaneous behaviors labels in speech. 
Moreover, a linguistic-aware encoder is\nused to model the relationship between each sentence in the conversation.\nExperimental results indicate that our proposed method achieves superior\nexpressive speech synthesis performance with the ability to model spontaneous\nbehavior in spontaneous-style speech and predict reasonable spontaneous\nbehavior from text.\n","authors":["Weiqin Li","Shun Lei","Qiaochu Huang","Yixuan Zhou","Zhiyong Wu","Shiyin Kang","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16593v1.pdf","comment":"Accepted by INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2307.01458v2","updated":"2023-08-31T09:39:29Z","published":"2023-07-04T03:34:19Z","title":"CARE-MI: Chinese Benchmark for Misinformation Evaluation in Maternity\n and Infant Care","summary":" The recent advances in natural language processing (NLP), have led to a new\ntrend of applying large language models (LLMs) to real-world scenarios. While\nthe latest LLMs are astonishingly fluent when interacting with humans, they\nsuffer from the misinformation problem by unintentionally generating factually\nfalse statements. This can lead to harmful consequences, especially when\nproduced within sensitive contexts, such as healthcare. Yet few previous works\nhave focused on evaluating misinformation in the long-form (LF) generation of\nLLMs, especially for knowledge-intensive topics. Moreover, although LLMs have\nbeen shown to perform well in different languages, misinformation evaluation\nhas been mostly conducted in English. To this end, we present a benchmark,\nCARE-MI, for evaluating LLM misinformation in: 1) a sensitive topic,\nspecifically the maternity and infant care domain; and 2) a language other than\nEnglish, namely Chinese. Most importantly, we provide an innovative paradigm\nfor building LF generation evaluation benchmarks that can be transferred to\nother knowledge-intensive domains and low-resourced languages. Our proposed\nbenchmark fills the gap between the extensive usage of LLMs and the lack of\ndatasets for assessing the misinformation generated by these models. It\ncontains 1,612 expert-checked questions, accompanied with human-selected\nreferences. Using our benchmark, we conduct extensive experiments and found\nthat current Chinese LLMs are far from perfect in the topic of maternity and\ninfant care. In an effort to minimize the reliance on human resources for\nperformance evaluation, we offer off-the-shelf judgment models for\nautomatically assessing the LF output of LLMs given benchmark questions.\nMoreover, we compare potential solutions for LF generation evaluation and\nprovide insights for building better automated metrics.\n","authors":["Tong Xiang","Liangzhi Li","Wangyue Li","Mingbai Bai","Lu Wei","Bowen Wang","Noa Garcia"],"pdf_url":"https://arxiv.org/pdf/2307.01458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16588v1","updated":"2023-08-31T09:35:52Z","published":"2023-08-31T09:35:52Z","title":"Interpreting Sentiment Composition with Latent Semantic Tree","summary":" As the key to sentiment analysis, sentiment composition considers the\nclassification of a constituent via classifications of its contained\nsub-constituents and rules operated on them. Such compositionality has been\nwidely studied previously in the form of hierarchical trees including untagged\nand sentiment ones, which are intrinsically suboptimal in our view. To address\nthis, we propose semantic tree, a new tree form capable of interpreting the\nsentiment composition in a principled way. 
Semantic tree is a derivation of a\ncontext-free grammar (CFG) describing the specific composition rules on\ndifference semantic roles, which is designed carefully following previous\nlinguistic conclusions. However, semantic tree is a latent variable since there\nis no its annotation in regular datasets. Thus, in our method, it is\nmarginalized out via inside algorithm and learned to optimize the\nclassification performance. Quantitative and qualitative results demonstrate\nthat our method not only achieves better or competitive results compared to\nbaselines in the setting of regular and domain adaptation classification, and\nalso generates plausible tree explanations.\n","authors":["Zhongtao Jiang","Yuanzhe Zhang","Cao Liu","Jiansong Chen","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16588v1.pdf","comment":"Findings of ACL2023"},{"id":"http://arxiv.org/abs/2308.16584v1","updated":"2023-08-31T09:29:35Z","published":"2023-08-31T09:29:35Z","title":"Unsupervised Text Style Transfer with Deep Generative Models","summary":" We present a general framework for unsupervised text style transfer with deep\ngenerative models. The framework models each sentence-label pair in the\nnon-parallel corpus as partially observed from a complete quadruplet which\nadditionally contains two latent codes representing the content and style,\nrespectively. These codes are learned by exploiting dependencies inside the\nobserved data. Then a sentence is transferred by manipulating them. Our\nframework is able to unify previous embedding and prototype methods as two\nspecial forms. It also provides a principled perspective to explain previously\nproposed techniques in the field such as aligned encoder and adversarial\ntraining. We further conduct experiments on three benchmarks. Both automatic\nand human evaluation results show that our methods achieve better or\ncompetitive results compared to several strong baselines.\n","authors":["Zhongtao Jiang","Yuanzhe Zhang","Yiming Ju","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16577v1","updated":"2023-08-31T09:19:15Z","published":"2023-08-31T09:19:15Z","title":"Improving Mandarin Prosodic Structure Prediction with Multi-level\n Contextual Information","summary":" For text-to-speech (TTS) synthesis, prosodic structure prediction (PSP) plays\nan important role in producing natural and intelligible speech. Although\ninter-utterance linguistic information can influence the speech interpretation\nof the target utterance, previous works on PSP mainly focus on utilizing\nintrautterance linguistic information of the current utterance only. This work\nproposes to use inter-utterance linguistic information to improve the\nperformance of PSP. Multi-level contextual information, which includes both\ninter-utterance and intrautterance linguistic information, is extracted by a\nhierarchical encoder from character level, utterance level and discourse level\nof the input text. Then a multi-task learning (MTL) decoder predicts prosodic\nboundaries from multi-level contextual information. Objective evaluation\nresults on two datasets show that our method achieves better F1 scores in\npredicting prosodic word (PW), prosodic phrase (PPH) and intonational phrase\n(IPH). It demonstrates the effectiveness of using multi-level contextual\ninformation for PSP. 
Subjective preference tests also indicate the naturalness\nof synthesized speeches are improved.\n","authors":["Jie Chen","Changhe Song","Deyi Tuo","Xixin Wu","Shiyin Kang","Zhiyong Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16577v1.pdf","comment":"Accepted by Interspeech2022"},{"id":"http://arxiv.org/abs/2308.10959v2","updated":"2023-08-31T09:14:17Z","published":"2023-08-21T18:14:00Z","title":"DocPrompt: Large-scale continue pretrain for zero-shot and few-shot\n document question answering","summary":" In this paper, we propose Docprompt for document question answering tasks\nwith powerful zero-shot and few-shot performance. We proposed a novel weakly\nsupervised data generation method, a novel multl-stage training method and a\nnovel understanding model \\& generation model ensemble method. We achieved\nstate-of-the-art performance on 4 document question answering tasks. This\nmethod greatly improves the delivery efficiency and model performance of\ndocument question answering customer projects, reducing annotation costs and\nlabor costs. Our demo can be found at\nhttps://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.\n","authors":["Sijin Wu","Dan Zhang","Teng Hu","Shikun Feng"],"pdf_url":"https://arxiv.org/pdf/2308.10959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13916v2","updated":"2023-08-31T08:53:34Z","published":"2023-08-26T16:51:17Z","title":"Exploring Large Language Models for Knowledge Graph Completion","summary":" Knowledge graphs play a vital role in numerous artificial intelligence tasks,\nyet they frequently face the issue of incompleteness. In this study, we explore\nutilizing Large Language Models (LLM) for knowledge graph completion. We\nconsider triples in knowledge graphs as text sequences and introduce an\ninnovative framework called Knowledge Graph LLM (KG-LLM) to model these\ntriples. Our technique employs entity and relation descriptions of a triple as\nprompts and utilizes the response for predictions. Experiments on various\nbenchmark knowledge graphs demonstrate that our method attains state-of-the-art\nperformance in tasks such as triple classification and relation prediction. We\nalso find that fine-tuning relatively smaller models (e.g., LLaMA-7B,\nChatGLM-6B) outperforms recent ChatGPT and GPT-4.\n","authors":["Liang Yao","Jiazhen Peng","Chengsheng Mao","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13916v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2304.11073v3","updated":"2023-08-31T08:51:33Z","published":"2023-04-20T09:30:50Z","title":"OLISIA: a Cascade System for Spoken Dialogue State Tracking","summary":" Though Dialogue State Tracking (DST) is a core component of spoken dialogue\nsystems, recent work on this task mostly deals with chat corpora, disregarding\nthe discrepancies between spoken and written language.In this paper, we propose\nOLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR)\nmodel and a DST model. We introduce several adaptations in the ASR and DST\nmodules to improve integration and robustness to spoken conversations.With\nthese adaptations, our system ranked first in DSTC11 Track 3, a benchmark to\nevaluate spoken DST. 
We conduct an in-depth analysis of the results and find\nthat normalizing the ASR outputs and adapting the DST inputs through data\naugmentation, along with increasing the pre-trained models size all play an\nimportant role in reducing the performance discrepancy between written and\nspoken conversations.\n","authors":["Léo Jacqmin","Lucas Druart","Yannick Estève","Benoît Favre","Lina Maria Rojas-Barahona","Valentin Vielzeuf"],"pdf_url":"https://arxiv.org/pdf/2304.11073v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16549v1","updated":"2023-08-31T08:40:41Z","published":"2023-08-31T08:40:41Z","title":"Thesis Distillation: Investigating The Impact of Bias in NLP Models on\n Hate Speech Detection","summary":" This paper is a summary of the work in my PhD thesis. In which, I investigate\nthe impact of bias in NLP models on the task of hate speech detection from\nthree perspectives: explainability, offensive stereotyping bias, and fairness.\nI discuss the main takeaways from my thesis and how they can benefit the\nbroader NLP community. Finally, I discuss important future research directions.\nThe findings of my thesis suggest that bias in NLP models impacts the task of\nhate speech detection from all three perspectives. And that unless we start\nincorporating social sciences in studying bias in NLP models, we will not\neffectively overcome the current limitations of measuring and mitigating bias\nin NLP models.\n","authors":["Fatma Elsafoury"],"pdf_url":"https://arxiv.org/pdf/2308.16549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16540v1","updated":"2023-08-31T08:30:20Z","published":"2023-08-31T08:30:20Z","title":"Time-Varying Quasi-Closed-Phase Analysis for Accurate Formant Tracking\n in Speech Signals","summary":" In this paper, we propose a new method for the accurate estimation and\ntracking of formants in speech signals using time-varying quasi-closed-phase\n(TVQCP) analysis. Conventional formant tracking methods typically adopt a\ntwo-stage estimate-and-track strategy wherein an initial set of formant\ncandidates are estimated using short-time analysis (e.g., 10--50 ms), followed\nby a tracking stage based on dynamic programming or a linear state-space model.\nOne of the main disadvantages of these approaches is that the tracking stage,\nhowever good it may be, cannot improve upon the formant estimation accuracy of\nthe first stage. The proposed TVQCP method provides a single-stage formant\ntracking that combines the estimation and tracking stages into one. TVQCP\nanalysis combines three approaches to improve formant estimation and tracking:\n(1) it uses temporally weighted quasi-closed-phase analysis to derive\nclosed-phase estimates of the vocal tract with reduced interference from the\nexcitation source, (2) it increases the residual sparsity by using the $L_1$\noptimization and (3) it uses time-varying linear prediction analysis over long\ntime windows (e.g., 100--200 ms) to impose a continuity constraint on the vocal\ntract model and hence on the formant trajectories. Formant tracking experiments\nwith a wide variety of synthetic and natural speech signals show that the\nproposed TVQCP method performs better than conventional and popular formant\ntracking tools, such as Wavesurfer and Praat (based on dynamic programming),\nthe KARMA algorithm (based on Kalman filtering), and DeepFormants (based on\ndeep neural networks trained in a supervised manner). 
Matlab scripts for the\nproposed method can be found at: https://github.com/njaygowda/ftrack\n","authors":["Dhananjaya Gowda","Sudarsana Reddy Kadiri","Brad Story","Paavo Alku"],"pdf_url":"https://arxiv.org/pdf/2308.16540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16537v1","updated":"2023-08-31T08:28:11Z","published":"2023-08-31T08:28:11Z","title":"The Smart Data Extractor, a Clinician Friendly Solution to Accelerate\n and Improve the Data Collection During Clinical Trials","summary":" In medical research, the traditional way to collect data, i.e. browsing\npatient files, has been proven to induce bias, errors, human labor and costs.\nWe propose a semi-automated system able to extract every type of data,\nincluding notes. The Smart Data Extractor pre-populates clinic research forms\nby following rules. We performed a cross-testing experiment to compare\nsemi-automated to manual data collection. 20 target items had to be collected\nfor 79 patients. The average time to complete one form was 6'81'' for manual\ndata collection and 3'22'' with the Smart Data Extractor. There were also more\nmistakes during manual data collection (163 for the whole cohort) than with the\nSmart Data Extractor (46 for the whole cohort). We present an easy to use,\nunderstandable and agile solution to fill out clinical research forms. It\nreduces human effort and provides higher quality data, avoiding data re-entry\nand fatigue induced errors.\n","authors":["Sophie Quennelle","Maxime Douillet","Lisa Friedlander","Olivia Boyer","Anita Burgun","Antoine Neuraz","Nicolas Garcelon"],"pdf_url":"https://arxiv.org/pdf/2308.16537v1.pdf","comment":"IOS Press, 2023, Studies in Health Technology and Informatics"},{"id":"http://arxiv.org/abs/2308.16498v1","updated":"2023-08-31T07:00:21Z","published":"2023-08-31T07:00:21Z","title":"Generalised Winograd Schema and its Contextuality","summary":" Ambiguities in natural language give rise to probability distributions over\ninterpretations. The distributions are often over multiple ambiguous words at a\ntime; a multiplicity which makes them a suitable topic for sheaf-theoretic\nmodels of quantum contextuality. Previous research showed that different\nquantitative measures of contextuality correlate well with Psycholinguistic\nresearch on lexical ambiguities. In this work, we focus on coreference\nambiguities and investigate the Winograd Schema Challenge (WSC), a test\nproposed by Levesque in 2011 to evaluate the intelligence of machines. The WSC\nconsists of a collection of multiple-choice questions that require\ndisambiguating pronouns in sentences structured according to the Winograd\nschema, in a way that makes it difficult for machines to determine the correct\nreferents but remains intuitive for human comprehension. In this study, we\npropose an approach that analogously models the Winograd schema as an\nexperiment in quantum physics. However, we argue that the original Winograd\nSchema is inherently too simplistic to facilitate contextuality. We introduce a\nnovel mechanism for generalising the schema, rendering it analogous to a\nBell-CHSH measurement scenario. We report an instance of this generalised\nschema, complemented by the human judgements we gathered via a crowdsourcing\nplatform. 
The resulting model violates the Bell-CHSH inequality by 0.192, thus\nexhibiting contextuality in a coreference resolution setting.\n","authors":["Kin Ian Lo","Mehrnoosh Sadrzadeh","Shane Mansfield"],"pdf_url":"https://arxiv.org/pdf/2308.16498v1.pdf","comment":"In Proceedings QPL 2023, arXiv:2308.15489"},{"id":"http://arxiv.org/abs/2308.16475v1","updated":"2023-08-31T05:40:14Z","published":"2023-08-31T05:40:14Z","title":"Transformer Compression via Subspace Projection","summary":" We propose TCSP, a novel method for compressing a transformer model by\nfocusing on reducing the hidden size of the model. By projecting the whole\ntransform model into a subspace, we enable matrix operations between the weight\nmatrices in the model and features in a reduced-dimensional space, leading to\nsignificant reductions in model parameters and computing resources. To\nestablish this subspace, we decompose the feature matrix, derived from\ndifferent layers of sampled data instances, into a projection matrix. For\nevaluation, TCSP is applied to compress T5 and BERT models on the GLUE and\nSQuAD benchmarks. Experimental results demonstrate that TCSP achieves a\ncompression ratio of 44\\% with at most 1.6\\% degradation in accuracy,\nsurpassing or matching prior compression methods. Furthermore, TCSP exhibits\ncompatibility with other methods targeting filter and attention head size\ncompression.\n","authors":["Yuxuan Hu","Jing Zhang","Chen Zhao","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16475v1.pdf","comment":"21 pages, 1 figures"},{"id":"http://arxiv.org/abs/2308.16474v1","updated":"2023-08-31T05:37:21Z","published":"2023-08-31T05:37:21Z","title":"Enhancing Subtask Performance of Multi-modal Large Language Model","summary":" Multi-modal Large Language Model (MLLM) refers to a model expanded from a\nLarge Language Model (LLM) that possesses the capability to handle and infer\nmulti-modal data. Current MLLMs typically begin by using LLMs to decompose\ntasks into multiple subtasks, then employing individual pre-trained models to\ncomplete specific subtasks, and ultimately utilizing LLMs to integrate the\nresults of each subtasks to obtain the results of the task. In real-world\nscenarios, when dealing with large projects, it is common practice to break\ndown the project into smaller sub-projects, with different teams providing\ncorresponding solutions or results. The project owner then decides which\nsolution or result to use, ensuring the best possible outcome for each subtask\nand, consequently, for the entire project. Inspired by this, this study\nconsiders selecting multiple pre-trained models to complete the same subtask.\nBy combining the results from multiple pre-trained models, the optimal subtask\nresult is obtained, enhancing the performance of the MLLM. Specifically, this\nstudy first selects multiple pre-trained models focused on the same subtask\nbased on distinct evaluation approaches, and then invokes these models in\nparallel to process input data and generate corresponding subtask results.\nFinally, the results from multiple pre-trained models for the same subtask are\ncompared using the LLM, and the best result is chosen as the outcome for that\nsubtask. Extensive experiments are conducted in this study using GPT-4\nannotated datasets and human-annotated datasets. 
The results of various\nevaluation metrics adequately demonstrate the effectiveness of the proposed\napproach in this paper.\n","authors":["Yongqiang Zhao","Zhenyu Li","Feng Zhang","Xinhai Xu","Donghong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16469v1","updated":"2023-08-31T05:25:04Z","published":"2023-08-31T05:25:04Z","title":"Link Prediction for Wikipedia Articles as a Natural Language Inference\n Task","summary":" Link prediction task is vital to automatically understanding the structure of\nlarge knowledge bases. In this paper, we present our system to solve this task\nat the Data Science and Advanced Analytics 2023 Competition \"Efficient and\nEffective Link Prediction\" (DSAA-2023 Competition) with a corpus containing\n948,233 training and 238,265 for public testing. This paper introduces an\napproach to link prediction in Wikipedia articles by formulating it as a\nnatural language inference (NLI) task. Drawing inspiration from recent\nadvancements in natural language processing and understanding, we cast link\nprediction as an NLI task, wherein the presence of a link between two articles\nis treated as a premise, and the task is to determine whether this premise\nholds based on the information presented in the articles. We implemented our\nsystem based on the Sentence Pair Classification for Link Prediction for the\nWikipedia Articles task. Our system achieved 0.99996 Macro F1-score and 1.00000\nMacro F1-score for the public and private test sets, respectively. Our team\nUIT-NLP ranked 3rd in performance on the private test set, equal to the scores\nof the first and second places. Our code is publicly for research purposes.\n","authors":["Chau-Thang Phan","Quoc-Nam Nguyen","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.16469v1.pdf","comment":"Accepted at the 10th IEEE International Conference On Data Science\n And Advanced Analytics (DSAA 2023)"},{"id":"http://arxiv.org/abs/2308.16463v1","updated":"2023-08-31T05:15:27Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images. To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. 
Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. All resources will be available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v1","updated":"2023-08-31T04:52:58Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06345v2","updated":"2023-08-31T03:14:47Z","published":"2023-06-10T05:24:29Z","title":"Improving Non-autoregressive Translation Quality with Pretrained\n Language Model, Embedding Distillation and Upsampling Strategy for CTC","summary":" Non-autoregressive approaches aim to improve the inference speed of\ntranslation models, particularly those that generate output in a one-pass\nforward manner. However, these approaches often suffer from a significant drop\nin translation quality compared to autoregressive models. This paper introduces\na series of innovative techniques to enhance the translation quality of\nNon-Autoregressive Translation (NAT) models while maintaining a substantial\nacceleration in inference speed. We propose fine-tuning Pretrained Multilingual\nLanguage Models (PMLMs) with the CTC loss to train NAT models effectively.\nFurthermore, we adopt the MASK insertion scheme for up-sampling instead of\ntoken duplication, and we present an embedding distillation method to further\nenhance performance. In our experiments, our model outperforms the baseline\nautoregressive model (Transformer \\textit{base}) on multiple datasets,\nincluding WMT'14 DE$\\leftrightarrow$EN, WMT'16 RO$\\leftrightarrow$EN, and\nIWSLT'14 DE$\\leftrightarrow$EN. 
Notably, our model achieves better performance\nthan the baseline autoregressive model on the IWSLT'14 En$\\leftrightarrow$De\nand WMT'16 En$\\leftrightarrow$Ro datasets, even without using distillation data\nduring training. It is worth highlighting that on the IWSLT'14\nDE$\\rightarrow$EN dataset, our model achieves an impressive BLEU score of\n39.59, setting a new state-of-the-art performance. Additionally, our model\nexhibits a remarkable speed improvement of 16.35 times compared to the\nautoregressive model.\n","authors":["Shen-sian Syu","Juncheng Xie","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2306.06345v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.16415v1","updated":"2023-08-31T02:58:33Z","published":"2023-08-31T02:58:33Z","title":"Knowledge Distillation from Non-streaming to Streaming ASR Encoder using\n Auxiliary Non-streaming Layer","summary":" Streaming automatic speech recognition (ASR) models are restricted from\naccessing future context, which results in worse performance compared to the\nnon-streaming models. To improve the performance of streaming ASR, knowledge\ndistillation (KD) from the non-streaming to streaming model has been studied,\nmainly focusing on aligning the output token probabilities. In this paper, we\npropose a layer-to-layer KD from the teacher encoder to the student encoder. To\nensure that features are extracted using the same context, we insert auxiliary\nnon-streaming branches to the student and perform KD from the non-streaming\nteacher layer to the non-streaming auxiliary layer. We design a special KD loss\nthat leverages the autoregressive predictive coding (APC) mechanism to\nencourage the streaming model to predict unseen future contexts. Experimental\nresults show that the proposed method can significantly reduce the word error\nrate compared to previous token probability distillation methods.\n","authors":["Kyuhong Shim","Jinkyu Lee","Simyung Chang","Kyuwoong Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.16415v1.pdf","comment":"Accepted to Interspeech 2023"},{"id":"http://arxiv.org/abs/2306.15245v2","updated":"2023-08-31T02:50:25Z","published":"2023-06-27T06:58:03Z","title":"C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue\n Evaluation","summary":" Existing reference-free turn-level evaluation metrics for chatbots\ninadequately capture the interaction between the user and the system.\nConsequently, they often correlate poorly with human evaluations. To address\nthis issue, we propose a novel model-agnostic approach that leverages\nConditional Pointwise Mutual Information (C-PMI) to measure the turn-level\ninteraction between the system and the user based on a given evaluation\ndimension. Experimental results on the widely used FED dialogue evaluation\ndataset demonstrate that our approach significantly improves the correlation\nwith human judgment compared with existing evaluation systems. By replacing the\nnegative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve\na relative 60.5% higher Spearman correlation on average for the FED evaluation\nmetric. Our code is publicly available at https://github.com/renll/C-PMI.\n","authors":["Liliang Ren","Mankeerat Sidhu","Qi Zeng","Revanth Gangi Reddy","Heng Ji","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2306.15245v2.pdf","comment":"Published at ACL2023 DialDoc Workshop; Updated Results"},{"id":"http://arxiv.org/abs/2308.15906v2","updated":"2023-08-31T22:49:03Z","published":"2023-08-30T09:19:06Z","title":"Is the U.S. 
Legal System Ready for AI's Challenges to Human Values?","summary":" Our interdisciplinary study investigates how effectively U.S. laws confront\nthe challenges posed by Generative AI to human values. Through an analysis of\ndiverse hypothetical scenarios crafted during an expert workshop, we have\nidentified notable gaps and uncertainties within the existing legal framework\nregarding the protection of fundamental values, such as privacy, autonomy,\ndignity, diversity, equity, and physical/mental well-being. Constitutional and\ncivil rights, it appears, may not provide sufficient protection against\nAI-generated discriminatory outputs. Furthermore, even if we exclude the\nliability shield provided by Section 230, proving causation for defamation and\nproduct liability claims is a challenging endeavor due to the intricate and\nopaque nature of AI systems. To address the unique and unforeseeable threats\nposed by Generative AI, we advocate for legal frameworks that evolve to\nrecognize new threat and provide proactive, auditable guidelines to industry\nstakeholders. Addressing these issues requires deep interdisciplinary\ncollaborations to identify harms, values, and mitigation strategies.\n","authors":["Inyoung Cheong","Aylin Caliskan","Tadayoshi Kohno"],"pdf_url":"https://arxiv.org/pdf/2308.15906v2.pdf","comment":"26 pages, 7 figures"},{"id":"http://arxiv.org/abs/2306.11300v2","updated":"2023-08-31T22:33:54Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Foundation Models utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a\nnew framework that includes the Domain Foundation Model (DFM), bridging the gap\nbetween the General Foundation Model (GFM) and domain-specific downstream\ntasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\ntried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the\nDFM. Experimental results show that our proposed dataset are highly effective\nfor various tasks, improving upon the baseline by $8 \\% \\sim 16 \\%$ in\nzero-shot classification tasks, and obtaining good results in both\nVision-Language Retrieval and Semantic Localization tasks.\n\\url{https://github.com/om-ai-lab/RS5M}\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v2.pdf","comment":"RS5M dataset v4"},{"id":"http://arxiv.org/abs/2309.00155v1","updated":"2023-08-31T22:05:46Z","published":"2023-08-31T22:05:46Z","title":"LLM in the Shell: Generative Honeypots","summary":" Honeypots are essential tools in cybersecurity. However, most of them (even\nthe high-interaction ones) lack the required realism to engage and fool human\nattackers. 
This limitation makes them easily discernible, hindering their\neffectiveness. This work introduces a novel method to create dynamic and\nrealistic software honeypots based on Large Language Models. Preliminary\nresults indicate that LLMs can create credible and dynamic honeypots capable of\naddressing important limitations of previous honeypots, such as deterministic\nresponses, lack of adaptability, etc. We evaluated the realism of each command\nby conducting an experiment with human attackers who needed to say if the\nanswer from the honeypot was fake or not. Our proposed honeypot, called shelLM,\nreached an accuracy rate of 0.92.\n","authors":["Muris Sladić","Veronica Valeros","Carlos Catania","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2309.00155v1.pdf","comment":"5 pages. 1 figure 1 table"},{"id":"http://arxiv.org/abs/2309.00135v1","updated":"2023-08-31T21:15:06Z","published":"2023-08-31T21:15:06Z","title":"Construction Grammar and Artificial Intelligence","summary":" In this chapter, we argue that it is highly beneficial for the contemporary\nconstruction grammarian to have a thorough understanding of the strong\nrelationship between the research fields of construction grammar and artificial\nintelligence. We start by unravelling the historical links between the two\nfields, showing that their relationship is rooted in a common attitude towards\nhuman communication and language. We then discuss the first direction of\ninfluence, focussing in particular on how insights and techniques from the\nfield of artificial intelligence play an important role in operationalising,\nvalidating and scaling constructionist approaches to language. We then proceed\nto the second direction of influence, highlighting the relevance of\nconstruction grammar insights and analyses to the artificial intelligence\nendeavour of building truly intelligent agents. We support our case with a\nvariety of illustrative examples and conclude that the further elaboration of\nthis relationship will play a key role in shaping the future of the field of\nconstruction grammar.\n","authors":["Katrien Beuls","Paul Van Eecke"],"pdf_url":"https://arxiv.org/pdf/2309.00135v1.pdf","comment":"Peer-reviewed author's draft of a chapter to appear in the Cambridge\n Handbook of Construction Grammar (2024 - edited by Mirjam Fried and Kiki\n Nikiforidou)"},{"id":"http://arxiv.org/abs/2309.00126v1","updated":"2023-08-31T20:25:44Z","published":"2023-08-31T20:25:44Z","title":"QS-TTS: Towards Semi-Supervised Text-to-Speech Synthesis via\n Vector-Quantized Self-Supervised Speech Representation Learning","summary":" This paper proposes a novel semi-supervised TTS framework, QS-TTS, to improve\nTTS quality with lower supervised data requirements via Vector-Quantized\nSelf-Supervised Speech Representation Learning (VQ-S3RL) utilizing more\nunlabeled speech audio. This framework comprises two VQ-S3R learners: first,\nthe principal learner aims to provide a generative Multi-Stage Multi-Codebook\n(MSMC) VQ-S3R via the MSMC-VQ-GAN combined with the contrastive S3RL, while\ndecoding it back to the high-quality audio; then, the associate learner further\nabstracts the MSMC representation into a highly-compact VQ representation\nthrough a VQ-VAE. These two generative VQ-S3R learners provide profitable\nspeech representations and pre-trained models for TTS, significantly improving\nsynthesis quality with the lower requirement for supervised data. 
QS-TTS is\nevaluated comprehensively under various scenarios via subjective and objective\ntests in experiments. The results powerfully demonstrate the superior\nperformance of QS-TTS, winning the highest MOS over supervised or\nsemi-supervised baseline TTS approaches, especially in low-resource scenarios.\nMoreover, comparing various speech representations and transfer learning\nmethods in TTS further validates the notable improvement of the proposed\nVQ-S3RL to TTS, showing the best audio quality and intelligibility metrics. The\ntrend of slower decay in the synthesis quality of QS-TTS with decreasing\nsupervised data further highlights its lower requirements for supervised data,\nindicating its great potential in low-resource scenarios.\n","authors":["Haohan Guo","Fenglong Xie","Jiawen Kang","Yujia Xiao","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2309.00126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09162v3","updated":"2023-08-31T20:02:47Z","published":"2023-07-18T11:38:45Z","title":"Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and\n Addressing Sociological Implications","summary":" Gender bias in artificial intelligence (AI) and natural language processing\nhas garnered significant attention due to its potential impact on societal\nperceptions and biases. This research paper aims to analyze gender bias in\nLarge Language Models (LLMs) with a focus on multiple comparisons between GPT-2\nand GPT-3.5, some prominent language models, to better understand its\nimplications. Through a comprehensive literature review, the study examines\nexisting research on gender bias in AI language models and identifies gaps in\nthe current knowledge. The methodology involves collecting and preprocessing\ndata from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis\ntechniques to evaluate gender bias in the generated text. The findings shed\nlight on gendered word associations, language usage, and biased narratives\npresent in the outputs of these Large Language Models. The discussion explores\nthe ethical implications of gender bias and its potential consequences on\nsocial perceptions and marginalized communities. Additionally, the paper\npresents strategies for reducing gender bias in LLMs, including algorithmic\napproaches and data augmentation techniques. The research highlights the\nimportance of interdisciplinary collaborations and the role of sociological\nstudies in mitigating gender bias in AI models. By addressing these issues, we\ncan pave the way for more inclusive and unbiased AI systems that have a\npositive impact on society.\n","authors":["Vishesh Thakur"],"pdf_url":"https://arxiv.org/pdf/2307.09162v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07851v4","updated":"2023-08-31T19:47:14Z","published":"2023-07-15T17:01:56Z","title":"AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual\n Similarity Using Contrastive Learning and Structured Knowledge","summary":" Generic sentence embeddings provide a coarse-grained approximation of\nsemantic textual similarity but ignore specific aspects that make texts\nsimilar. Conversely, aspect-based sentence embeddings provide similarities\nbetween texts based on certain predefined aspects. Thus, similarity predictions\nof texts are more targeted to specific requirements and more easily\nexplainable. In this paper, we present AspectCSE, an approach for aspect-based\ncontrastive learning of sentence embeddings. 
Results indicate that AspectCSE\nachieves an average improvement of 3.97% on information retrieval tasks across\nmultiple aspects compared to the previous best results. We also propose using\nWikidata knowledge graph properties to train models of multi-aspect sentence\nembeddings in which multiple specific aspects are simultaneously considered\nduring similarity predictions. We demonstrate that multi-aspect embeddings\noutperform single-aspect embeddings on aspect-specific information retrieval\ntasks. Finally, we examine the aspect-based sentence embedding space and\ndemonstrate that embeddings of semantically similar aspect labels are often\nclose, even without explicit similarity training between different aspect\nlabels.\n","authors":["Tim Schopf","Emanuel Gerber","Malte Ostendorff","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.07851v4.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2309.00087v1","updated":"2023-08-31T19:06:39Z","published":"2023-08-31T19:06:39Z","title":"Large language models in medicine: the potentials and pitfalls","summary":" Large language models (LLMs) have been applied to tasks in healthcare,\nranging from medical exam questions to responding to patient questions. With\nincreasing institutional partnerships between companies producing LLMs and\nhealthcare systems, real world clinical application is coming closer to\nreality. As these models gain traction, it is essential for healthcare\npractitioners to understand what LLMs are, their development, their current and\npotential applications, and the associated pitfalls when utilized in medicine.\nThis review and accompanying tutorial aim to give an overview of these topics\nto aid healthcare practitioners in understanding the rapidly changing landscape\nof LLMs as applied to medicine.\n","authors":["Jesutofunmi A. Omiye","Haiwen Gui","Shawheen J. Rezaei","James Zou","Roxana Daneshjou"],"pdf_url":"https://arxiv.org/pdf/2309.00087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00071v1","updated":"2023-08-31T18:18:07Z","published":"2023-08-31T18:18:07Z","title":"YaRN: Efficient Context Window Extension of Large Language Models","summary":" Rotary Position Embeddings (RoPE) have been shown to effectively encode\npositional information in transformer-based language models. However, these\nmodels fail to generalize past the sequence length they were trained on. We\npresent YaRN (Yet another RoPE extensioN method), a compute-efficient method to\nextend the context window of such models, requiring 10x less tokens and 2.5x\nless training steps than previous methods. Using YaRN, we show that LLaMA\nmodels can effectively utilize and extrapolate to context lengths much longer\nthan their original pre-training would allow, while also surpassing previous\nthe state-of-the-art at context window extension. In addition, we demonstrate\nthat YaRN exhibits the capability to extrapolate beyond the limited context of\na fine-tuning dataset. 
We publish the checkpoints of Llama 2 7B/13B fine-tuned\nusing YaRN with 64k and 128k context windows at\nhttps://github.com/jquesnelle/yarn\n","authors":["Bowen Peng","Jeffrey Quesnelle","Honglu Fan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2309.00071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11696v2","updated":"2023-08-31T18:18:03Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking (of Language Models)","summary":" The increasing versatility of language models LMs has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs reaching\nthousands of GPU hours per model. However the efficiency aspect of these\nevaluation efforts had raised little discussion in the literature. In this work\nwe present the problem of Efficient Benchmarking namely intelligently reducing\nthe computation costs of LM evaluation without compromising reliability. Using\nthe HELM benchmark as a test case we investigate how different benchmark design\nchoices affect the computation-reliability tradeoff. We propose to evaluate the\nreliability of such decisions by using a new measure Decision Impact on\nReliability DIoR for short. We find for example that the current leader on HELM\nmay change by merely removing a low-ranked model from the benchmark and observe\nthat a handful of examples suffice to obtain the correct benchmark ranking.\nConversely a slightly different choice of HELM scenarios varies ranking widely.\nBased on our findings we outline a set of concrete recommendations for more\nefficient benchmark design and utilization practices leading to dramatic cost\nsavings with minimal loss of benchmark reliability often reducing computation\nby x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13301v2","updated":"2023-08-31T15:24:36Z","published":"2023-04-26T06:02:01Z","title":"Prompting GPT-3.5 for Text-to-SQL with De-semanticization and Skeleton\n Retrieval","summary":" Text-to-SQL is a task that converts a natural language question into a\nstructured query language (SQL) to retrieve information from a database. Large\nlanguage models (LLMs) work well in natural language generation tasks, but they\nare not specifically pre-trained to understand the syntax and semantics of SQL\ncommands. In this paper, we propose an LLM-based framework for Text-to-SQL\nwhich retrieves helpful demonstration examples to prompt LLMs. However,\nquestions with different database schemes can vary widely, even if the\nintentions behind them are similar and the corresponding SQL queries exhibit\nsimilarities. Consequently, it becomes crucial to identify the appropriate SQL\ndemonstrations that align with our requirements. We design a de-semanticization\nmechanism that extracts question skeletons, allowing us to retrieve similar\nexamples based on their structural similarity. We also model the relationships\nbetween question tokens and database schema items (i.e., tables and columns) to\nfilter out scheme-related information. Our framework adapts the range of the\ndatabase schema in prompts to balance length and valuable information. A\nfallback mechanism allows for a more detailed schema to be provided if the\ngenerated SQL query fails. 
Ours outperforms state-of-the-art models and\ndemonstrates strong generalization ability on three cross-domain Text-to-SQL\nbenchmarks.\n","authors":["Chunxi Guo","Zhiliang Tian","Jintao Tang","Pancheng Wang","Zhihua Wen","Kang Yang","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2304.13301v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.16911v1","updated":"2023-08-31T17:59:46Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have created a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, thereby enabling LLMs to understand point clouds and offering\na new avenue beyond 2D visual data. PointLLM processes colored object point\nclouds with human instructions and generates contextually appropriate\nresponses, illustrating its grasp of point clouds and common sense.\nSpecifically, it leverages a point cloud encoder with a powerful LLM to\neffectively fuse geometric, appearance, and linguistic information. We collect\na novel dataset comprising 660K simple and 70K complex point-text instruction\npairs to enable a two-stage training strategy: initially aligning latent spaces\nand subsequently instruction-tuning the unified model. To rigorously evaluate\nour model's perceptual abilities and its generalization capabilities, we\nestablish two benchmarks: Generative 3D Object Classification and 3D Object\nCaptioning, assessed through three different methods, including human\nevaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment\nresults show that PointLLM demonstrates superior performance over existing 2D\nbaselines. Remarkably, in human-evaluated object captioning tasks, PointLLM\noutperforms human annotators in over 50% of the samples. Codes, datasets, and\nbenchmarks are available at https://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v1.pdf","comment":"19 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2308.16909v1","updated":"2023-08-31T17:59:33Z","published":"2023-08-31T17:59:33Z","title":"StyleInV: A Temporal Style Modulated Inversion Network for Unconditional\n Video Generation","summary":" Unconditional video generation is a challenging task that involves\nsynthesizing high-quality videos that are both coherent and of extended\nduration. To address this challenge, researchers have used pretrained StyleGAN\nimage generators for high-quality frame synthesis and focused on motion\ngenerator design. The motion generator is trained in an autoregressive manner\nusing heavy 3D convolutional discriminators to ensure motion coherence during\nvideo generation. In this paper, we introduce a novel motion generator design\nthat uses a learning-based inversion network for GAN. The encoder in our method\ncaptures rich and smooth priors from encoding images to latents, and given the\nlatent of an initially generated frame as guidance, our method can generate\nsmooth future latent by modulating the inversion encoder temporally. 
Our method\nenjoys the advantage of sparse training and naturally constrains the generation\nspace of our motion generator with the inversion network guided by the initial\nframe, eliminating the need for heavy discriminators. Moreover, our method\nsupports style transfer with simple fine-tuning when the encoder is paired with\na pretrained StyleGAN generator. Extensive experiments conducted on various\nbenchmarks demonstrate the superiority of our method in generating long and\nhigh-resolution videos with decent single-frame quality and temporal\nconsistency.\n","authors":["Yuhan Wang","Liming Jiang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.16909v1.pdf","comment":"ICCV 2023. Code: https://github.com/johannwyh/StyleInV Project page:\n https://www.mmlab-ntu.com/project/styleinv/index.html"},{"id":"http://arxiv.org/abs/2308.16906v1","updated":"2023-08-31T17:59:24Z","published":"2023-08-31T17:59:24Z","title":"Fine-Grained Cross-View Geo-Localization Using a Correlation-Aware\n Homography Estimator","summary":" In this paper, we introduce a novel approach to fine-grained cross-view\ngeo-localization. Our method aligns a warped ground image with a corresponding\nGPS-tagged satellite image covering the same area using homography estimation.\nWe first employ a differentiable spherical transform, adhering to geometric\nprinciples, to accurately align the perspective of the ground image with the\nsatellite map. This transformation effectively places ground and aerial images\nin the same view and on the same plane, reducing the task to an image alignment\nproblem. To address challenges such as occlusion, small overlapping range, and\nseasonal variations, we propose a robust correlation-aware homography estimator\nto align similar parts of the transformed ground image with the satellite\nimage. Our method achieves sub-pixel resolution and meter-level GPS accuracy by\nmapping the center point of the transformed ground image to the satellite image\nusing a homography matrix and determining the orientation of the ground camera\nusing a point above the central axis. Operating at a speed of 30 FPS, our\nmethod outperforms state-of-the-art techniques, reducing the mean metric\nlocalization error by 21.3% and 32.4% in same-area and cross-area\ngeneralization tasks on the VIGOR benchmark, respectively, and by 34.4% on the\nKITTI benchmark in same-area evaluation.\n","authors":["Xiaolong Wang","Runsen Xu","Zuofan Cui","Zeyu Wan","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16906v1.pdf","comment":"19 pages. Reducing the cross-view geo-localization problem to a 2D\n image alignment problem by utilizing BEV transformation, and completing the\n alignment process with a correlation-aware homography estimator. Code:\n https://github.com/xlwangDev/HC-Net"},{"id":"http://arxiv.org/abs/2308.16905v1","updated":"2023-08-31T17:59:08Z","published":"2023-08-31T17:59:08Z","title":"InterDiff: Generating 3D Human-Object Interactions with Physics-Informed\n Diffusion","summary":" This paper addresses a novel task of anticipating 3D human-object\ninteractions (HOIs). Most existing research on HOI synthesis lacks\ncomprehensive whole-body interactions with dynamic objects, e.g., often limited\nto manipulating small or static objects. Our task is significantly more\nchallenging, as it requires modeling dynamic objects with various shapes,\ncapturing whole-body motion, and ensuring physically valid interactions. 
To\nthis end, we propose InterDiff, a framework comprising two key steps: (i)\ninteraction diffusion, where we leverage a diffusion model to encode the\ndistribution of future human-object interactions; (ii) interaction correction,\nwhere we introduce a physics-informed predictor to correct denoised HOIs in a\ndiffusion step. Our key insight is to inject prior knowledge that the\ninteractions under reference with respect to contact points follow a simple\npattern and are easily predictable. Experiments on multiple human-object\ninteraction datasets demonstrate the effectiveness of our method for this task,\ncapable of producing realistic, vivid, and remarkably long-term 3D HOI\npredictions.\n","authors":["Sirui Xu","Zhengyuan Li","Yu-Xiong Wang","Liang-Yan Gui"],"pdf_url":"https://arxiv.org/pdf/2308.16905v1.pdf","comment":"ICCV 2023; Project Page: https://sirui-xu.github.io/InterDiff/"},{"id":"http://arxiv.org/abs/2303.12059v3","updated":"2023-08-31T17:58:46Z","published":"2023-03-21T17:51:23Z","title":"Motion Matters: Neural Motion Transfer for Better Camera Physiological\n Measurement","summary":" Machine learning models for camera-based physiological measurement can have\nweak generalization due to a lack of representative training data. Body motion\nis one of the most significant sources of noise when attempting to recover the\nsubtle cardiac pulse from a video. We explore motion transfer as a form of data\naugmentation to introduce motion variation while preserving physiological\nchanges of interest. We adapt a neural video synthesis approach to augment\nvideos for the task of remote photoplethysmography (rPPG) and study the effects\nof motion augmentation with respect to 1) the magnitude and 2) the type of\nmotion. After training on motion-augmented versions of publicly available\ndatasets, we demonstrate a 47% improvement over existing inter-dataset results\nusing various state-of-the-art methods on the PURE dataset. We also present\ninter-dataset results on five benchmark datasets to show improvements of up to\n79% using TS-CAN, a neural rPPG estimation method. Our findings illustrate the\nusefulness of motion transfer as a data augmentation technique for improving\nthe generalization of models for camera-based physiological sensing. We release\nour code for using motion transfer as a data augmentation technique on three\npublicly available datasets, UBFC-rPPG, PURE, and SCAMPS, and models\npre-trained on motion-augmented data here: https://motion-matters.github.io/\n","authors":["Akshay Paruchuri","Xin Liu","Yulu Pan","Shwetak Patel","Daniel McDuff","Soumyadip Sengupta"],"pdf_url":"https://arxiv.org/pdf/2303.12059v3.pdf","comment":"17 pages, 6 figures, 15 tables"},{"id":"http://arxiv.org/abs/2308.16896v1","updated":"2023-08-31T17:57:17Z","published":"2023-08-31T17:57:17Z","title":"PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic\n Occupancy Prediction","summary":" Semantic segmentation in autonomous driving has been undergoing an evolution\nfrom sparse point segmentation to dense voxel segmentation, where the objective\nis to predict the semantic occupancy of each voxel in the concerned 3D space.\nThe dense nature of the prediction space has rendered existing efficient\n2D-projection-based methods (e.g., bird's eye view, range view, etc.)\nineffective, as they can only describe a subspace of the 3D scene. 
To address\nthis, we propose a cylindrical tri-perspective view to represent point clouds\neffectively and comprehensively and a PointOcc model to process them\nefficiently. Considering the distance distribution of LiDAR point clouds, we\nconstruct the tri-perspective view in the cylindrical coordinate system for\nmore fine-grained modeling of nearer areas. We employ spatial group pooling to\nmaintain structural details during projection and adopt 2D backbones to\nefficiently process each TPV plane. Finally, we obtain the features of each\npoint by aggregating its projected features on each of the processed TPV planes\nwithout the need for any post-processing. Extensive experiments on both 3D\noccupancy prediction and LiDAR segmentation benchmarks demonstrate that the\nproposed PointOcc achieves state-of-the-art performance with much faster speed.\nSpecifically, despite only using LiDAR, PointOcc significantly outperforms all\nother methods, including multi-modal methods, with a large margin on the\nOpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc.\n","authors":["Sicheng Zuo","Wenzhao Zheng","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16896v1.pdf","comment":"Code is available at https://github.com/wzzheng/PointOcc"},{"id":"http://arxiv.org/abs/2308.16894v1","updated":"2023-08-31T17:56:19Z","published":"2023-08-31T17:56:19Z","title":"EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in\n the Wild","summary":" We present EMDB, the Electromagnetic Database of Global 3D Human Pose and\nShape in the Wild. EMDB is a novel dataset that contains high-quality 3D SMPL\npose and shape parameters with global body and camera trajectories for\nin-the-wild videos. We use body-worn, wireless electromagnetic (EM) sensors and\na hand-held iPhone to record a total of 58 minutes of motion data, distributed\nover 81 indoor and outdoor sequences and 10 participants. Together with\naccurate body poses and shapes, we also provide global camera poses and body\nroot trajectories. To construct EMDB, we propose a multi-stage optimization\nprocedure, which first fits SMPL to the 6-DoF EM measurements and then refines\nthe poses via image observations. To achieve high-quality results, we leverage\na neural implicit avatar model to reconstruct detailed human surface geometry\nand appearance, which allows for improved alignment and smoothness via a dense\npixel-level objective. Our evaluations, conducted with a multi-view volumetric\ncapture system, indicate that EMDB has an expected accuracy of 2.3 cm\npositional and 10.6 degrees angular error, surpassing the accuracy of previous\nin-the-wild datasets. We evaluate existing state-of-the-art monocular RGB\nmethods for camera-relative and global pose estimation on EMDB. EMDB is\npublicly available under https://ait.ethz.ch/emdb\n","authors":["Manuel Kaufmann","Jie Song","Chen Guo","Kaiyue Shen","Tianjian Jiang","Chengcheng Tang","Juan Zarate","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2308.16894v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16893v1","updated":"2023-08-31T17:56:13Z","published":"2023-08-31T17:56:13Z","title":"Language-Conditioned Path Planning","summary":" Contact is at the core of robotic manipulation. At times, it is desired (e.g.\nmanipulation and grasping), and at times, it is harmful (e.g. when avoiding\nobstacles). 
However, traditional path planning algorithms focus solely on\ncollision-free paths, limiting their applicability in contact-rich tasks. To\naddress this limitation, we propose the domain of Language-Conditioned Path\nPlanning, where contact-awareness is incorporated into the path planning\nproblem. As a first step in this domain, we propose Language-Conditioned\nCollision Functions (LACO) a novel approach that learns a collision function\nusing only a single-view image, language prompt, and robot configuration. LACO\npredicts collisions between the robot and the environment, enabling flexible,\nconditional path planning without the need for manual object annotations, point\ncloud data, or ground-truth object meshes. In both simulation and the real\nworld, we demonstrate that LACO can facilitate complex, nuanced path plans that\nallow for interaction with objects that are safe to collide, rather than\nprohibiting any collision.\n","authors":["Amber Xie","Youngwoon Lee","Pieter Abbeel","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2308.16893v1.pdf","comment":"Conference on Robot Learning, 2023"},{"id":"http://arxiv.org/abs/2308.16891v1","updated":"2023-08-31T17:52:10Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural\nfeature $\\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v1.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2308.16890v1","updated":"2023-08-31T17:52:04Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. 
In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v1.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2212.02611v2","updated":"2023-08-31T17:51:08Z","published":"2022-12-05T21:52:12Z","title":"StyleGAN as a Utility-Preserving Face De-identification Method","summary":" Face de-identification methods have been proposed to preserve users' privacy\nby obscuring their faces. These methods, however, can degrade the quality of\nphotos, and they usually do not preserve the utility of faces, i.e., their age,\ngender, pose, and facial expression. Recently, GANs, such as StyleGAN, have\nbeen proposed, which generate realistic, high-quality imaginary faces. In this\npaper, we investigate the use of StyleGAN in generating de-identified faces\nthrough style mixing. We examined this de-identification method for preserving\nutility and privacy by implementing several face detection, verification, and\nidentification attacks and conducting a user study. The results from our\nextensive experiments, human evaluation, and comparison with two\nstate-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN\nperforms on par or better than these methods, preserving users' privacy and\nimages' utility. In particular, the results of the machine learning-based\nexperiments show that StyleGAN0-4 preserves utility better than CIAGAN and\nDeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves\nutility at the same level while providing more privacy. In this paper, for the\nfirst time, we also performed a carefully designed user study to examine both\nprivacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well\nas CIAGAN and DeepPrivacy from the human observers' perspectives. Our\nstatistical tests showed that participants tend to verify and identify\nStyleGAN0-5 images more easily than DeepPrivacy images. All the methods but\nStyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding\nutility, as expected, StyleGAN0-5 performed significantly better in preserving\nsome attributes. 
Among all methods, on average, participants believe gender has\nbeen preserved the most while naturalness has been preserved the least.\n","authors":["Seyyed Mohammad Sadegh Moosavi Khorzooghi","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2212.02611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16880v1","updated":"2023-08-31T17:37:23Z","published":"2023-08-31T17:37:23Z","title":"Text2Scene: Text-driven Indoor Scene Stylization with Part-aware Details","summary":" We propose Text2Scene, a method to automatically create realistic textures\nfor virtual scenes composed of multiple objects. Guided by a reference image\nand text descriptions, our pipeline adds detailed texture on labeled 3D\ngeometries in the room such that the generated colors respect the hierarchical\nstructure or semantic parts that are often composed of similar materials.\nInstead of applying flat stylization on the entire scene at a single step, we\nobtain weak semantic cues from geometric segmentation, which are further\nclarified by assigning initial colors to segmented parts. Then we add texture\ndetails for individual objects such that their projections on image space\nexhibit feature embedding aligned with the embedding of the input. The\ndecomposition makes the entire pipeline tractable to a moderate amount of\ncomputation resources and memory. As our framework utilizes the existing\nresources of image and text embedding, it does not require dedicated datasets\nwith high-quality textures designed by skillful artists. To the best of our\nknowledge, it is the first practical and scalable approach that can create\ndetailed and realistic textures of the desired style that maintain structural\ncontext for scenes with multiple objects.\n","authors":["Inwoo Hwang","Hyeonwoo Kim","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2308.16880v1.pdf","comment":"Accepted to CVPR 2023"},{"id":"http://arxiv.org/abs/2308.16876v1","updated":"2023-08-31T17:23:50Z","published":"2023-08-31T17:23:50Z","title":"SportsSloMo: A New Benchmark and Baselines for Human-centric Video Frame\n Interpolation","summary":" Human-centric video frame interpolation has great potential for improving\npeople's entertainment experiences and finding commercial applications in the\nsports analysis industry, e.g., synthesizing slow-motion videos. Although there\nare multiple benchmark datasets available in the community, none of them is\ndedicated for human-centric scenarios. To bridge this gap, we introduce\nSportsSloMo, a benchmark consisting of more than 130K video clips and 1M video\nframes of high-resolution ($\\geq$720p) slow-motion sports videos crawled from\nYouTube. We re-train several state-of-the-art methods on our benchmark, and the\nresults show a decrease in their accuracy compared to other datasets. It\nhighlights the difficulty of our benchmark and suggests that it poses\nsignificant challenges even for the best-performing methods, as human bodies\nare highly deformable and occlusions are frequent in sports videos. To improve\nthe accuracy, we introduce two loss terms considering the human-aware priors,\nwhere we add auxiliary supervision to panoptic segmentation and human keypoints\ndetection, respectively. The loss terms are model agnostic and can be easily\nplugged into any video frame interpolation approaches. Experimental results\nvalidate the effectiveness of our proposed loss terms, leading to consistent\nperformance improvement over 5 existing models, which establish strong baseline\nmodels on our benchmark. 
The dataset and code can be found at:\nhttps://neu-vi.github.io/SportsSlomo/.\n","authors":["Jiaben Chen","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.16876v1.pdf","comment":"Project Page: https://neu-vi.github.io/SportsSlomo/"},{"id":"http://arxiv.org/abs/2308.16875v1","updated":"2023-08-31T17:22:18Z","published":"2023-08-31T17:22:18Z","title":"Holistic Processing of Colour Images Using Novel Quaternion-Valued\n Wavelets on the Plane","summary":" We investigate the applicability of quaternion-valued wavelets on the plane\nto holistic colour image processing. We present a methodology for decomposing\nand reconstructing colour images using quaternionic wavelet filters associated\nto recently developed quaternion-valued wavelets on the plane. We consider\ncompression, enhancement, segmentation, and denoising techniques to demonstrate\nquaternion-valued wavelets as a promising tool for holistic colour image\nprocessing.\n","authors":["Neil D. Dizon","Jeffrey A. Hogan"],"pdf_url":"https://arxiv.org/pdf/2308.16875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16863v1","updated":"2023-08-31T17:05:14Z","published":"2023-08-31T17:05:14Z","title":"Self-pruning Graph Neural Network for Predicting Inflammatory Disease\n Activity in Multiple Sclerosis from Brain MR Images","summary":" Multiple Sclerosis (MS) is a severe neurological disease characterized by\ninflammatory lesions in the central nervous system. Hence, predicting\ninflammatory disease activity is crucial for disease assessment and treatment.\nHowever, MS lesions can occur throughout the brain and vary in shape, size and\ntotal count among patients. The high variance in lesion load and locations\nmakes it challenging for machine learning methods to learn a globally effective\nrepresentation of whole-brain MRI scans to assess and predict disease.\nTechnically it is non-trivial to incorporate essential biomarkers such as\nlesion load or spatial proximity. Our work represents the first attempt to\nutilize graph neural networks (GNN) to aggregate these biomarkers for a novel\nglobal representation. We propose a two-stage MS inflammatory disease activity\nprediction approach. First, a 3D segmentation network detects lesions, and a\nself-supervised algorithm extracts their image features. Second, the detected\nlesions are used to build a patient graph. The lesions act as nodes in the\ngraph and are initialized with image features extracted in the first stage.\nFinally, the lesions are connected based on their spatial proximity and the\ninflammatory disease activity prediction is formulated as a graph\nclassification task. Furthermore, we propose a self-pruning strategy to\nauto-select the most critical lesions for prediction. Our proposed method\noutperforms the existing baseline by a large margin (AUCs of 0.67 vs. 0.61 and\n0.66 vs. 0.60 for one-year and two-year inflammatory disease activity,\nrespectively). Finally, our proposed method enjoys inherent explainability by\nassigning an importance score to each lesion for the overall prediction. Code\nis available at https://github.com/chinmay5/ms_ida.git\n","authors":["Chinmay Prabhakar","Hongwei Bran Li","Johannes C. 
Paetzold","Timo Loehr","Chen Niu","Mark Mühlau","Daniel Rueckert","Benedikt Wiestler","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2308.16863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.20091v3","updated":"2023-08-31T16:45:40Z","published":"2023-05-31T17:59:52Z","title":"Humans in 4D: Reconstructing and Tracking Humans with Transformers","summary":" We present an approach to reconstruct humans and track them over time. At the\ncore of our approach, we propose a fully \"transformerized\" version of a network\nfor human mesh recovery. This network, HMR 2.0, advances the state of the art\nand shows the capability to analyze unusual poses that have in the past been\ndifficult to reconstruct from single images. To analyze video, we use 3D\nreconstructions from HMR 2.0 as input to a tracking system that operates in 3D.\nThis enables us to deal with multiple people and maintain identities through\nocclusion events. Our complete approach, 4DHumans, achieves state-of-the-art\nresults for tracking people from monocular video. Furthermore, we demonstrate\nthe effectiveness of HMR 2.0 on the downstream task of action recognition,\nachieving significant improvements over previous pose-based action recognition\napproaches. Our code and models are available on the project website:\nhttps://shubham-goel.github.io/4dhumans/.\n","authors":["Shubham Goel","Georgios Pavlakos","Jathushan Rajasegaran","Angjoo Kanazawa","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2305.20091v3.pdf","comment":"In ICCV 2023. Project Webpage:\n https://shubham-goel.github.io/4dhumans/"},{"id":"http://arxiv.org/abs/2301.00752v3","updated":"2023-08-31T16:28:50Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. 
Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v3.pdf","comment":"Submitted to IEEE Transactions on Machine Learning in Communications\n and Networking"},{"id":"http://arxiv.org/abs/2308.16847v1","updated":"2023-08-31T16:26:17Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. We\nprovide a simple and versatile open-source\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16825v1","updated":"2023-08-31T15:56:29Z","published":"2023-08-31T15:56:29Z","title":"Coarse-to-Fine Amodal Segmentation with Shape Prior","summary":" Amodal object segmentation is a challenging task that involves segmenting\nboth visible and occluded parts of an object. In this paper, we propose a novel\napproach, called Coarse-to-Fine Segmentation (C2F-Seg), that addresses this\nproblem by progressively modeling the amodal segmentation. C2F-Seg initially\nreduces the learning space from the pixel-level image space to the\nvector-quantized latent space. This enables us to better handle long-range\ndependencies and learn a coarse-grained amodal segment from visual features and\nvisible segments. However, this latent space lacks detailed information about\nthe object, which makes it difficult to provide a precise segmentation\ndirectly. To address this issue, we propose a convolution refine module to\ninject fine-grained information and provide a more precise amodal object\nsegmentation based on visual features and coarse-predicted segmentation. To\nhelp the studies of amodal object segmentation, we create a synthetic amodal\ndataset, named as MOViD-Amodal (MOViD-A), which can be used for both image and\nvideo amodal object segmentation. We extensively evaluate our model on two\nbenchmark datasets: KINS and COCO-A. Our empirical results demonstrate the\nsuperiority of C2F-Seg. 
Moreover, we exhibit the potential of our approach for\nvideo amodal object segmentation tasks on FISHBOWL and our proposed MOViD-A.\nProject page at: http://jianxgao.github.io/C2F-Seg.\n","authors":["Jianxiong Gao","Xuelin Qian","Yikai Wang","Tianjun Xiao","Tong He","Zheng Zhang","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16825v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16819v1","updated":"2023-08-31T15:49:53Z","published":"2023-08-31T15:49:53Z","title":"BTSeg: Barlow Twins Regularization for Domain Adaptation in Semantic\n Segmentation","summary":" Semantic image segmentation is a critical component in many computer vision\nsystems, such as autonomous driving. In such applications, adverse conditions\n(heavy rain, night time, snow, extreme lighting) on the one hand pose specific\nchallenges, yet are typically underrepresented in the available datasets.\nGenerating more training data is cumbersome and expensive, and the process\nitself is error-prone due to the inherent aleatoric uncertainty. To address\nthis challenging problem, we propose BTSeg, which exploits image-level\ncorrespondences as weak supervision signal to learn a segmentation model that\nis agnostic to adverse conditions. To this end, our approach uses the Barlow\ntwins loss from the field of unsupervised learning and treats images taken at\nthe same location but under different adverse conditions as \"augmentations\" of\nthe same unknown underlying base image. This allows the training of a\nsegmentation model that is robust to appearance changes introduced by different\nadverse conditions. We evaluate our approach on ACDC and the new challenging\nACG benchmark to demonstrate its robustness and generalization capabilities.\nOur approach performs favorably when compared to the current state-of-the-art\nmethods, while also being simpler to implement and train. The code will be\nreleased upon acceptance.\n","authors":["Johannes Künzel","Anna Hilsmann","Peter Eisert"],"pdf_url":"https://arxiv.org/pdf/2308.16819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15975v2","updated":"2023-08-31T15:29:44Z","published":"2023-08-30T11:57:04Z","title":"RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation","summary":" For robots to be useful outside labs and specialized factories we need a way\nto teach them new useful behaviors quickly. Current approaches lack either the\ngenerality to onboard new tasks without task-specific engineering, or else lack\nthe data-efficiency to do so in an amount of time that enables practical use.\nIn this work we explore dense tracking as a representational vehicle to allow\nfaster and more general learning from demonstration. Our approach utilizes\nTrack-Any-Point (TAP) models to isolate the relevant motion in a demonstration,\nand parameterize a low-level controller to reproduce this motion across changes\nin the scene configuration. 
We show this results in robust robot policies that\ncan solve complex object-arrangement tasks such as shape-matching, stacking,\nand even full path-following tasks such as applying glue and sticking objects\ntogether, all from demonstrations that can be collected in minutes.\n","authors":["Mel Vecerik","Carl Doersch","Yi Yang","Todor Davchev","Yusuf Aytar","Guangyao Zhou","Raia Hadsell","Lourdes Agapito","Jon Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.15975v2.pdf","comment":"Project website: https://robotap.github.io"},{"id":"http://arxiv.org/abs/2308.16801v1","updated":"2023-08-31T15:23:33Z","published":"2023-08-31T15:23:33Z","title":"Multiscale Residual Learning of Graph Convolutional Sequence Chunks for\n Human Motion Prediction","summary":" A new method is proposed for human motion prediction by learning temporal and\nspatial dependencies. Recently, multiscale graphs have been developed to model\nthe human body at higher abstraction levels, resulting in more stable motion\nprediction. Current methods however predetermine scale levels and combine\nspatially proximal joints to generate coarser scales based on human priors,\neven though movement patterns in different motion sequences vary and do not\nfully comply with a fixed graph of spatially connected joints. Another problem\nwith graph convolutional methods is mode collapse, in which predicted poses\nconverge around a mean pose with no discernible movements, particularly in\nlong-term predictions. To tackle these issues, we propose ResChunk, an\nend-to-end network which explores dynamically correlated body components based\non the pairwise relationships between all joints in individual sequences.\nResChunk is trained to learn the residuals between target sequence chunks in an\nautoregressive manner to enforce the temporal connectivities between\nconsecutive chunks. It is hence a sequence-to-sequence prediction network which\nconsiders dynamic spatio-temporal features of sequences at multiple levels. Our\nexperiments on two challenging benchmark datasets, CMU Mocap and Human3.6M,\ndemonstrate that our proposed method is able to effectively model the sequence\ninformation for motion prediction and outperform other techniques to set a new\nstate-of-the-art. Our code is available at\nhttps://github.com/MohsenZand/ResChunk.\n","authors":["Mohsen Zand","Ali Etemad","Michael Greenspan"],"pdf_url":"https://arxiv.org/pdf/2308.16801v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.16777v1","updated":"2023-08-31T14:55:30Z","published":"2023-08-31T14:55:30Z","title":"Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models","summary":" Zero-shot referring image segmentation is a challenging task because it aims\nto find an instance segmentation mask based on the given referring\ndescriptions, without training on this type of paired data. Current zero-shot\nmethods mainly focus on using pre-trained discriminative models (e.g., CLIP).\nHowever, we have observed that generative models (e.g., Stable Diffusion) have\npotentially understood the relationships between various visual elements and\ntext descriptions, which are rarely investigated in this task. In this work, we\nintroduce a novel Referring Diffusional segmentor (Ref-Diff) for this task,\nwhich leverages the fine-grained multi-modal information from generative\nmodels. We demonstrate that without a proposal generator, a generative model\nalone can achieve comparable performance to existing SOTA weakly-supervised\nmodels. 
When we combine both generative and discriminative models, our Ref-Diff\noutperforms these competing methods by a significant margin. This indicates\nthat generative models are also beneficial for this task and can complement\ndiscriminative models for better referring segmentation. Our code is publicly\navailable at https://github.com/kodenii/Ref-Diff.\n","authors":["Minheng Ni","Yabo Zhang","Kailai Feng","Xiaoming Li","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.16777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16758v1","updated":"2023-08-31T14:26:33Z","published":"2023-08-31T14:26:33Z","title":"Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation\n Using only Images","summary":" Generating 3D faces from textual descriptions has a multitude of\napplications, such as gaming, movie, and robotics. Recent progresses have\ndemonstrated the success of unconditional 3D face generation and text-to-3D\nshape generation. However, due to the limited text-3D face data pairs,\ntext-driven 3D face generation remains an open problem. In this paper, we\npropose a text-guided 3D faces generation method, refer as TG-3DFace, for\ngenerating realistic 3D faces using text guidance. Specifically, we adopt an\nunconditional 3D face generation framework and equip it with text conditions,\nwhich learns the text-guided 3D face generation with only text-2D face data. On\ntop of that, we propose two text-to-face cross-modal alignment techniques,\nincluding the global contrastive learning and the fine-grained alignment\nmodule, to facilitate high semantic consistency between generated 3D faces and\ninput texts. Besides, we present directional classifier guidance during the\ninference process, which encourages creativity for out-of-domain generations.\nCompared to the existing methods, TG-3DFace creates more realistic and\naesthetically pleasing 3D faces, boosting 9% multi-view consistency (MVIC) over\nLatent3D. The rendered face images generated by TG-3DFace achieve higher FID\nand CLIP score than text-to-2D face/image generation models, demonstrating our\nsuperiority in generating realistic and semantic-consistent textures.\n","authors":["Cuican Yu","Guansong Lu","Yihan Zeng","Jian Sun","Xiaodan Liang","Huibin Li","Zongben Xu","Songcen Xu","Wei Zhang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.16758v1.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.13241v4","updated":"2023-08-31T14:15:53Z","published":"2023-03-23T13:18:05Z","title":"6D Object Pose Estimation from Approximate 3D Models for Orbital\n Robotics","summary":" We present a novel technique to estimate the 6D pose of objects from single\nimages where the 3D geometry of the object is only given approximately and not\nas a precise 3D model. To achieve this, we employ a dense 2D-to-3D\ncorrespondence predictor that regresses 3D model coordinates for every pixel.\nIn addition to the 3D coordinates, our model also estimates the pixel-wise\ncoordinate error to discard correspondences that are likely wrong. This allows\nus to generate multiple 6D pose hypotheses of the object, which we then refine\niteratively using a highly efficient region-based approach. We also introduce a\nnovel pixel-wise posterior formulation by which we can estimate the probability\nfor each hypothesis and select the most likely one. As we show in experiments,\nour approach is capable of dealing with extreme visual conditions including\noverexposure, high contrast, or low signal-to-noise ratio. 
This makes it a\npowerful technique for the particularly challenging task of estimating the pose\nof tumbling satellites for in-orbit robotic applications. Our method achieves\nstate-of-the-art performance on the SPEED+ dataset and has won the SPEC2021\npost-mortem competition.\n","authors":["Maximilian Ulmer","Maximilian Durner","Martin Sundermeyer","Manuel Stoiber","Rudolph Triebel"],"pdf_url":"https://arxiv.org/pdf/2303.13241v4.pdf","comment":"Proceedings of IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2304.05821v2","updated":"2023-08-31T14:15:51Z","published":"2023-04-12T12:59:02Z","title":"DUFormer: Solving Power Line Detection Task in Aerial Images using\n Semantic Segmentation","summary":" Unmanned aerial vehicles (UAVs) are frequently used for inspecting power\nlines and capturing high-resolution aerial images. However, detecting power\nlines in aerial images is difficult,as the foreground data(i.e, power lines) is\nsmall and the background information is abundant.To tackle this problem, we\nintroduce DUFormer, a semantic segmentation algorithm explicitly designed to\ndetect power lines in aerial images. We presuppose that it is advantageous to\ntrain an efficient Transformer model with sufficient feature extraction using a\nconvolutional neural network(CNN) with a strong inductive bias.With this goal\nin mind, we introduce a heavy token encoder that performs overlapping feature\nremodeling and tokenization. The encoder comprises a pyramid CNN feature\nextraction module and a power line feature enhancement module.After successful\nlocal feature extraction for power lines, feature fusion is conducted.Then,the\nTransformer block is used for global modeling. The final segmentation result is\nachieved by amalgamating local and global features in the decode head.Moreover,\nwe demonstrate the importance of the joint multi-weight loss function in power\nline segmentation. Our experimental results show that our proposed method\noutperforms all state-of-the-art methods in power line segmentation on the\npublicly accessible TTPLA dataset.\n","authors":["Deyu An","Qiang Zhang","Jianshu Chao","Ting Li","Feng Qiao","Yong Deng","Zhenpeng Bian"],"pdf_url":"https://arxiv.org/pdf/2304.05821v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16742v1","updated":"2023-08-31T14:00:47Z","published":"2023-08-31T14:00:47Z","title":"Unsupervised CT Metal Artifact Reduction by Plugging Diffusion Priors in\n Dual Domains","summary":" During the process of computed tomography (CT), metallic implants often cause\ndisruptive artifacts in the reconstructed images, impeding accurate diagnosis.\nSeveral supervised deep learning-based approaches have been proposed for\nreducing metal artifacts (MAR). However, these methods heavily rely on training\nwith simulated data, as obtaining paired metal artifact CT and clean CT data in\nclinical settings is challenging. This limitation can lead to decreased\nperformance when applying these methods in clinical practice. Existing\nunsupervised MAR methods, whether based on learning or not, typically operate\nwithin a single domain, either in the image domain or the sinogram domain. In\nthis paper, we propose an unsupervised MAR method based on the diffusion model,\na generative model with a high capacity to represent data distributions.\nSpecifically, we first train a diffusion model using CT images without metal\nartifacts. 
Subsequently, we iteratively utilize the priors embedded within the\npre-trained diffusion model in both the sinogram and image domains to restore\nthe degraded portions caused by metal artifacts. This dual-domain processing\nempowers our approach to outperform existing unsupervised MAR methods,\nincluding another MAR method based on the diffusion model, which we have\nqualitatively and quantitatively validated using synthetic datasets. Moreover,\nour method demonstrates superior visual results compared to both supervised and\nunsupervised methods on clinical datasets.\n","authors":["Xuan Liu","Yaoqin Xie","Songhui Diao","Shan Tan","Xiaokun Liang"],"pdf_url":"https://arxiv.org/pdf/2308.16742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16741v1","updated":"2023-08-31T13:59:35Z","published":"2023-08-31T13:59:35Z","title":"Socratis: Are large multimodal models emotionally aware?","summary":" Existing emotion prediction benchmarks contain coarse emotion labels which do\nnot consider the diversity of emotions that an image and text can elicit in\nhumans due to various reasons. Learning diverse reactions to multimodal content\nis important as intelligent machines take a central role in generating and\ndelivering content to society. To address this gap, we propose Socratis, a\n\\underline{soc}ietal \\underline{r}e\\underline{a}c\\underline{ti}on\\underline{s}\nbenchmark, where each image-caption (IC) pair is annotated with multiple\nemotions and the reasons for feeling them. Socratis contains 18K free-form\nreactions for 980 emotions on 2075 image-caption pairs from 5 widely-read news\nand image-caption (IC) datasets. We benchmark the capability of\nstate-of-the-art multimodal large language models to generate the reasons for\nfeeling an emotion given an IC pair. Based on a preliminary human study, we\nobserve that humans prefer human-written reasons over 2 times more often than\nmachine-generated ones. This shows our task is harder than standard generation\ntasks because it starkly contrasts recent findings where humans cannot tell\napart machine vs human-written news articles, for instance. We further see that\ncurrent captioning metrics based on large vision-language models also fail to\ncorrelate with human preferences. We hope that these findings and our benchmark\nwill inspire further research on training emotionally aware models.\n","authors":["Katherine Deng","Arijit Ray","Reuben Tan","Saadia Gabriel","Bryan A. Plummer","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2308.16741v1.pdf","comment":"ICCV 2023 WECIA"},{"id":"http://arxiv.org/abs/2308.16739v1","updated":"2023-08-31T13:57:38Z","published":"2023-08-31T13:57:38Z","title":"Parsing is All You Need for Accurate Gait Recognition in the Wild","summary":" Binary silhouettes and keypoint-based skeletons have dominated human gait\nrecognition studies for decades since they are easy to extract from video\nframes. Despite their success in gait recognition for in-the-lab environments,\nthey usually fail in real-world scenarios due to their low information entropy\nfor gait representations. To achieve accurate gait recognition in the wild,\nthis paper presents a novel gait representation, named Gait Parsing Sequence\n(GPS). GPSs are sequences of fine-grained human segmentation, i.e., human\nparsing, extracted from video frames, so they have much higher information\nentropy to encode the shapes and dynamics of fine-grained human parts during\nwalking. 
Moreover, to effectively explore the capability of the GPS\nrepresentation, we propose a novel human parsing-based gait recognition\nframework, named ParsingGait. ParsingGait contains a Convolutional Neural\nNetwork (CNN)-based backbone and two light-weighted heads. The first head\nextracts global semantic features from GPSs, while the other one learns mutual\ninformation of part-level features through Graph Convolutional Networks to\nmodel the detailed dynamics of human walking. Furthermore, due to the lack of\nsuitable datasets, we build the first parsing-based dataset for gait\nrecognition in the wild, named Gait3D-Parsing, by extending the large-scale and\nchallenging Gait3D dataset. Based on Gait3D-Parsing, we comprehensively\nevaluate our method and existing gait recognition methods. The experimental\nresults show a significant improvement in accuracy brought by the GPS\nrepresentation and the superiority of ParsingGait. The code and dataset are\navailable at https://gait3d.github.io/gait3d-parsing-hp .\n","authors":["Jinkai Zheng","Xinchen Liu","Shuai Wang","Lihao Wang","Chenggang Yan","Wu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16739v1.pdf","comment":"16 pages, 14 figures, ACM MM 2023 accepted, project page:\n https://gait3d.github.io/gait3d-parsing-hp"},{"id":"http://arxiv.org/abs/2308.16738v1","updated":"2023-08-31T13:54:57Z","published":"2023-08-31T13:54:57Z","title":"US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for\n Cervical Lymph Node Lesions Diagnoses in Ultrasound Images","summary":" Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph\nnode lesions. However, the diagnoses of these images largely hinge on the\nexpertise of medical practitioners, rendering the process susceptible to\nmisdiagnoses. Although rapidly developing deep learning has substantially\nimproved the diagnoses of diverse ultrasound images, there remains a\nconspicuous research gap concerning cervical lymph nodes. The objective of our\nwork is to accurately diagnose cervical lymph node lesions by leveraging a deep\nlearning model. To this end, we first collected 3392 images containing normal\nlymph nodes, benign lymph node lesions, malignant primary lymph node lesions,\nand malignant metastatic lymph node lesions. Given that ultrasound images are\ngenerated by the reflection and scattering of sound waves across varied bodily\ntissues, we proposed the Conv-FFT Block. It integrates convolutional operations\nwith the fast Fourier transform to more astutely model the images. Building\nupon this foundation, we designed a novel architecture, named US-SFNet. This\narchitecture not only discerns variances in ultrasound images from the spatial\ndomain but also adeptly captures microstructural alterations across various\nlesions in the frequency domain. To ascertain the potential of US-SFNet, we\nbenchmarked it against 12 popular architectures through five-fold\ncross-validation. 
The results show that US-SFNet is SOTA and can achieve 92.89%\naccuracy, 90.46% precision, 89.95% sensitivity and 97.49% specificity,\nrespectively.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Bingchun Luo","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16735v1","updated":"2023-08-31T13:52:28Z","published":"2023-08-31T13:52:28Z","title":"Post-Deployment Adaptation with Access to Source Data via Federated\n Learning and Source-Target Remote Gradient Alignment","summary":" Deployment of Deep Neural Networks in medical imaging is hindered by\ndistribution shift between training data and data processed after deployment,\ncausing performance degradation. Post-Deployment Adaptation (PDA) addresses\nthis by tailoring a pre-trained, deployed model to the target data distribution\nusing limited labelled or entirely unlabelled target data, while assuming no\naccess to source training data as they cannot be deployed with the model due to\nprivacy concerns and their large size. This makes reliable adaptation\nchallenging due to limited learning signal. This paper challenges this\nassumption and introduces FedPDA, a novel adaptation framework that brings the\nutility of learning from remote data from Federated Learning into PDA. FedPDA\nenables a deployed model to obtain information from source data via remote\ngradient exchange, while aiming to optimize the model specifically for the\ntarget domain. Tailored for FedPDA, we introduce a novel optimization method\nStarAlign (Source-Target Remote Gradient Alignment) that aligns gradients\nbetween source-target domain pairs by maximizing their inner product, to\nfacilitate learning a target-specific model. We demonstrate the method's\neffectiveness using multi-center databases for the tasks of cancer metastases\ndetection and skin lesion classification, where our method compares favourably\nto previous work. Code is available at: https://github.com/FelixWag/StarAlign\n","authors":["Felix Wagner","Zeju Li","Pramit Saha","Konstantinos Kamnitsas"],"pdf_url":"https://arxiv.org/pdf/2308.16735v1.pdf","comment":"This version was accepted for the Machine Learning in Medical Imaging\n (MLMI 2023) workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.17595v2","updated":"2023-08-31T13:45:28Z","published":"2023-06-30T12:14:13Z","title":"RBSR: Efficient and Flexible Recurrent Network for Burst\n Super-Resolution","summary":" Burst super-resolution (BurstSR) aims at reconstructing a high-resolution\n(HR) image from a sequence of low-resolution (LR) and noisy images, which is\nconducive to enhancing the imaging effects of smartphones with limited sensors.\nThe main challenge of BurstSR is to effectively combine the complementary\ninformation from input frames, while existing methods still struggle with it.\nIn this paper, we suggest fusing cues frame-by-frame with an efficient and\nflexible recurrent network. In particular, we emphasize the role of the\nbase-frame and utilize it as a key prompt to guide the knowledge acquisition\nfrom other frames in every recurrence. Moreover, we introduce an implicit\nweighting loss to improve the model's flexibility in facing input frames with\nvariable numbers. Extensive experiments on both synthetic and real-world\ndatasets demonstrate that our method achieves better results than\nstate-of-the-art ones. 
Codes and pre-trained models are available at\nhttps://github.com/ZcsrenlongZ/RBSR.\n","authors":["Renlong Wu","Zhilu Zhang","Shuohao Zhang","Hongzhi Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2306.17595v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.16725v1","updated":"2023-08-31T13:41:34Z","published":"2023-08-31T13:41:34Z","title":"Terrain Diffusion Network: Climatic-Aware Terrain Generation with\n Geological Sketch Guidance","summary":" Sketch-based terrain generation seeks to create realistic landscapes for\nvirtual environments in various applications such as computer games, animation\nand virtual reality. Recently, deep learning based terrain generation has\nemerged, notably the ones based on generative adversarial networks (GAN).\nHowever, these methods often struggle to fulfill the requirements of flexible\nuser control and maintain generative diversity for realistic terrain.\nTherefore, we propose a novel diffusion-based method, namely terrain diffusion\nnetwork (TDN), which actively incorporates user guidance for enhanced\ncontrollability, taking into account terrain features like rivers, ridges,\nbasins, and peaks. Instead of adhering to a conventional monolithic denoising\nprocess, which often compromises the fidelity of terrain details or the\nalignment with user control, a multi-level denoising scheme is proposed to\ngenerate more realistic terrains by taking into account fine-grained details,\nparticularly those related to climatic patterns influenced by erosion and\ntectonic activities. Specifically, three terrain synthesisers are designed for\nstructural, intermediate, and fine-grained level denoising purposes, which\nallow each synthesiser concentrate on a distinct terrain aspect. Moreover, to\nmaximise the efficiency of our TDN, we further introduce terrain and sketch\nlatent spaces for the synthesizers with pre-trained terrain autoencoders.\nComprehensive experiments on a new dataset constructed from NASA Topology\nImages clearly demonstrate the effectiveness of our proposed method, achieving\nthe state-of-the-art performance. Our code and dataset will be publicly\navailable.\n","authors":["Zexin Hu","Kun Hu","Clinton Mo","Lei Pan","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16714v1","updated":"2023-08-31T13:28:32Z","published":"2023-08-31T13:28:32Z","title":"Towards Vehicle-to-everything Autonomous Driving: A Survey on\n Collaborative Perception","summary":" Vehicle-to-everything (V2X) autonomous driving opens up a promising direction\nfor developing a new generation of intelligent transportation systems.\nCollaborative perception (CP) as an essential component to achieve V2X can\novercome the inherent limitations of individual perception, including occlusion\nand long-range perception. In this survey, we provide a comprehensive review of\nCP methods for V2X scenarios, bringing a profound and in-depth understanding to\nthe community. Specifically, we first introduce the architecture and workflow\nof typical V2X systems, which affords a broader perspective to understand the\nentire V2X system and the role of CP within it. Then, we thoroughly summarize\nand analyze existing V2X perception datasets and CP methods. Particularly, we\nintroduce numerous CP methods from various crucial perspectives, including\ncollaboration stages, roadside sensors placement, latency compensation,\nperformance-bandwidth trade-off, attack/defense, pose alignment, etc. 
Moreover,\nwe conduct extensive experimental analyses to compare and examine current CP\nmethods, revealing some essential and unexplored insights. Specifically, we\nanalyze the performance changes of different methods under different\nbandwidths, providing a deep insight into the performance-bandwidth trade-off\nissue. Also, we examine methods under different LiDAR ranges. To study the\nmodel robustness, we further investigate the effects of various simulated\nreal-world noises on the performance of different CP methods, covering\ncommunication latency, lossy communication, localization errors, and mixed\nnoises. In addition, we look into the sim-to-real generalization ability of\nexisting CP methods. At last, we thoroughly discuss issues and challenges,\nhighlighting promising directions for future efforts. Our codes for\nexperimental analysis will be public at\nhttps://github.com/memberRE/Collaborative-Perception.\n","authors":["Si Liu","Chen Gao","Yuan Chen","Xingyu Peng","Xianghao Kong","Kun Wang","Runsheng Xu","Wentao Jiang","Hao Xiang","Jiaqi Ma","Miao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16714v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2307.14863v3","updated":"2023-08-31T13:25:59Z","published":"2023-07-27T13:49:27Z","title":"IML-ViT: Benchmarking Image Manipulation Localization by Vision\n Transformer","summary":" Advanced image tampering techniques are increasingly challenging the\ntrustworthiness of multimedia, leading to the development of Image Manipulation\nLocalization (IML). But what makes a good IML model? The answer lies in the way\nto capture artifacts. Exploiting artifacts requires the model to extract\nnon-semantic discrepancies between manipulated and authentic regions,\nnecessitating explicit comparisons between the two areas. With the\nself-attention mechanism, naturally, the Transformer should be a better\ncandidate to capture artifacts. However, due to limited datasets, there is\ncurrently no pure ViT-based approach for IML to serve as a benchmark, and CNNs\ndominate the entire task. Nevertheless, CNNs suffer from weak long-range and\nnon-semantic modeling. To bridge this gap, based on the fact that artifacts are\nsensitive to image resolution, amplified under multi-scale features, and\nmassive at the manipulation border, we formulate the answer to the former\nquestion as building a ViT with high-resolution capacity, multi-scale feature\nextraction capability, and manipulation edge supervision that could converge\nwith a small amount of data. We term this simple but effective ViT paradigm\nIML-ViT, which has significant potential to become a new benchmark for IML.\nExtensive experiments on five benchmark datasets verified our model outperforms\nthe state-of-the-art manipulation localization methods.Code and models are\navailable at \\url{https://github.com/SunnyHaze/IML-ViT}.\n","authors":["Xiaochen Ma","Bo Du","Zhuohang Jiang","Ahmed Y. Al Hammadi","Jizhe Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.14863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14505v2","updated":"2023-08-31T13:10:04Z","published":"2023-04-03T11:45:27Z","title":"Transformer-based interpretable multi-modal data fusion for skin lesion\n classification","summary":" A lot of deep learning (DL) research these days is mainly focused on\nimproving quantitative metrics regardless of other factors. 
In human-centered\napplications, like skin lesion classification in dermatology, DL-driven\nclinical decision support systems are still in their infancy due to the limited\ntransparency of their decision-making process. Moreover, the lack of procedures\nthat can explain the behavior of trained DL algorithms leads to almost no trust\nfrom clinical physicians. To diagnose skin lesions, dermatologists rely on\nvisual assessment of the disease and the data gathered from the patient's\nanamnesis. Data-driven algorithms dealing with multi-modal data are limited by\nthe separation of feature-level and decision-level fusion procedures required\nby convolutional architectures. To address this issue, we enable single-stage\nmulti-modal data fusion via the attention mechanism of transformer-based\narchitectures to aid in diagnosing skin diseases. Our method beats other\nstate-of-the-art single- and multi-modal DL architectures in image-rich and\npatient-data-rich environments. Additionally, the choice of the architecture\nenables native interpretability support for the classification task both in the\nimage and metadata domain with no additional modifications necessary.\n","authors":["Theodor Cheslerean-Boghiu","Melia-Evelina Fleischmann","Theresa Willem","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2304.14505v2.pdf","comment":"Submitted to IEEE JBHI in July 2023"},{"id":"http://arxiv.org/abs/2303.07806v2","updated":"2023-08-31T13:00:55Z","published":"2023-03-14T11:25:02Z","title":"USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised\n Semantic Segmentation","summary":" Seed area generation is usually the starting point of weakly supervised\nsemantic segmentation (WSSS). Computing the Class Activation Map (CAM) from a\nmulti-label classification network is the de facto paradigm for seed area\ngeneration, but CAMs generated from Convolutional Neural Networks (CNNs) and\nTransformers are prone to be under- and over-activated, respectively, which\nmakes the strategies to refine CAMs for CNNs usually inappropriate for\nTransformers, and vice versa. In this paper, we propose a Unified optimization\nparadigm for Seed Area GEneration (USAGE) for both types of networks, in which\nthe objective function to be optimized consists of two terms: One is a\ngeneration loss, which controls the shape of seed areas by a temperature\nparameter following a deterministic principle for different types of networks;\nThe other is a regularization loss, which ensures the consistency between the\nseed areas that are generated by self-adaptive network adjustment from\ndifferent views, to overturn false activation in seed areas. Experimental\nresults show that USAGE consistently improves seed area generation for both\nCNNs and Transformers by large margins, e.g., outperforming state-of-the-art\nmethods by a mIoU of 4.1% on PASCAL VOC. 
Moreover, based on the USAGE-generated\nseed areas on Transformers, we achieve state-of-the-art WSSS results on both\nPASCAL VOC and MS COCO.\n","authors":["Zelin Peng","Guanchun Wang","Lingxi Xie","Dongsheng Jiang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2303.07806v2.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2308.16689v1","updated":"2023-08-31T12:46:36Z","published":"2023-08-31T12:46:36Z","title":"ViLTA: Enhancing Vision-Language Pre-training through Textual\n Augmentation","summary":" Vision-language pre-training (VLP) methods are blossoming recently, and its\ncrucial goal is to jointly learn visual and textual features via a\ntransformer-based architecture, demonstrating promising improvements on a\nvariety of vision-language tasks. Prior arts usually focus on how to align\nvisual and textual features, but strategies for improving the robustness of\nmodel and speeding up model convergence are left insufficiently explored.\n In this paper, we propose a novel method ViLTA, comprising of two components\nto further facilitate the model to learn fine-grained representations among\nimage-text pairs. For Masked Language Modeling (MLM), we propose a\ncross-distillation method to generate soft labels to enhance the robustness of\nmodel, which alleviates the problem of treating synonyms of masked words as\nnegative samples in one-hot labels. For Image-Text Matching (ITM), we leverage\nthe current language encoder to synthesize hard negatives based on the context\nof language input, encouraging the model to learn high-quality representations\nby increasing the difficulty of the ITM task. By leveraging the above\ntechniques, our ViLTA can achieve better performance on various vision-language\ntasks. Extensive experiments on benchmark datasets demonstrate that the\neffectiveness of ViLTA and its promising potential for vision-language\npre-training.\n","authors":["Weihan Wang","Zhen Yang","Bin Xu","Juanzi Li","Yankui Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16689v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2112.08060v2","updated":"2023-08-31T12:41:13Z","published":"2021-12-15T11:55:11Z","title":"Leveraging Image-based Generative Adversarial Networks for Time Series\n Generation","summary":" Generative models for images have gained significant attention in computer\nvision and natural language processing due to their ability to generate\nrealistic samples from complex data distributions. To leverage the advances of\nimage-based generative models for the time series domain, we propose a\ntwo-dimensional image representation for time series, the Extended\nIntertemporal Return Plot (XIRP). Our approach captures the intertemporal time\nseries dynamics in a scale-invariant and invertible way, reducing training time\nand improving sample quality. We benchmark synthetic XIRPs obtained by an\noff-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image\nrepresentations and models regarding similarity and predictive ability metrics.\nOur novel, validated image representation for time series consistently and\nsignificantly outperforms a state-of-the-art RNN-based generative model\nregarding predictive ability. 
Further, we introduce an improved stochastic\ninversion to substantially improve simulation quality regardless of the\nrepresentation and provide the prospect of transfer potentials in other\ndomains.\n","authors":["Justin Hellermann","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2112.08060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16684v1","updated":"2023-08-31T12:38:29Z","published":"2023-08-31T12:38:29Z","title":"Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor\n Attack","summary":" The vulnerabilities to backdoor attacks have recently threatened the\ntrustworthiness of machine learning models in practical applications.\nConventional wisdom suggests that not everyone can be an attacker since the\nprocess of designing the trigger generation algorithm often involves\nsignificant effort and extensive experimentation to ensure the attack's\nstealthiness and effectiveness. Alternatively, this paper shows that there\nexists a more severe backdoor threat: anyone can exploit an easily-accessible\nalgorithm for silent backdoor attacks. Specifically, this attacker can employ\nthe widely-used lossy image compression from a plethora of compression tools to\neffortlessly inject a trigger pattern into an image without leaving any\nnoticeable trace; i.e., the generated triggers are natural artifacts. One does\nnot require extensive knowledge to click on the \"convert\" or \"save as\" button\nwhile using tools for lossy image compression. Via this attack, the adversary\ndoes not need to design a trigger generator as seen in prior works and only\nrequires poisoning the data. Empirically, the proposed attack consistently\nachieves 100% attack success rate in several benchmark datasets such as MNIST,\nCIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still\nachieve almost 100% attack success rate with very small (approximately 10%)\npoisoning rates in the clean label setting. The generated trigger of the\nproposed attack using one lossy compression algorithm is also transferable\nacross other related compression algorithms, exacerbating the severity of this\nbackdoor threat. This work takes another crucial step toward understanding the\nextensive risks of backdoor attacks in practice, urging practitioners to\ninvestigate similar attacks and relevant backdoor mitigation methods.\n","authors":["Sze Jue Yang","Quang Nguyen","Chee Seng Chan","Khoa Doan"],"pdf_url":"https://arxiv.org/pdf/2308.16684v1.pdf","comment":"14 pages. This paper shows everyone can mount a powerful and stealthy\n backdoor attack with the widely-used lossy image compression"},{"id":"http://arxiv.org/abs/2308.16682v1","updated":"2023-08-31T12:36:50Z","published":"2023-08-31T12:36:50Z","title":"Diffusion Inertial Poser: Human Motion Reconstruction from Arbitrary\n Sparse IMU Configurations","summary":" Motion capture from a limited number of inertial measurement units (IMUs) has\nimportant applications in health, human performance, and virtual reality.\nReal-world limitations and application-specific goals dictate different IMU\nconfigurations (i.e., number of IMUs and chosen attachment body segments),\ntrading off accuracy and practicality. Although recent works were successful in\naccurately reconstructing whole-body motion from six IMUs, these systems only\nwork with a specific IMU configuration. Here we propose a single diffusion\ngenerative model, Diffusion Inertial Poser (DiffIP), which reconstructs human\nmotion in real-time from arbitrary IMU configurations. 
We show that DiffIP has\nthe benefit of flexibility with respect to the IMU configuration while being as\naccurate as the state-of-the-art for the commonly used six IMU configuration.\nOur system enables selecting an optimal configuration for different\napplications without retraining the model. For example, when only four IMUs are\navailable, DiffIP found that the configuration that minimizes errors in joint\nkinematics instruments the thighs and forearms. However, global translation\nreconstruction is better when instrumenting the feet instead of the thighs.\nAlthough our approach is agnostic to the underlying model, we built DiffIP\nbased on physiologically realistic musculoskeletal models to enable use in\nbiomedical research and health applications.\n","authors":["Tom Van Wouwe","Seunghwan Lee","Antoine Falisse","Scott Delp","C. Karen Liu"],"pdf_url":"https://arxiv.org/pdf/2308.16682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10892v2","updated":"2023-08-31T12:09:45Z","published":"2022-11-20T07:30:15Z","title":"Towards Realistic Out-of-Distribution Detection: A Novel Evaluation\n Framework for Improving Generalization in OOD Detection","summary":" This paper presents a novel evaluation framework for Out-of-Distribution\n(OOD) detection that aims to assess the performance of machine learning models\nin more realistic settings. We observed that the real-world requirements for\ntesting OOD detection methods are not satisfied by the current testing\nprotocols. They usually encourage methods to have a strong bias towards a low\nlevel of diversity in normal data. To address this limitation, we propose new\nOOD test datasets (CIFAR-10-R, CIFAR-100-R, and ImageNet-30-R) that can allow\nresearchers to benchmark OOD detection performance under realistic distribution\nshifts. Additionally, we introduce a Generalizability Score (GS) to measure the\ngeneralization ability of a model during OOD detection. Our experiments\ndemonstrate that improving the performance on existing benchmark datasets does\nnot necessarily improve the usability of OOD detection models in real-world\nscenarios. While leveraging deep pre-trained features has been identified as a\npromising avenue for OOD detection research, our experiments show that\nstate-of-the-art pre-trained models tested on our proposed datasets suffer a\nsignificant drop in performance. To address this issue, we propose a\npost-processing stage for adapting pre-trained features under these\ndistribution shifts before calculating the OOD scores, which significantly\nenhances the performance of state-of-the-art pre-trained models on our\nbenchmarks.\n","authors":["Vahid Reza Khazaie","Anthony Wong","Mohammad Sabokrou"],"pdf_url":"https://arxiv.org/pdf/2211.10892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14500v3","updated":"2023-08-31T12:02:47Z","published":"2023-08-28T11:20:48Z","title":"LAC: Latent Action Composition for Skeleton-based Action Segmentation","summary":" Skeleton-based action segmentation requires recognizing composable actions in\nuntrimmed videos. Current approaches decouple this problem by first extracting\nlocal visual features from skeleton sequences and then processing them by a\ntemporal model to classify frame-wise actions. However, their performances\nremain limited as the visual features cannot sufficiently express composable\nactions. 
In this context, we propose Latent Action Composition (LAC), a novel\nself-supervised framework aiming at learning from synthesized composable\nmotions for skeleton-based action segmentation. LAC is composed of a novel\ngeneration module towards synthesizing new sequences. Specifically, we design a\nlinear latent space in the generator to represent primitive motion. New\ncomposed motions can be synthesized by simply performing arithmetic operations\non latent representations of multiple input skeleton sequences. LAC leverages\nsuch synthesized sequences, which have large diversity and complexity, for\nlearning visual representations of skeletons in both sequence and frame spaces\nvia contrastive learning. The resulting visual encoder has a high expressive\npower and can be effectively transferred onto action segmentation tasks by\nend-to-end fine-tuning without the need for additional temporal models. We\nconduct a study focusing on transfer-learning and we show that representations\nlearned from pre-trained LAC outperform the state-of-the-art by a large margin\non TSU, Charades, PKU-MMD datasets.\n","authors":["Di Yang","Yaohui Wang","Antitza Dantcheva","Quan Kong","Lorenzo Garattoni","Gianpiero Francesca","Francois Bremond"],"pdf_url":"https://arxiv.org/pdf/2308.14500v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16651v1","updated":"2023-08-31T11:51:16Z","published":"2023-08-31T11:51:16Z","title":"SoccerNet 2023 Tracking Challenge -- 3rd place MOT4MOT Team Technical\n Report","summary":" The SoccerNet 2023 tracking challenge requires the detection and tracking of\nsoccer players and the ball. In this work, we present our approach to tackle\nthese tasks separately. We employ a state-of-the-art online multi-object\ntracker and a contemporary object detector for player tracking. To overcome the\nlimitations of our online approach, we incorporate a post-processing stage\nusing interpolation and appearance-free track merging. Additionally, an\nappearance-based track merging technique is used to handle the termination and\ncreation of tracks far from the image boundaries. Ball tracking is formulated\nas single object detection, and a fine-tuned YOLOv8l detector with proprietary\nfiltering improves the detection precision. Our method achieves 3rd place on\nthe SoccerNet 2023 tracking challenge with a HOTA score of 66.27.\n","authors":["Gal Shitrit","Ishay Be'ery","Ido Yerhushalmy"],"pdf_url":"https://arxiv.org/pdf/2308.16651v1.pdf","comment":"3 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.16649v1","updated":"2023-08-31T11:46:27Z","published":"2023-08-31T11:46:27Z","title":"Learning with Multi-modal Gradient Attention for Explainable Composed\n Image Retrieval","summary":" We consider the problem of composed image retrieval that takes an input query\nconsisting of an image and a modification text indicating the desired changes\nto be made on the image and retrieves images that match these changes. Current\nstate-of-the-art techniques that address this problem use global features for\nthe retrieval, resulting in incorrect localization of the regions of interest\nto be modified because of the global nature of the features, more so in cases\nof real-world, in-the-wild images. Since modifier texts usually correspond to\nspecific local changes in an image, it is critical that models learn local\nfeatures to be able to both localize and retrieve better. 
To this end, our key\nnovelty is a new gradient-attention-based learning objective that explicitly\nforces the model to focus on the local regions of interest being modified in\neach retrieval step. We achieve this by first proposing a new visual image\nattention computation technique, which we call multi-modal gradient attention\n(MMGrad) that is explicitly conditioned on the modifier text. We next\ndemonstrate how MMGrad can be incorporated into an end-to-end model training\nstrategy with a new learning objective that explicitly forces these MMGrad\nattention maps to highlight the correct local regions corresponding to the\nmodifier text. By training retrieval models with this new loss function, we\nshow improved grounding by means of better visual attention maps, leading to\nbetter explainability of the models as well as competitive quantitative\nretrieval performance on standard benchmark datasets.\n","authors":["Prateksha Udhayanan","Srikrishna Karanam","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2308.16649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16648v1","updated":"2023-08-31T11:44:40Z","published":"2023-08-31T11:44:40Z","title":"Generate Your Own Scotland: Satellite Image Generation Conditioned on\n Maps","summary":" Despite recent advancements in image generation, diffusion models still\nremain largely underexplored in Earth Observation. In this paper we show that\nstate-of-the-art pretrained diffusion models can be conditioned on cartographic\ndata to generate realistic satellite images. We provide two large datasets of\npaired OpenStreetMap images and satellite views over the region of Mainland\nScotland and the Central Belt. We train a ControlNet model and qualitatively\nevaluate the results, demonstrating that both image quality and map fidelity\nare possible. Finally, we provide some insights on the opportunities and\nchallenges of applying these models for remote sensing. Our model weights and\ncode for creating the dataset are publicly available at\nhttps://github.com/miquel-espinosa/map-sat.\n","authors":["Miguel Espinosa","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2308.16648v1.pdf","comment":"13 pages, 6 figures. preprint"},{"id":"http://arxiv.org/abs/2302.07669v2","updated":"2023-08-31T11:24:15Z","published":"2023-02-15T14:06:39Z","title":"Unsupervised Hashing with Similarity Distribution Calibration","summary":" Unsupervised hashing methods typically aim to preserve the similarity between\ndata points in a feature space by mapping them to binary hash codes. However,\nthese methods often overlook the fact that the similarity between data points\nin the continuous feature space may not be preserved in the discrete hash code\nspace, due to the limited similarity range of hash codes. The similarity range\nis bounded by the code length and can lead to a problem known as similarity\ncollapse. That is, the positive and negative pairs of data points become less\ndistinguishable from each other in the hash space. To alleviate this problem,\nin this paper a novel Similarity Distribution Calibration (SDC) method is\nintroduced. SDC aligns the hash code similarity distribution towards a\ncalibration distribution (e.g., beta distribution) with sufficient spread\nacross the entire similarity range, thus alleviating the similarity collapse\nproblem. Extensive experiments show that our SDC outperforms significantly the\nstate-of-the-art alternatives on coarse category-level and instance-level image\nretrieval. 
Code is available at https://github.com/kamwoh/sdc.\n","authors":["Kam Woh Ng","Xiatian Zhu","Jiun Tian Hoe","Chee Seng Chan","Tianyu Zhang","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2302.07669v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.16637v1","updated":"2023-08-31T11:11:38Z","published":"2023-08-31T11:11:38Z","title":"Learning Channel Importance for High Content Imaging with Interpretable\n Deep Input Channel Mixing","summary":" Uncovering novel drug candidates for treating complex diseases remain one of\nthe most challenging tasks in early discovery research. To tackle this\nchallenge, biopharma research established a standardized high content imaging\nprotocol that tags different cellular compartments per image channel. In order\nto judge the experimental outcome, the scientist requires knowledge about the\nchannel importance with respect to a certain phenotype for decoding the\nunderlying biology. In contrast to traditional image analysis approaches, such\nexperiments are nowadays preferably analyzed by deep learning based approaches\nwhich, however, lack crucial information about the channel importance. To\novercome this limitation, we present a novel approach which utilizes\nmulti-spectral information of high content images to interpret a certain aspect\nof cellular biology. To this end, we base our method on image blending concepts\nwith alpha compositing for an arbitrary number of channels. More specifically,\nwe introduce DCMIX, a lightweight, scaleable and end-to-end trainable mixing\nlayer which enables interpretable predictions in high content imaging while\nretaining the benefits of deep learning based methods. We employ an extensive\nset of experiments on both MNIST and RXRX1 datasets, demonstrating that DCMIX\nlearns the biologically relevant channel importance without scarifying\nprediction performance.\n","authors":["Daniel Siegismund","Mario Wieser","Stephan Heyse","Stephan Steigele"],"pdf_url":"https://arxiv.org/pdf/2308.16637v1.pdf","comment":"Accepted @ DAGM German Conference on Pattern Recognition (GCPR) 2023"},{"id":"http://arxiv.org/abs/2308.16635v1","updated":"2023-08-31T11:10:28Z","published":"2023-08-31T11:10:28Z","title":"MFR-Net: Multi-faceted Responsive Listening Head Generation via\n Denoising Diffusion Model","summary":" Face-to-face communication is a common scenario including roles of speakers\nand listeners. Most existing research methods focus on producing speaker\nvideos, while the generation of listener heads remains largely overlooked.\nResponsive listening head generation is an important task that aims to model\nface-to-face communication scenarios by generating a listener head video given\na speaker video and a listener head image. An ideal generated responsive\nlistening video should respond to the speaker with attitude or viewpoint\nexpressing while maintaining diversity in interaction patterns and accuracy in\nlistener identity information. To achieve this goal, we propose the\n\\textbf{M}ulti-\\textbf{F}aceted \\textbf{R}esponsive Listening Head Generation\nNetwork (MFR-Net). Specifically, MFR-Net employs the probabilistic denoising\ndiffusion model to predict diverse head pose and expression features. In order\nto perform multi-faceted response to the speaker video, while maintaining\naccurate listener identity preservation, we design the Feature Aggregation\nModule to boost listener identity features and fuse them with other\nspeaker-related features. 
Finally, a renderer finetuned with identity\nconsistency loss produces the final listening head videos. Our extensive\nexperiments demonstrate that MFR-Net not only achieves multi-faceted responses\nin diversity and speaker identity information but also in attitude and\nviewpoint expression.\n","authors":["Jin Liu","Xi Wang","Xiaomeng Fu","Yesheng Chai","Cai Yu","Jiao Dai","Jizhong Han"],"pdf_url":"https://arxiv.org/pdf/2308.16635v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.16633v1","updated":"2023-08-31T11:00:05Z","published":"2023-08-31T11:00:05Z","title":"Semi-Supervised SAR ATR Framework with Transductive Auxiliary\n Segmentation","summary":" Convolutional neural networks (CNNs) have achieved high performance in\nsynthetic aperture radar (SAR) automatic target recognition (ATR). However, the\nperformance of CNNs depends heavily on a large amount of training data. The\ninsufficiency of labeled training SAR images limits the recognition performance\nand even invalidates some ATR methods. Furthermore, under few labeled training\ndata, many existing CNNs are even ineffective. To address these challenges, we\npropose a Semi-supervised SAR ATR Framework with transductive Auxiliary\nSegmentation (SFAS). The proposed framework focuses on exploiting the\ntransductive generalization on available unlabeled samples with an auxiliary\nloss serving as a regularizer. Through auxiliary segmentation of unlabeled SAR\nsamples and information residue loss (IRL) in training, the framework can\nemploy the proposed training loop process and gradually exploit the information\ncompilation of recognition and segmentation to construct a helpful inductive\nbias and achieve high performance. Experiments conducted on the MSTAR dataset\nhave shown the effectiveness of our proposed SFAS for few-shot learning. The\nrecognition performance of 94.18\\% can be achieved under 20 training samples in\neach class with simultaneous accurate segmentation results. Facing variances of\nEOCs, the recognition ratios are higher than 88.00\\% when 10 training samples\neach class.\n","authors":["Chenwei Wang","Xiaoyu Liu","Yulin Huang","Siyi Luo","Jifang Pei","Jianyu Yang","Deqing Mao"],"pdf_url":"https://arxiv.org/pdf/2308.16633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16632v1","updated":"2023-08-31T11:00:03Z","published":"2023-08-31T11:00:03Z","title":"3D-STMN: Dependency-Driven Superpoint-Text Matching Network for\n End-to-End 3D Referring Expression Segmentation","summary":" In 3D Referring Expression Segmentation (3D-RES), the earlier approach adopts\na two-stage paradigm, extracting segmentation proposals and then matching them\nwith referring expressions. However, this conventional paradigm encounters\nsignificant challenges, most notably in terms of the generation of lackluster\ninitial proposals and a pronounced deceleration in inference speed. Recognizing\nthese limitations, we introduce an innovative end-to-end Superpoint-Text\nMatching Network (3D-STMN) that is enriched by dependency-driven insights. One\nof the keystones of our model is the Superpoint-Text Matching (STM) mechanism.\nUnlike traditional methods that navigate through instance proposals, STM\ndirectly correlates linguistic indications with their respective superpoints,\nclusters of semantically related points. 
This architectural decision empowers\nour model to efficiently harness cross-modal semantic relationships, primarily\nleveraging densely annotated superpoint-text pairs, as opposed to the more\nsparse instance-text pairs. In pursuit of enhancing the role of text in guiding\nthe segmentation process, we further incorporate the Dependency-Driven\nInteraction (DDI) module to deepen the network's semantic comprehension of\nreferring expressions. Using the dependency trees as a beacon, this module\ndiscerns the intricate relationships between primary terms and their associated\ndescriptors in expressions, thereby elevating both the localization and\nsegmentation capacities of our model. Comprehensive experiments on the\nScanRefer benchmark reveal that our model not only set new performance\nstandards, registering an mIoU gain of 11.7 points but also achieve a\nstaggering enhancement in inference speed, surpassing traditional methods by\n95.7 times. The code and models are available at\nhttps://github.com/sosppxo/3D-STMN.\n","authors":["Changli Wu","Yiwei Ma","Qi Chen","Haowei Wang","Gen Luo","Jiayi Ji","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16612v1","updated":"2023-08-31T10:19:23Z","published":"2023-08-31T10:19:23Z","title":"Neural Gradient Regularizer","summary":" Owing to its significant success, the prior imposed on gradient maps has\nconsistently been a subject of great interest in the field of image processing.\nTotal variation (TV), one of the most representative regularizers, is known for\nits ability to capture the sparsity of gradient maps. Nonetheless, TV and its\nvariants often underestimate the gradient maps, leading to the weakening of\nedges and details whose gradients should not be zero in the original image.\nRecently, total deep variation (TDV) has been introduced, assuming the sparsity\nof feature maps, which provides a flexible regularization learned from\nlarge-scale datasets for a specific task. However, TDV requires retraining when\nthe image or task changes, limiting its versatility. In this paper, we propose\na neural gradient regularizer (NGR) that expresses the gradient map as the\noutput of a neural network. Unlike existing methods, NGR does not rely on the\nsparsity assumption, thereby avoiding the underestimation of gradient maps. NGR\nis applicable to various image types and different image processing tasks,\nfunctioning in a zero-shot learning fashion, making it a versatile and\nplug-and-play regularizer. Extensive experimental results demonstrate the\nsuperior performance of NGR over state-of-the-art counterparts for a range of\ndifferent tasks, further validating its effectiveness and versatility.\n","authors":["Shuang Xu","Yifan Wang","Zixiang Zhao","Jiangjun Peng","Xiangyong Cao","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16611v1","updated":"2023-08-31T10:16:59Z","published":"2023-08-31T10:16:59Z","title":"Detecting Out-of-Context Image-Caption Pairs in News: A\n Counter-Intuitive Method","summary":" The growth of misinformation and re-contextualized media in social media and\nnews leads to an increasing need for fact-checking methods. Concurrently, the\nadvancement in generative models makes cheapfakes and deepfakes both easier to\nmake and harder to detect. 
In this paper, we present a novel approach using\ngenerative image models to our advantage for detecting Out-of-Context (OOC) use\nof images-caption pairs in news. We present two new datasets with a total of\n$6800$ images generated using two different generative models including (1)\nDALL-E 2, and (2) Stable-Diffusion. We are confident that the method proposed\nin this paper can further research on generative models in the field of\ncheapfake detection, and that the resulting datasets can be used to train and\nevaluate new models aimed at detecting cheapfakes. We run a preliminary\nqualitative and quantitative analysis to evaluate the performance of each image\ngeneration model for this task, and evaluate a handful of methods for computing\nimage similarity.\n","authors":["Eivind Moholdt","Sohail Ahmed Khan","Duc-Tien Dang-Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.16611v1.pdf","comment":"ACM International Conference on Content-Based Multimedia Indexing\n (CBMI '23)"},{"id":"http://arxiv.org/abs/2303.05800v2","updated":"2023-08-31T10:09:06Z","published":"2023-03-10T09:09:37Z","title":"Enhancing the accuracies by performing pooling decisions adjacent to the\n output layer","summary":" Learning classification tasks of (2^nx2^n) inputs typically consist of \\le n\n(2x2) max-pooling (MP) operators along the entire feedforward deep\narchitecture. Here we show, using the CIFAR-10 database, that pooling decisions\nadjacent to the last convolutional layer significantly enhance accuracies. In\nparticular, average accuracies of the advanced-VGG with m layers (A-VGGm)\narchitectures are 0.936, 0.940, 0.954, 0.955, and 0.955 for m=6, 8, 14, 13, and\n16, respectively. The results indicate A-VGG8s' accuracy is superior to\nVGG16s', and that the accuracies of A-VGG13 and A-VGG16 are equal, and\ncomparable to that of Wide-ResNet16. In addition, replacing the three fully\nconnected (FC) layers with one FC layer, A-VGG6 and A-VGG14, or with several\nlinear activation FC layers, yielded similar accuracies. These significantly\nenhanced accuracies stem from training the most influential input-output\nroutes, in comparison to the inferior routes selected following multiple MP\ndecisions along the deep architecture. In addition, accuracies are sensitive to\nthe order of the non-commutative MP and average pooling operators adjacent to\nthe output layer, varying the number and location of training routes. The\nresults call for the reexamination of previously proposed deep architectures\nand their accuracies by utilizing the proposed pooling strategy adjacent to the\noutput layer.\n","authors":["Yuval Meir","Yarden Tzach","Ronit D. Gross","Ofek Tevet","Roni Vardi","Ido Kanter"],"pdf_url":"https://arxiv.org/pdf/2303.05800v2.pdf","comment":"29 pages, 3 figures, 1 table, and Supplementary Information"},{"id":"http://arxiv.org/abs/2308.16598v1","updated":"2023-08-31T09:57:27Z","published":"2023-08-31T09:57:27Z","title":"Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation","summary":" Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential\nrole in the early diagnosis and treatment of liver cancer. Deep learning models\nbackboned by fully convolutional neural networks (FCNNs) have become the\ndominant model for segmenting 3D computerized tomography (CT) scans. However,\nsince their convolution layers suffer from limited kernel size, they are not\nable to capture long-range dependencies and global context. 
To tackle this\nrestriction, vision transformers have been introduced to solve FCNN's locality\nof receptive fields. Although transformers can capture long-range features,\ntheir segmentation performance decreases with various tumor sizes due to the\nmodel sensitivity to the input patch size. While finding an optimal patch size\nimproves the performance of vision transformer-based models on segmentation\ntasks, it is a time-consuming and challenging procedure. This paper proposes a\ntechnique to select the vision transformer's optimal input multi-resolution\nimage patch size based on the average volume size of metastasis lesions. We\nfurther validated our suggested framework using a transfer-learning technique,\ndemonstrating that the highest Dice similarity coefficient (DSC) performance\nwas obtained by pre-training on training data with a larger tumour volume using\nthe suggested ideal patch size and then training with a smaller one. We\nexperimentally evaluate this idea through pre-training our model on a\nmulti-resolution public dataset. Our model showed consistent and improved\nresults when applied to our private multi-resolution mCRC dataset with a\nsmaller average tumor volume. This study lays the groundwork for optimizing\nsemantic segmentation of small objects using vision transformers. The\nimplementation source code is available\nat:https://github.com/Ramtin-Mojtahedi/OVTPS.\n","authors":["Ramtin Mojtahedi","Mohammad Hamghalam","Richard K. G. Do","Amber L. Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.16598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08396v4","updated":"2023-08-31T09:43:37Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with a\nnominal memory and computational burden. The inclusion of multi-axis\nself-attention, within each decoder stage, significantly enhances the\ndiscriminating capacity between the object and background regions, thereby\nhelping in improving the segmentation efficiency. In the Hybrid Decoder block,\nthe fusion process commences by integrating the upsampled lower-level decoder\nfeatures, obtained through transpose convolution, with the skip-connection\nfeatures derived from the hybrid encoder. Subsequently, the fused features\nundergo refinement through the utilization of a multi-axis attention mechanism.\nThe proposed decoder block is repeated multiple times to progressively segment\nthe nuclei regions. Experimental results on MoNuSeg18 and MoNuSAC20 dataset\ndemonstrates the effectiveness of the proposed technique. Our MaxViT-UNet\noutperformed the previous CNN-based (UNet) and Transformer-based (Swin-UNet)\ntechniques by a considerable margin on both of the standard datasets. 
The\nfollowing github (https://github.com/PRLAB21/MaxViT-UNet) contains the\nimplementation and trained weights.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v4.pdf","comment":"17 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.12751v2","updated":"2023-08-31T09:42:09Z","published":"2023-07-24T12:42:45Z","title":"ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised\n Real-world Single Image Super-Resolution","summary":" Single image super-resolution (SISR) is a challenging ill-posed problem that\naims to up-sample a given low-resolution (LR) image to a high-resolution (HR)\ncounterpart. Due to the difficulty in obtaining real LR-HR training pairs,\nrecent approaches are trained on simulated LR images degraded by simplified\ndown-sampling operators, e.g., bicubic. Such an approach can be problematic in\npractice because of the large gap between the synthesized and real-world LR\nimages. To alleviate the issue, we propose a novel Invertible scale-Conditional\nFunction (ICF), which can scale an input image and then restore the original\ninput with different scale conditions. By leveraging the proposed ICF, we\nconstruct a novel self-supervised SISR framework (ICF-SRSR) to handle the\nreal-world SR task without using any paired/unpaired training data.\nFurthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs,\nwhich can make existing supervised SISR networks more robust. Extensive\nexperiments demonstrate the effectiveness of the proposed method in handling\nSISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior\nperformance compared to the existing methods trained on synthetic paired images\nin real-world scenarios and exhibits comparable performance compared to\nstate-of-the-art supervised/unsupervised methods on public benchmark datasets.\n","authors":["Reyhaneh Neshatavar","Mohsen Yavartanoo","Sanghyun Son","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2307.12751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16582v1","updated":"2023-08-31T09:27:56Z","published":"2023-08-31T09:27:56Z","title":"Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size\n HD Images","summary":" Stable diffusion, a generative model used in text-to-image synthesis,\nfrequently encounters resolution-induced composition problems when generating\nimages of varying sizes. This issue primarily stems from the model being\ntrained on pairs of single-scale images and their corresponding text\ndescriptions. Moreover, direct training on images of unlimited sizes is\nunfeasible, as it would require an immense number of text-image pairs and\nentail substantial computational expenses. To overcome these challenges, we\npropose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to\nefficiently generate well-composed images of any size, while minimizing the\nneed for high-memory GPU resources. Specifically, the initial stage, dubbed Any\nRatio Adaptability Diffusion (ARAD), leverages a selected set of images with a\nrestricted range of ratios to optimize the text-conditional diffusion model,\nthereby improving its ability to adjust composition to accommodate diverse\nimage sizes. To support the creation of images at any desired size, we further\nintroduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the\nsubsequent stage. This method allows for the rapid enlargement of the ASD\noutput to any high-resolution size, avoiding seaming artifacts or memory\noverloads. 
Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks\ndemonstrate that ASD can produce well-structured images of arbitrary sizes,\ncutting down the inference time by 2x compared to the traditional tiled\nalgorithm.\n","authors":["Qingping Zheng","Yuanfan Guo","Jiankang Deng","Jianhua Han","Ying Li","Songcen Xu","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.16582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12319v2","updated":"2023-08-31T09:26:57Z","published":"2023-08-23T11:31:38Z","title":"RemovalNet: DNN Fingerprint Removal Attacks","summary":" With the performance of deep neural networks (DNNs) remarkably improving,\nDNNs have been widely used in many areas. Consequently, the DNN model has\nbecome a valuable asset, and its intellectual property is safeguarded by\nownership verification techniques (e.g., DNN fingerprinting). However, the\nfeasibility of the DNN fingerprint removal attack and its potential influence\nremains an open problem. In this paper, we perform the first comprehensive\ninvestigation of DNN fingerprint removal attacks. Generally, the knowledge\ncontained in a DNN model can be categorized into general semantic and\nfingerprint-specific knowledge. To this end, we propose a min-max bilevel\noptimization-based DNN fingerprint removal attack named RemovalNet, to evade\nmodel ownership verification. The lower-level optimization is designed to\nremove fingerprint-specific knowledge. While in the upper-level optimization,\nwe distill the victim model's general semantic knowledge to maintain the\nsurrogate model's performance. We conduct extensive experiments to evaluate the\nfidelity, effectiveness, and efficiency of the RemovalNet against four advanced\ndefense methods on six metrics. The empirical results demonstrate that (1) the\nRemovalNet is effective. After our DNN fingerprint removal attack, the model\ndistance between the target and surrogate models is x100 times higher than that\nof the baseline attacks, (2) the RemovalNet is efficient. It uses only 0.2%\n(400 samples) of the substitute dataset and 1,000 iterations to conduct our\nattack. Besides, compared with advanced model stealing attacks, the RemovalNet\nsaves nearly 85% of computational resources at most, (3) the RemovalNet\nachieves high fidelity that the created surrogate model maintains high accuracy\nafter the DNN fingerprint removal process. Our code is available at:\nhttps://github.com/grasses/RemovalNet.\n","authors":["Hongwei Yao","Zheng Li","Kunzhe Huang","Jian Lou","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2308.12319v2.pdf","comment":"some mistake"},{"id":"http://arxiv.org/abs/2308.16576v1","updated":"2023-08-31T09:19:06Z","published":"2023-08-31T09:19:06Z","title":"GHuNeRF: Generalizable Human NeRF from a Monocular Video","summary":" In this paper, we tackle the challenging task of learning a generalizable\nhuman NeRF model from a monocular video. Although existing generalizable human\nNeRFs have achieved impressive results, they require muti-view images or videos\nwhich might not be always available. On the other hand, some works on\nfree-viewpoint rendering of human from monocular videos cannot be generalized\nto unseen identities. In view of these limitations, we propose GHuNeRF to learn\na generalizable human NeRF model from a monocular video of the human performer.\nWe first introduce a visibility-aware aggregation scheme to compute vertex-wise\nfeatures, which is used to construct a 3D feature volume. 
The feature volume\ncan only represent the overall geometry of the human performer with\ninsufficient accuracy due to the limited resolution. To solve this, we further\nenhance the volume feature with temporally aligned point-wise features using an\nattention mechanism. Finally, the enhanced feature is used for predicting\ndensity and color for each sampled point. A surface-guided sampling strategy is\nalso introduced to improve the efficiency for both training and inference. We\nvalidate our approach on the widely-used ZJU-MoCap dataset, where we achieve\ncomparable performance with existing multi-view video based approaches. We also\ntest on the monocular People-Snapshot dataset and achieve better performance\nthan existing works when only monocular video is used.\n","authors":["Chen Li","Jihao Lin","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2308.16576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16573v1","updated":"2023-08-31T09:13:34Z","published":"2023-08-31T09:13:34Z","title":"Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for\n Semi-Supervised Medical Image Segmentation","summary":" Medical image segmentation methods often rely on fully supervised approaches\nto achieve excellent performance, which is contingent upon having an extensive\nset of labeled images for training. However, annotating medical images is both\nexpensive and time-consuming. Semi-supervised learning offers a solution by\nleveraging numerous unlabeled images alongside a limited set of annotated ones.\nIn this paper, we introduce a semi-supervised medical image segmentation method\nbased on the mean-teacher model, referred to as Dual-Decoder Consistency via\nPseudo-Labels Guided Data Augmentation (DCPA). This method combines consistency\nregularization, pseudo-labels, and data augmentation to enhance the efficacy of\nsemi-supervised segmentation. Firstly, the proposed model comprises both\nstudent and teacher models with a shared encoder and two distinct decoders\nemploying different up-sampling strategies. Minimizing the output discrepancy\nbetween decoders enforces the generation of consistent representations, serving\nas regularization during student model training. Secondly, we introduce mixup\noperations to blend unlabeled data with labeled data, creating mixed data and\nthereby achieving data augmentation. Lastly, pseudo-labels are generated by the\nteacher model and utilized as labels for mixed data to compute unsupervised\nloss. We compare the segmentation results of the DCPA model with six\nstate-of-the-art semi-supervised methods on three publicly available medical\ndatasets. Beyond classical 10\\% and 20\\% semi-supervised settings, we\ninvestigate performance with less supervision (5\\% labeled data). Experimental\noutcomes demonstrate that our approach consistently outperforms existing\nsemi-supervised medical image segmentation methods across the three\nsemi-supervised settings.\n","authors":["Yuanbin Chen","Tao Wang","Hui Tang","Longxuan Zhao","Ruige Zong","Tao Tan","Xinlin Zhang","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2308.16573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16572v1","updated":"2023-08-31T09:13:30Z","published":"2023-08-31T09:13:30Z","title":"CL-MAE: Curriculum-Learned Masked Autoencoders","summary":" Masked image modeling has been demonstrated as a powerful pretext task for\ngenerating robust representations that can be effectively generalized across\nmultiple downstream tasks. 
Typically, this approach involves randomly masking\npatches (tokens) in input images, with the masking strategy remaining unchanged\nduring training. In this paper, we propose a curriculum learning approach that\nupdates the masking strategy to continually increase the complexity of the\nself-supervised reconstruction task. We conjecture that, by gradually\nincreasing the task complexity, the model can learn more sophisticated and\ntransferable representations. To facilitate this, we introduce a novel\nlearnable masking module that possesses the capability to generate masks of\ndifferent complexities, and integrate the proposed module into masked\nautoencoders (MAE). Our module is jointly trained with the MAE, while adjusting\nits behavior during training, transitioning from a partner to the MAE\n(optimizing the same reconstruction loss) to an adversary (optimizing the\nopposite loss), while passing through a neutral state. The transition between\nthese behaviors is smooth, being regulated by a factor that is multiplied with\nthe reconstruction loss of the masking module. The resulting training procedure\ngenerates an easy-to-hard curriculum. We train our Curriculum-Learned Masked\nAutoencoder (CL-MAE) on ImageNet and show that it exhibits superior\nrepresentation learning capabilities compared to MAE. The empirical results on\nfive downstream tasks confirm our conjecture, demonstrating that curriculum\nlearning can be successfully used to self-supervise masked autoencoders.\n","authors":["Neelu Madan","Nicolae-Catalin Ristea","Kamal Nasrollahi","Thomas B. Moeslund","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2308.16572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16571v1","updated":"2023-08-31T09:12:34Z","published":"2023-08-31T09:12:34Z","title":"Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based\n Approach","summary":" In the rapidly evolving digital era, the analysis of document layouts plays a\npivotal role in automated information extraction and interpretation. In our\nwork, we have trained MViTv2 transformer model architecture with cascaded mask\nR-CNN on BaDLAD dataset to extract text box, paragraphs, images and tables from\na document. After training on 20365 document images for 36 epochs in a 3 phase\ncycle, we achieved a training loss of 0.2125 and a mask loss of 0.19. Our work\nextends beyond training, delving into the exploration of potential enhancement\navenues. We investigate the impact of rotation and flip augmentation, the\neffectiveness of slicing input images pre-inference, the implications of\nvarying the resolution of the transformer backbone, and the potential of\nemploying a dual-pass inference to uncover missed text-boxes. Through these\nexplorations, we observe a spectrum of outcomes, where some modifications\nresult in tangible performance improvements, while others offer unique insights\nfor future endeavors.\n","authors":["Ashrafur Rahman Khan","Asif Azad"],"pdf_url":"https://arxiv.org/pdf/2308.16571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05970v2","updated":"2023-08-31T09:11:22Z","published":"2023-05-10T08:28:51Z","title":"FusionBooster: A Unified Image Fusion Boosting Paradigm","summary":" In recent years, numerous ideas have emerged for designing a mutually\nreinforcing mechanism or extra stages for the image fusion task, ignoring the\ninevitable gaps between different vision tasks and the computational burden. 
We\nargue that there is a scope to improve the fusion performance with the help of\nthe FusionBooster, a model specifically designed for the fusion task. In\nparticular, our booster is based on the divide-and-conquer strategy controlled\nby an information probe. The booster is composed of three building blocks: the\nprobe units, the booster layer, and the assembling module. Given the result\nproduced by a backbone method, the probe units assess the fused image and\ndivide the results according to their information content. This is instrumental\nin identifying missing information, as a step to its recovery. The recovery of\nthe degraded components along with the fusion guidance are the role of the\nbooster layer. Lastly, the assembling module is responsible for piecing these\nadvanced components together to deliver the output. We use concise\nreconstruction loss functions in conjunction with lightweight autoencoder\nmodels to formulate the learning task, with marginal computational complexity\nincrease. The experimental results obtained in various fusion tasks, as well as\ndownstream detection tasks, consistently demonstrate that the proposed\nFusionBooster significantly improves the performance. Our code will be publicly\navailable on the project homepage.\n","authors":["Chunyang Cheng","Tianyang Xu","Xiao-Jun Wu","Hui Li","Xi Li","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2305.05970v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2308.16568v1","updated":"2023-08-31T09:02:53Z","published":"2023-08-31T09:02:53Z","title":"Shape of my heart: Cardiac models through learned signed distance\n functions","summary":" The efficient construction of an anatomical model is one of the major\nchallenges of patient-specific in-silico models of the human heart. Current\nmethods frequently rely on linear statistical models, allowing no advanced\ntopological changes, or requiring medical image segmentation followed by a\nmeshing pipeline, which strongly depends on image resolution, quality, and\nmodality. These approaches are therefore limited in their transferability to\nother imaging domains. In this work, the cardiac shape is reconstructed by\nmeans of three-dimensional deep signed distance functions with Lipschitz\nregularity. For this purpose, the shapes of cardiac MRI reconstructions are\nlearned from public databases to model the spatial relation of multiple\nchambers in Cartesian space. We demonstrate that this approach is also capable\nof reconstructing anatomical models from partial data, such as point clouds\nfrom a single ventricle, or modalities different from the trained MRI, such as\nelectroanatomical mapping, and in addition, allows us to generate new\nanatomical shapes by randomly sampling latent vectors.\n","authors":["Jan Verhülsdonk","Thomas Grandits","Francisco Sahli Costabal","Rolf Krause","Angelo Auricchio","Gundolf Haase","Simone Pezzuto","Alexander Effland"],"pdf_url":"https://arxiv.org/pdf/2308.16568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16567v1","updated":"2023-08-31T09:01:45Z","published":"2023-08-31T09:01:45Z","title":"ScrollNet: Dynamic Weight Importance for Continual Learning","summary":" The principle underlying most existing continual learning (CL) methods is to\nprioritize stability by penalizing changes in parameters crucial to old tasks,\nwhile allowing for plasticity in other parameters. 
The importance of weights\nfor each task can be determined either explicitly through learning a\ntask-specific mask during training (e.g., parameter isolation-based approaches)\nor implicitly by introducing a regularization term (e.g., regularization-based\napproaches). However, all these methods assume that the importance of weights\nfor each task is unknown prior to data exposure. In this paper, we propose\nScrollNet as a scrolling neural network for continual learning. ScrollNet can\nbe seen as a dynamic network that assigns the ranking of weight importance for\neach task before data exposure, thus achieving a more favorable\nstability-plasticity tradeoff during sequential task learning by reassigning\nthis ranking for different tasks. Additionally, we demonstrate that ScrollNet\ncan be combined with various CL methods, including regularization-based and\nreplay-based approaches. Experimental results on CIFAR100 and TinyImagenet\ndatasets show the effectiveness of our proposed method. We release our code at\nhttps://github.com/FireFYF/ScrollNet.git.\n","authors":["Fei Yang","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2308.16567v1.pdf","comment":"Accepted at Visual Continual Learning workshop (ICCV2023)"},{"id":"http://arxiv.org/abs/2303.00262v2","updated":"2023-08-31T09:01:35Z","published":"2023-03-01T06:35:42Z","title":"Collage Diffusion","summary":" We seek to give users precise control over diffusion-based image generation\nby modeling complex scenes as sequences of layers, which define the desired\nspatial arrangement and visual attributes of objects in the scene. Collage\nDiffusion harmonizes the input layers to make objects fit together -- the key\nchallenge involves minimizing changes in the positions and key visual\nattributes of the input layers while allowing other attributes to change in the\nharmonization process. We ensure that objects are generated in the correct\nlocations by modifying text-image cross-attention with the layers' alpha masks.\nWe preserve key visual attributes of input layers by learning specialized text\nrepresentations per layer and by extending ControlNet to operate on layers.\nLayer input allows users to control the extent of image harmonization on a\nper-object basis, and users can even iteratively edit individual objects in\ngenerated images while keeping other objects fixed. By leveraging the rich\ninformation present in layer input, Collage Diffusion generates globally\nharmonized images that maintain desired object characteristics better than\nprior approaches.\n","authors":["Vishnu Sarukkai","Linden Li","Arden Ma","Christopher Ré","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2303.00262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16561v1","updated":"2023-08-31T08:54:59Z","published":"2023-08-31T08:54:59Z","title":"MoMA: Momentum Contrastive Learning with Multi-head Attention-based\n Knowledge Distillation for Histopathology Image Analysis","summary":" There is no doubt that advanced artificial intelligence models and high\nquality data are the keys to success in developing computational pathology\ntools. Although the overall volume of pathology data keeps increasing, a lack\nof quality data is a common issue when it comes to a specific task due to\nseveral reasons including privacy and ethical issues with patient data. In this\nwork, we propose to exploit knowledge distillation, i.e., utilize the existing\nmodel to learn a new, target model, to overcome such issues in computational\npathology. 
Specifically, we employ a student-teacher framework to learn a\ntarget model from a pre-trained, teacher model without direct access to source\ndata and distill relevant knowledge via momentum contrastive learning with\nmulti-head attention mechanism, which provides consistent and context-aware\nfeature representations. This enables the target model to assimilate\ninformative representations of the teacher model while seamlessly adapting to\nthe unique nuances of the target data. The proposed method is rigorously\nevaluated across different scenarios where the teacher model was trained on the\nsame, relevant, and irrelevant classification tasks with the target model.\nExperimental results demonstrate the accuracy and robustness of our approach in\ntransferring knowledge to different domains and tasks, outperforming other\nrelated methods. Moreover, the results provide a guideline on the learning\nstrategy for different types of tasks and scenarios in computational pathology.\nCode is available at: \\url{https://github.com/trinhvg/MoMA}.\n","authors":["Trinh Thi Le Vuong","Jin Tae Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.16561v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.16555v1","updated":"2023-08-31T08:46:12Z","published":"2023-08-31T08:46:12Z","title":"E3CM: Epipolar-Constrained Cascade Correspondence Matching","summary":" Accurate and robust correspondence matching is of utmost importance for\nvarious 3D computer vision tasks. However, traditional explicit\nprogramming-based methods often struggle to handle challenging scenarios, and\ndeep learning-based methods require large well-labeled datasets for network\ntraining. In this article, we introduce Epipolar-Constrained Cascade\nCorrespondence (E3CM), a novel approach that addresses these limitations.\nUnlike traditional methods, E3CM leverages pre-trained convolutional neural\nnetworks to match correspondence, without requiring annotated data for any\nnetwork training or fine-tuning. Our method utilizes epipolar constraints to\nguide the matching process and incorporates a cascade structure for progressive\nrefinement of matches. We extensively evaluate the performance of E3CM through\ncomprehensive experiments and demonstrate its superiority over existing\nmethods. To promote further research and facilitate reproducibility, we make\nour source code publicly available at https://mias.group/E3CM.\n","authors":["Chenbo Zhou","Shuai Su","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2308.16555v1.pdf","comment":"accepted to Neurocomputing"},{"id":"http://arxiv.org/abs/2308.16552v1","updated":"2023-08-31T08:43:52Z","published":"2023-08-31T08:43:52Z","title":"Prompt-enhanced Hierarchical Transformer Elevating Cardiopulmonary\n Resuscitation Instruction via Temporal Action Segmentation","summary":" The vast majority of people who suffer unexpected cardiac arrest are\nperformed cardiopulmonary resuscitation (CPR) by passersby in a desperate\nattempt to restore life, but endeavors turn out to be fruitless on account of\ndisqualification. Fortunately, many pieces of research manifest that\ndisciplined training will help to elevate the success rate of resuscitation,\nwhich constantly desires a seamless combination of novel techniques to yield\nfurther advancement. 
To this end, we collect a custom CPR video dataset in\nwhich trainees make efforts to behave resuscitation on mannequins independently\nin adherence to approved guidelines, thereby devising an auxiliary toolbox to\nassist supervision and rectification of intermediate potential issues via\nmodern deep learning methodologies. Our research empirically views this problem\nas a temporal action segmentation (TAS) task in computer vision, which aims to\nsegment an untrimmed video at a frame-wise level. Here, we propose a\nPrompt-enhanced hierarchical Transformer (PhiTrans) that integrates three\nindispensable modules, including a textual prompt-based Video Features\nExtractor (VFE), a transformer-based Action Segmentation Executor (ASE), and a\nregression-based Prediction Refinement Calibrator (PRC). The backbone of the\nmodel preferentially derives from applications in three approved public\ndatasets (GTEA, 50Salads, and Breakfast) collected for TAS tasks, which\naccounts for the excavation of the segmentation pipeline on the CPR dataset. In\ngeneral, we unprecedentedly probe into a feasible pipeline that genuinely\nelevates the CPR instruction qualification via action segmentation in\nconjunction with cutting-edge deep learning techniques. Associated experiments\nadvocate our implementation with multiple metrics surpassing 91.0%.\n","authors":["Yang Liu","Xiaoyun Zhong","Shiyao Zhai","Zhicheng Du","Zhenyuan Gao","Qiming Huang","Canyang Zhang","Bin Jiang","Vijay Kumar Pandey","Sanyang Han","Runming Wang","Yuxing Han","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2308.16552v1.pdf","comment":"Transformer for Cardiopulmonary Resuscitation"},{"id":"http://arxiv.org/abs/2308.16551v1","updated":"2023-08-31T08:43:21Z","published":"2023-08-31T08:43:21Z","title":"Object Detection for Caries or Pit and Fissure Sealing Requirement in\n Children's First Permanent Molars","summary":" Dental caries is one of the most common oral diseases that, if left\nuntreated, can lead to a variety of oral problems. It mainly occurs inside the\npits and fissures on the occlusal/buccal/palatal surfaces of molars and\nchildren are a high-risk group for pit and fissure caries in permanent molars.\nPit and fissure sealing is one of the most effective methods that is widely\nused in prevention of pit and fissure caries. However, current detection of\npits and fissures or caries depends primarily on the experienced dentists,\nwhich ordinary parents do not have, and children may miss the remedial\ntreatment without timely detection. To address this issue, we present a method\nto autodetect caries and pit and fissure sealing requirements using oral photos\ntaken by smartphones. We use the YOLOv5 and YOLOX models and adopt a tiling\nstrategy to reduce information loss during image pre-processing. The best\nresult for YOLOXs model with tiling strategy is 72.3 mAP.5, while the best\nresult without tiling strategy is 71.2. YOLOv5s6 model with/without tiling\nattains 70.9/67.9 mAP.5, respectively. 
We deploy the pre-trained network to\nmobile devices as a WeChat applet, allowing in-home detection by parents or\nchildren guardian.\n","authors":["Chenyao Jiang","Shiyao Zhai","Hengrui Song","Yuqing Ma","Yachen Fan","Yancheng Fang","Dongmei Yu","Canyang Zhang","Sanyang Han","Runming Wang","Yong Liu","Jianbo Li","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2308.16551v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05102v2","updated":"2023-08-31T08:43:17Z","published":"2023-03-09T08:21:50Z","title":"StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent\n Disentangled Space","summary":" One major challenge in machine learning applications is coping with\nmismatches between the datasets used in the development and those obtained in\nreal-world applications. These mismatches may lead to inaccurate predictions\nand errors, resulting in poor product quality and unreliable systems. In this\nstudy, we propose StyleDiff to inform developers of the differences between the\ntwo datasets for the steady development of machine learning systems. Using\ndisentangled image spaces obtained from recently proposed generative models,\nStyleDiff compares the two datasets by focusing on attributes in the images and\nprovides an easy-to-understand analysis of the differences between the\ndatasets. The proposed StyleDiff performs in $O (d N\\log N)$, where $N$ is the\nsize of the datasets and $d$ is the number of attributes, enabling the\napplication to large datasets. We demonstrate that StyleDiff accurately detects\ndifferences between datasets and presents them in an understandable format\nusing, for example, driving scenes datasets.\n","authors":["Keisuke Kawano","Takuro Kutsuna","Ryoko Tokuhisa","Akihiro Nakamura","Yasushi Esaki"],"pdf_url":"https://arxiv.org/pdf/2303.05102v2.pdf","comment":"25 pages, 17 figures, Image and Vision Computing"},{"id":"http://arxiv.org/abs/2308.16532v1","updated":"2023-08-31T08:21:29Z","published":"2023-08-31T08:21:29Z","title":"Decoupled Local Aggregation for Point Cloud Learning","summary":" The unstructured nature of point clouds demands that local aggregation be\nadaptive to different local structures. Previous methods meet this by\nexplicitly embedding spatial relations into each aggregation process. Although\nthis coupled approach has been shown effective in generating clear semantics,\naggregation can be greatly slowed down due to repeated relation learning and\nredundant computation to mix directional and point features. In this work, we\npropose to decouple the explicit modelling of spatial relations from local\naggregation. We theoretically prove that basic neighbor pooling operations can\ntoo function without loss of clarity in feature fusion, so long as essential\nspatial information has been encoded in point features. As an instantiation of\ndecoupled local aggregation, we present DeLA, a lightweight point network,\nwhere in each learning stage relative spatial encodings are first formed, and\nonly pointwise convolutions plus edge max-pooling are used for local\naggregation then. Further, a regularization term is employed to reduce\npotential ambiguity through the prediction of relative coordinates.\nConceptually simple though, experimental results on five classic benchmarks\ndemonstrate that DeLA achieves state-of-the-art performance with reduced or\ncomparable latency. Specifically, DeLA achieves over 90\\% overall accuracy on\nScanObjectNN and 74\\% mIoU on S3DIS Area 5. 
Our code is available at\nhttps://github.com/Matrix-ASC/DeLA .\n","authors":["Binjie Chen","Yunzhou Xia","Yu Zang","Cheng Wang","Jonathan Li"],"pdf_url":"https://arxiv.org/pdf/2308.16532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16530v1","updated":"2023-08-31T08:21:09Z","published":"2023-08-31T08:21:09Z","title":"Privacy-Preserving Medical Image Classification through Deep Learning\n and Matrix Decomposition","summary":" Deep learning (DL)-based solutions have been extensively researched in the\nmedical domain in recent years, enhancing the efficacy of diagnosis, planning,\nand treatment. Since the usage of health-related data is strictly regulated,\nprocessing medical records outside the hospital environment for developing and\nusing DL models demands robust data protection measures. At the same time, it\ncan be challenging to guarantee that a DL solution delivers a minimum level of\nperformance when being trained on secured data, without being specifically\ndesigned for the given task. Our approach uses singular value decomposition\n(SVD) and principal component analysis (PCA) to obfuscate the medical images\nbefore employing them in the DL analysis. The capability of DL algorithms to\nextract relevant information from secured data is assessed on a task of\nangiographic view classification based on obfuscated frames. The security level\nis probed by simulated artificial intelligence (AI)-based reconstruction\nattacks, considering two threat actors with different prior knowledge of the\ntargeted data. The degree of privacy is quantitatively measured using\nsimilarity indices. Although a trade-off between privacy and accuracy should be\nconsidered, the proposed technique allows for training the angiographic view\nclassifier exclusively on secured data with satisfactory performance and with\nno computational overhead, model adaptation, or hyperparameter tuning. While\nthe obfuscated medical image content is well protected against human\nperception, the hypothetical reconstruction attack proved that it is also\ndifficult to recover the complete information of the original frames.\n","authors":["Andreea Bianca Popescu","Cosmin Ioan Nita","Ioana Antonia Taca","Anamaria Vizitiu","Lucian Mihai Itu"],"pdf_url":"https://arxiv.org/pdf/2308.16530v1.pdf","comment":"6 pages, 9 figures, Published in: 2023 31st Mediterranean Conference\n on Control and Automation (MED)"},{"id":"http://arxiv.org/abs/2308.16528v1","updated":"2023-08-31T08:19:26Z","published":"2023-08-31T08:19:26Z","title":"SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded\n Objects","summary":" To enable meaningful robotic manipulation of objects in the real-world, 6D\npose estimation is one of the critical aspects. Most existing approaches have\ndifficulties to extend predictions to scenarios where novel object instances\nare continuously introduced, especially with heavy occlusions. In this work, we\npropose a few-shot pose estimation (FSPE) approach called SA6D, which uses a\nself-adaptive segmentation module to identify the novel target object and\nconstruct a point cloud model of the target object using only a small number of\ncluttered reference images. Unlike existing methods, SA6D does not require\nobject-centric reference images or any additional object information, making it\na more generalizable and scalable solution across categories. 
We evaluate SA6D\non real-world tabletop object datasets and demonstrate that SA6D outperforms\nexisting FSPE methods, particularly in cluttered scenes with occlusions, while\nrequiring fewer reference images.\n","authors":["Ning Gao","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08566v2","updated":"2023-08-31T08:17:57Z","published":"2023-03-15T12:34:24Z","title":"Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning","summary":" Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful\nalternative for full fine-tuning so as to adapt pre-trained vision models to\ndownstream tasks, which only tunes a small number of parameters while freezing\nthe vast majority ones to ease storage burden and optimization difficulty.\nHowever, existing PEFT methods introduce trainable parameters to the same\npositions across different tasks depending solely on human heuristics and\nneglect the domain gaps. To this end, we study where to introduce and how to\nallocate trainable parameters by proposing a novel Sensitivity-aware visual\nParameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates\ntrainable parameters to task-specific important positions given a desired\ntunable parameter budget. Specifically, our SPT first quickly identifies the\nsensitive parameters that require tuning for a given task in a data-dependent\nway. Next, our SPT further boosts the representational capability for the\nweight matrices whose number of sensitive parameters exceeds a pre-defined\nthreshold by utilizing existing structured tuning methods, e.g., LoRA [23] or\nAdapter [22], to replace directly tuning the selected sensitive parameters\n(unstructured tuning) under the budget. Extensive experiments on a wide range\nof downstream recognition tasks show that our SPT is complementary to the\nexisting PEFT methods and largely boosts their performance, e.g., SPT improves\nAdapter with supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean\nTop-1 accuracy, reaching SOTA performance on FGVC and VTAB-1k benchmarks,\nrespectively. Source code is at https://github.com/ziplab/SPT\n","authors":["Haoyu He","Jianfei Cai","Jing Zhang","Dacheng Tao","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2303.08566v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2308.16527v1","updated":"2023-08-31T08:17:29Z","published":"2023-08-31T08:17:29Z","title":"Unsupervised Recognition of Unknown Objects for Open-World Object\n Detection","summary":" Open-World Object Detection (OWOD) extends object detection problem to a\nrealistic and dynamic scenario, where a detection model is required to be\ncapable of detecting both known and unknown objects and incrementally learning\nnewly introduced knowledge. Current OWOD models, such as ORE and OW-DETR, focus\non pseudo-labeling regions with high objectness scores as unknowns, whose\nperformance relies heavily on the supervision of known objects. While they can\ndetect the unknowns that exhibit similar features to the known objects, they\nsuffer from a severe label bias problem that they tend to detect all regions\n(including unknown object regions) that are dissimilar to the known objects as\npart of the background. To eliminate the label bias, this paper proposes a\nnovel approach that learns an unsupervised discriminative model to recognize\ntrue unknown objects from raw pseudo labels generated by unsupervised region\nproposal methods. 
The resulting model can be further refined by a\nclassification-free self-training method which iteratively extends pseudo\nunknown objects to the unlabeled regions. Experimental results show that our\nmethod 1) significantly outperforms the prior SOTA in detecting unknown objects\nwhile maintaining competitive performance of detecting known object classes on\nthe MS COCO dataset, and 2) achieves better generalization ability on the LVIS\nand Objects365 datasets.\n","authors":["Ruohuan Fang","Guansong Pang","Lei Zhou","Xiao Bai","Jin Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.16527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15816v2","updated":"2023-08-31T08:14:17Z","published":"2023-08-30T07:41:26Z","title":"Improving Underwater Visual Tracking With a Large Scale Dataset and\n Image Enhancement","summary":" This paper presents a new dataset and general tracker enhancement method for\nUnderwater Visual Object Tracking (UVOT). Despite its significance, underwater\ntracking has remained unexplored due to data inaccessibility. It poses distinct\nchallenges; the underwater environment exhibits non-uniform lighting\nconditions, low visibility, lack of sharpness, low contrast, camouflage, and\nreflections from suspended particles. Performance of traditional tracking\nmethods designed primarily for terrestrial or open-air scenarios drops in such\nconditions. We address the problem by proposing a novel underwater image\nenhancement algorithm designed specifically to boost tracking quality. The\nmethod has resulted in a significant performance improvement, of up to 5.0%\nAUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate\nUVOT methods, large-scale datasets are required. To this end, we introduce a\nlarge-scale UVOT benchmark dataset consisting of 400 video segments and 275,000\nmanually annotated frames enabling underwater training and evaluation of deep\ntrackers. The videos are labelled with several underwater-specific tracking\nattributes including watercolor variation, target distractors, camouflage,\ntarget relative size, and low visibility conditions. The UVOT400 dataset,\ntracking results, and the code are publicly available on:\nhttps://github.com/BasitAlawode/UWVOT400.\n","authors":["Basit Alawode","Fayaz Ali Dharejo","Mehnaz Ummar","Yuhang Guo","Arif Mahmood","Naoufel Werghi","Fahad Shahbaz Khan","Jiri Matas","Sajid Javed"],"pdf_url":"https://arxiv.org/pdf/2308.15816v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16518v1","updated":"2023-08-31T08:03:25Z","published":"2023-08-31T08:03:25Z","title":"MS23D: A 3D Object Detection Method Using Multi-Scale Semantic Feature\n Points to Construct 3D Feature Layers","summary":" Lidar point clouds, as a type of data with accurate distance perception, can\neffectively represent the motion and posture of objects in three-dimensional\nspace. However, the sparsity and disorderliness of point clouds make it\nchallenging to extract features directly from them. Many studies have addressed\nthis issue by transforming point clouds into regular voxel representations.\nHowever, these methods often lead to the loss of fine-grained local feature\ninformation due to downsampling. Moreover, the sparsity of point clouds poses\ndifficulties in efficiently aggregating features in 3D feature layers using\nvoxel-based two-stage methods. To address these issues, this paper proposes a\ntwo-stage 3D detection framework called MS$^{2}$3D. 
In MS$^{2}$3D, we utilize\nsmall-sized voxels to extract fine-grained local features and large-sized\nvoxels to capture long-range local features. Additionally, we propose a method\nfor constructing 3D feature layers using multi-scale semantic feature points,\nenabling the transformation of sparse 3D feature layers into more compact\nrepresentations. Furthermore, we compute the offset between feature points in\nthe 3D feature layers and the centroid of objects, aiming to bring them as\nclose as possible to the object's center. It significantly enhances the\nefficiency of feature aggregation. To validate the effectiveness of our method,\nwe evaluated our method on the KITTI dataset and ONCE dataset together.\n","authors":["Yongxin Shao","Aihong Tan","Tianhong Yan","Zhetao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.16518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06791v2","updated":"2023-08-31T07:49:41Z","published":"2023-08-13T15:30:02Z","title":"PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D\n Object Detector","summary":" LIDAR-based 3D object detection and classification is crucial for autonomous\ndriving. However, inference in real-time from extremely sparse 3D data poses a\nformidable challenge. To address this issue, a common approach is to project\npoint clouds onto a bird's-eye or perspective view, effectively converting them\ninto an image-like data format. However, this excessive compression of point\ncloud data often leads to the loss of information. This paper proposes a 3D\nobject detector based on voxel and projection double branch feature extraction\n(PV-SSD) to address the problem of information loss. We add voxel features\ninput containing rich local semantic information, which is fully fused with the\nprojected features in the feature extraction stage to reduce the local\ninformation loss caused by projection. A good performance is achieved compared\nto the previous work. In addition, this paper makes the following\ncontributions: 1) a voxel feature extraction method with variable receptive\nfields is proposed; 2) a feature point sampling method by weight sampling is\nused to filter out the feature points that are more conducive to the detection\ntask; 3) the MSSFA module is proposed based on the SSFA module. To verify the\neffectiveness of our method, we designed comparison experiments.\n","authors":["Yongxin Shao","Aihong Tan","Zhetao Sun","Enhui Zheng","Tianhong Yan"],"pdf_url":"https://arxiv.org/pdf/2308.06791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16512v1","updated":"2023-08-31T07:49:06Z","published":"2023-08-31T07:49:06Z","title":"MVDream: Multi-view Diffusion for 3D Generation","summary":" We propose MVDream, a multi-view diffusion model that is able to generate\ngeometrically consistent multi-view images from a given text prompt. By\nleveraging image diffusion models pre-trained on large-scale web datasets and a\nmulti-view dataset rendered from 3D assets, the resulting multi-view diffusion\nmodel can achieve both the generalizability of 2D diffusion and the consistency\nof 3D data. Such a model can thus be applied as a multi-view prior for 3D\ngeneration via Score Distillation Sampling, where it greatly improves the\nstability of existing 2D-lifting methods by solving the 3D consistency problem.\nFinally, we show that the multi-view diffusion model can also be fine-tuned\nunder a few shot setting for personalized 3D generation, i.e. 
DreamBooth3D\napplication, where the consistency can be maintained after learning the subject\nidentity.\n","authors":["Yichun Shi","Peng Wang","Jianglong Ye","Mai Long","Kejie Li","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16512v1.pdf","comment":"Our project page is https://MV-Dream.github.io"},{"id":"http://arxiv.org/abs/2308.16510v1","updated":"2023-08-31T07:47:11Z","published":"2023-08-31T07:47:11Z","title":"Robust GAN inversion","summary":" Recent advancements in real image editing have been attributed to the\nexploration of Generative Adversarial Networks (GANs) latent space. However,\nthe main challenge of this procedure is GAN inversion, which aims to map the\nimage to the latent space accurately. Existing methods that work on extended\nlatent space $W+$ are unable to achieve low distortion and high editability\nsimultaneously. To address this issue, we propose an approach which works in\nnative latent space $W$ and tunes the generator network to restore missing\nimage details. We introduce a novel regularization strategy with learnable\ncoefficients obtained by training randomized StyleGAN 2 model - WRanGAN. This\nmethod outperforms traditional approaches in terms of reconstruction quality\nand computational efficiency, achieving the lowest distortion with 4 times\nfewer parameters. Furthermore, we observe a slight improvement in the quality\nof constructing hyperplanes corresponding to binary image attributes. We\ndemonstrate the effectiveness of our approach on two complex datasets:\nFlickr-Faces-HQ and LSUN Church.\n","authors":["Egor Sevriugov","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2308.16510v1.pdf","comment":"22 pages, 28 figures"},{"id":"http://arxiv.org/abs/2308.09331v2","updated":"2023-08-31T07:45:59Z","published":"2023-08-18T06:26:22Z","title":"SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT","summary":" The Segment Anything Model (SAM) has gained significant attention in the\nfield of image segmentation due to its impressive capabilities and prompt-based\ninterface. While SAM has already been extensively evaluated in various domains,\nits adaptation to retinal OCT scans remains unexplored. To bridge this research\ngap, we conduct a comprehensive evaluation of SAM and its adaptations on a\nlarge-scale public dataset of OCTs from RETOUCH challenge. Our evaluation\ncovers diverse retinal diseases, fluid compartments, and device vendors,\ncomparing SAM against state-of-the-art retinal fluid segmentation methods.\nThrough our analysis, we showcase adapted SAM's efficacy as a powerful\nsegmentation model in retinal OCT scans, although still lagging behind\nestablished methods in some circumstances. The findings highlight SAM's\nadaptability and robustness, showcasing its utility as a valuable tool in\nretinal OCT image analysis and paving the way for further advancements in this\ndomain.\n","authors":["Botond Fazekas","José Morano","Dmitrii Lachinov","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2308.09331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10161v2","updated":"2023-08-31T07:38:50Z","published":"2023-08-20T04:34:30Z","title":"ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under\n Challenging Conditions","summary":" Robust 3D object detection in extreme weather and illumination conditions is\na challenging task. 
While radars and thermal cameras are known for their\nresilience to these conditions, few studies have been conducted on\nradar-thermal fusion due to the lack of corresponding datasets. To address this\ngap, we first present a new multi-modal dataset called ThermRad, which includes\na 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is\nunique because it includes data from all four sensors in extreme weather\nconditions, providing a valuable resource for future research in this area. To\nvalidate the robustness of 4D radars and thermal cameras for 3D object\ndetection in challenging weather conditions, we propose a new multi-modal\nfusion method called RTDF-RCNN, which leverages the complementary strengths of\n4D radars and thermal cameras to boost object detection performance. To further\nprove the effectiveness of our proposed framework, we re-implement\nstate-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for\nevaluation. Our method achieves significant enhancements in detecting cars,\npedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%,\nrespectively, while achieving comparable results to LiDAR-based approaches. Our\ncontributions in both the ThermRad dataset and the new multi-modal fusion\nmethod provide a new approach to robust 3D object detection in adverse weather\nand illumination conditions. The ThermRad dataset will be released.\n","authors":["Qiao Yan","Yihan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10161v2.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2303.06681v3","updated":"2023-08-31T07:26:59Z","published":"2023-03-12T14:54:22Z","title":"Learning Deep Intensity Field for Extremely Sparse-View CBCT\n Reconstruction","summary":" Sparse-view cone-beam CT (CBCT) reconstruction is an important direction to\nreduce radiation dose and benefit clinical applications. Previous voxel-based\ngeneration methods represent the CT as discrete voxels, resulting in high\nmemory requirements and limited spatial resolution due to the use of 3D\ndecoders. In this paper, we formulate the CT volume as a continuous intensity\nfield and develop a novel DIF-Net to perform high-quality CBCT reconstruction\nfrom extremely sparse (fewer than 10) projection views at an ultrafast speed.\nThe intensity field of a CT can be regarded as a continuous function of 3D\nspatial points. Therefore, the reconstruction can be reformulated as regressing\nthe intensity value of an arbitrary 3D point from given sparse projections.\nSpecifically, for a point, DIF-Net extracts its view-specific features from\ndifferent 2D projection views. These features are subsequently aggregated by a\nfusion module for intensity estimation. Notably, thousands of points can be\nprocessed in parallel to improve efficiency during training and testing. In\npractice, we collect a knee CBCT dataset to train and evaluate DIF-Net.\nExtensive experiments show that our approach can reconstruct CBCT with high\nimage quality and high spatial resolution from extremely sparse views within\n1.6 seconds, significantly outperforming state-of-the-art methods. 
Our code\nwill be available at https://github.com/xmed-lab/DIF-Net.\n","authors":["Yiqun Lin","Zhongjin Luo","Wei Zhao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2303.06681v3.pdf","comment":"MICCAI'23"},{"id":"http://arxiv.org/abs/2308.16139v2","updated":"2023-08-31T07:26:50Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.10280v2","updated":"2023-08-31T07:23:56Z","published":"2023-08-20T14:27:28Z","title":"MacFormer: Map-Agent Coupled Transformer for Real-time and Robust\n Trajectory Prediction","summary":" Predicting the future behavior of agents is a fundamental task in autonomous\nvehicle domains. Accurate prediction relies on comprehending the surrounding\nmap, which significantly regularizes agent behaviors. However, existing methods\nhave limitations in exploiting the map and exhibit a strong dependence on\nhistorical trajectories, which yield unsatisfactory prediction performance and\nrobustness. Additionally, their heavy network architectures impede real-time\napplications. To tackle these problems, we propose Map-Agent Coupled\nTransformer (MacFormer) for real-time and robust trajectory prediction. Our\nframework explicitly incorporates map constraints into the network via two\ncarefully designed modules named coupled map and reference extractor. A novel\nmulti-task optimization strategy (MTOS) is presented to enhance learning of\ntopology and rule constraints. We also devise bilateral query scheme in context\nfusion for a more efficient and lightweight network. We evaluated our approach\non Argoverse 1, Argoverse 2, and nuScenes real-world benchmarks, where it all\nachieved state-of-the-art performance with the lowest inference latency and\nsmallest model size. Experiments also demonstrate that our framework is\nresilient to imperfect tracklet inputs. Furthermore, we show that by combining\nwith our proposed strategies, classical models outperform their baselines,\nfurther validating the versatility of our framework.\n","authors":["Chen Feng","Hangning Zhou","Huadong Lin","Zhigang Zhang","Ziyao Xu","Chi Zhang","Boyu Zhou","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2308.10280v2.pdf","comment":"Accepted by IEEE Robotics and Automation Letters. 8 Pages, 9 Figures,\n 9 Tables. Video: https://www.youtube.com/watch?v=XY388iI6sPQ"},{"id":"http://arxiv.org/abs/2305.15777v2","updated":"2023-08-31T07:20:34Z","published":"2023-05-25T06:44:43Z","title":"Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation","summary":" Medical image data are often limited due to the expensive acquisition and\nannotation process. 
Hence, training a deep-learning model with only raw data\ncan easily lead to overfitting. One solution to this problem is to augment the\nraw data with various transformations, improving the model's ability to\ngeneralize to new data. However, manually configuring a generic augmentation\ncombination and parameters for different datasets is non-trivial due to\ninconsistent acquisition approaches and data distributions. Therefore,\nautomatic data augmentation has been proposed to learn favorable augmentation\nstrategies for different datasets, but it typically incurs large GPU overhead. To this\nend, we present a novel method, called Dynamic Data Augmentation (DDAug), which\nis efficient and has negligible computation cost. Our DDAug develops a\nhierarchical tree structure to represent various augmentations and utilizes an\nefficient Monte-Carlo tree search algorithm to update, prune, and sample the\ntree. As a result, the augmentation pipeline can be optimized for each dataset\nautomatically. Experiments on multiple Prostate MRI datasets show that our\nmethod outperforms the current state-of-the-art data augmentation strategies.\n","authors":["Xinyue Xu","Yuhan Hsi","Haonan Wang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2305.15777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16490v1","updated":"2023-08-31T06:52:43Z","published":"2023-08-31T06:52:43Z","title":"Latent Painter","summary":" Latent diffusers have revolutionized generative AI and inspired creative art.\nWhen denoising the latent, the predicted original image at each step\ncollectively animates the formation. However, the animation is limited by the\ndenoising nature of the diffuser, and only renders a sharpening process. This\nwork presents Latent Painter, which uses the latent as the canvas, and the\ndiffuser predictions as the plan, to generate painting animation. Latent\nPainter also transitions one generated image to another, which can happen between\nimages from two different sets of checkpoints.\n","authors":["Shih-Chieh Su"],"pdf_url":"https://arxiv.org/pdf/2308.16490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16486v1","updated":"2023-08-31T06:45:56Z","published":"2023-08-31T06:45:56Z","title":"Illumination Distillation Framework for Nighttime Person\n Re-Identification and A New Benchmark","summary":" Nighttime person Re-ID (person re-identification in the nighttime) is a very\nimportant and challenging task for visual surveillance, but it has not been\nthoroughly investigated. Under low-illumination conditions, the performance\nof person Re-ID methods usually deteriorates sharply. To address the low\nillumination challenge in nighttime person Re-ID, this paper proposes an\nIllumination Distillation Framework (IDF), which utilizes illumination\nenhancement and illumination distillation schemes to promote the learning of\nRe-ID models. Specifically, IDF consists of a master branch, an illumination\nenhancement branch, and an illumination distillation module. The master branch\nis used to extract the features from a nighttime image. The illumination\nenhancement branch first estimates an enhanced image from the nighttime image\nusing a nonlinear curve mapping method and then extracts the enhanced features.\nHowever, nighttime and enhanced features usually contain data noise due to\nunstable lighting conditions and enhancement failures. To fully exploit the\ncomplementary benefits of nighttime and enhanced features while suppressing\ndata noise, we propose an illumination distillation module.
In particular, the\nillumination distillation module fuses the features from two branches through a\nbottleneck fusion model and then uses the fused features to guide the learning\nof both branches in a distillation manner. In addition, we build a real-world\nnighttime person Re-ID dataset, named Night600, which contains 600 identities\ncaptured from different viewpoints and nighttime illumination conditions under\ncomplex outdoor environments. Experimental results demonstrate that our IDF can\nachieve state-of-the-art performance on two nighttime person Re-ID datasets\n(i.e., Night600 and Knight ). We will release our code and dataset at\nhttps://github.com/Alexadlu/IDF.\n","authors":["Andong Lu","Zhang Zhang","Yan Huang","Yifan Zhang","Chenglong Li","Jin Tang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16486v1.pdf","comment":"Accepted by TMM"},{"id":"http://arxiv.org/abs/2308.16484v1","updated":"2023-08-31T06:44:59Z","published":"2023-08-31T06:44:59Z","title":"Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning","summary":" Affordable 3D scanners often produce sparse and non-uniform point clouds that\nnegatively impact downstream applications in robotic systems. While existing\npoint cloud upsampling architectures have demonstrated promising results on\nstandard benchmarks, they tend to experience significant performance drops when\nthe test data have different distributions from the training data. To address\nthis issue, this paper proposes a test-time adaption approach to enhance model\ngenerality of point cloud upsampling. The proposed approach leverages\nmeta-learning to explicitly learn network parameters for test-time adaption.\nOur method does not require any prior information about the test data. During\nmeta-training, the model parameters are learned from a collection of\ninstance-level tasks, each of which consists of a sparse-dense pair of point\nclouds from the training data. During meta-testing, the trained model is\nfine-tuned with a few gradient updates to produce a unique set of network\nparameters for each test instance. The updated model is then used for the final\nprediction. Our framework is generic and can be applied in a plug-and-play\nmanner with existing backbone networks in point cloud upsampling. Extensive\nexperiments demonstrate that our approach improves the performance of\nstate-of-the-art models.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16481v1","updated":"2023-08-31T06:32:11Z","published":"2023-08-31T06:32:11Z","title":"Point-TTA: Test-Time Adaptation for Point Cloud Registration Using\n Multitask Meta-Auxiliary Learning","summary":" We present Point-TTA, a novel test-time adaptation framework for point cloud\nregistration (PCR) that improves the generalization and the performance of\nregistration models. While learning-based approaches have achieved impressive\nprogress, generalization to unknown testing environments remains a major\nchallenge due to the variations in 3D scans. Existing methods typically train a\ngeneric model and the same trained model is applied on each instance during\ntesting. This could be sub-optimal since it is difficult for the same model to\nhandle all the variations during testing. In this paper, we propose a test-time\nadaptation approach for PCR. Our model can adapt to unseen distributions at\ntest-time without requiring any prior knowledge of the test data. 
Concretely,\nwe design three self-supervised auxiliary tasks that are optimized jointly with\nthe primary PCR task. Given a test instance, we adapt our model using these\nauxiliary tasks and the updated model is used to perform the inference. During\ntraining, our model is trained using a meta-auxiliary learning approach, such\nthat the adapted model via auxiliary tasks improves the accuracy of the primary\ntask. Experimental results demonstrate the effectiveness of our approach in\nimproving generalization of point cloud registration and outperforming other\nstate-of-the-art approaches.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15142v2","updated":"2023-08-31T05:48:06Z","published":"2023-06-27T02:03:46Z","title":"LRANet: Towards Accurate and Efficient Scene Text Detection with\n Low-Rank Approximation Network","summary":" Recently, regression-based methods, which predict parameterized text shapes\nfor text localization, have gained popularity in scene text detection. However,\nthe existing parameterized text shape methods still have limitations in\nmodeling arbitrary-shaped texts due to ignoring the utilization of\ntext-specific shape information. Moreover, the time consumption of the entire\npipeline has been largely overlooked, leading to a suboptimal overall inference\nspeed. To address these issues, we first propose a novel parameterized text\nshape method based on low-rank approximation. Unlike other shape representation\nmethods that employ data-irrelevant parameterization, our approach utilizes\nsingular value decomposition and reconstructs the text shape using a few\neigenvectors learned from labeled text contours. By exploring the shape\ncorrelation among different text contours, our method achieves consistency,\ncompactness, simplicity, and robustness in shape representation. Next, we\npropose a dual assignment scheme for speed acceleration. It adopts a sparse\nassignment branch to accelerate the inference speed, and meanwhile, provides\nample supervised signals for training through a dense assignment branch.\nBuilding upon these designs, we implement an accurate and efficient\narbitrary-shaped text detector named LRANet. Extensive experiments are\nconducted on several challenging benchmarks, demonstrating the superior\naccuracy and efficiency of LRANet compared to state-of-the-art methods. Code\nwill be released soon.\n","authors":["Yuchen Su","Zhineng Chen","Zhiwen Shao","Yuning Du","Zhilong Ji","Jinfeng Bai","Yong Zhou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2306.15142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16477v1","updated":"2023-08-31T05:43:46Z","published":"2023-08-31T05:43:46Z","title":"PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction","summary":" Vectorized high-definition map online construction has garnered considerable\nattention in the field of autonomous driving research. Most existing approaches\nmodel changeable map elements using a fixed number of points, or predict local\nmaps in a two-stage autoregressive manner, which may miss essential details and\nlead to error accumulation. Towards precise map element learning, we propose a\nsimple yet effective architecture named PivotNet, which adopts unified\npivot-based map representations and is formulated as a direct set prediction\nparadigm. 
Concretely, we first propose a novel Point-to-Line Mask module to\nencode both the subordinate and geometrical point-line priors in the network.\nThen, a well-designed Pivot Dynamic Matching module is proposed to model the\ntopology in dynamic point sequences by introducing the concept of sequence\nmatching. Furthermore, to supervise the position and topology of the vectorized\npoint predictions, we propose a Dynamic Vectorized Sequence loss. Extensive\nexperiments and ablations show that PivotNet is remarkably superior to other\nSOTAs by 5.9 mAP at least. The code will be available soon.\n","authors":["Wenjie Ding","Limeng Qiao","Xi Qiu","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16477v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.16466v1","updated":"2023-08-31T05:20:48Z","published":"2023-08-31T05:20:48Z","title":"Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation\n with Meta-Learning","summary":" While the Segment Anything Model (SAM) excels in semantic segmentation for\ngeneral-purpose images, its performance significantly deteriorates when applied\nto medical images, primarily attributable to insufficient representation of\nmedical images in its training dataset. Nonetheless, gathering comprehensive\ndatasets and training models that are universally applicable is particularly\nchallenging due to the long-tail problem common in medical images. To address\nthis gap, here we present a Self-Sampling Meta SAM (SSM-SAM) framework for\nfew-shot medical image segmentation. Our innovation lies in the design of three\nkey modules: 1) An online fast gradient descent optimizer, further optimized by\na meta-learner, which ensures swift and robust adaptation to new tasks. 2) A\nSelf-Sampling module designed to provide well-aligned visual prompts for\nimproved attention allocation; and 3) A robust attention-based decoder\nspecifically designed for medical few-shot learning to capture relationship\nbetween different slices. Extensive experiments on a popular abdominal CT\ndataset and an MRI dataset demonstrate that the proposed method achieves\nsignificant improvements over state-of-the-art methods in few-shot\nsegmentation, with an average improvements of 10.21% and 1.80% in terms of DSC,\nrespectively. In conclusion, we present a novel approach for rapid online\nadaptation in interactive image segmentation, adapting to a new organ in just\n0.83 minutes. Code is publicly available on GitHub upon acceptance.\n","authors":["Yiming Zhang","Tianang Leng","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16463v1","updated":"2023-08-31T05:15:27Z","published":"2023-08-31T05:15:27Z","title":"Sparkles: Unlocking Chats Across Multiple Images for Multimodal\n Instruction-Following Models","summary":" Large language models exhibit enhanced zero-shot performance on various tasks\nwhen fine-tuned with instruction-following data. Multimodal\ninstruction-following models extend these capabilities by integrating both text\nand images. However, existing models such as MiniGPT-4 face challenges in\nmaintaining dialogue coherence in scenarios involving multiple images. A\nprimary reason is the lack of a specialized dataset for this critical\napplication. To bridge these gaps, we present SparklesChat, a multimodal\ninstruction-following model for open-ended dialogues across multiple images. 
To\nsupport the training, we introduce SparklesDialogue, the first\nmachine-generated dialogue dataset tailored for word-level interleaved\nmulti-image and text interactions. Furthermore, we construct SparklesEval, a\nGPT-assisted benchmark for quantitatively assessing a model's conversational\ncompetence across multiple images and dialogue turns. Our experiments validate\nthe effectiveness of SparklesChat in understanding and reasoning across\nmultiple images and dialogue turns. Specifically, SparklesChat outperformed\nMiniGPT-4 on established vision-and-language benchmarks, including the BISON\nbinary image selection task and the NLVR2 visual reasoning task. Moreover,\nSparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding\nMiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative\nevaluations further demonstrate SparklesChat's generality in handling\nreal-world applications. All resources will be available at\nhttps://github.com/HYPJUDY/Sparkles.\n","authors":["Yupan Huang","Zaiqiao Meng","Fangyu Liu","Yixuan Su","Nigel Collier","Yutong Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07864v2","updated":"2023-08-31T05:11:10Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous researches often require\ntraining the complete model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings - supervised and\nunsupervised. The results show that FedAPT can achieve better performance with\nless than 10\\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05602v2","updated":"2023-08-31T05:08:45Z","published":"2023-05-09T16:51:00Z","title":"Collaborative Chinese Text Recognition with Personalized Federated\n Learning","summary":" In Chinese text recognition, to compensate for the insufficient local data\nand improve the performance of local few-shot character recognition, it is\noften necessary for one organization to collect a large amount of data from\nsimilar organizations. 
However, due to the natural presence of private\ninformation in text data, such as addresses and phone numbers, different\norganizations are unwilling to share private data. Therefore, it becomes\nincreasingly important to design a privacy-preserving collaborative training\nframework for the Chinese text recognition task. In this paper, we introduce\npersonalized federated learning (pFL) into the Chinese text recognition task\nand propose the pFedCR algorithm, which significantly improves the model\nperformance of each client (organization) without sharing private data.\nSpecifically, pFedCR comprises two stages: a multi-round global model\ntraining stage and a local personalization stage. During stage 1, an\nattention mechanism is incorporated into the CRNN model to adapt to various\nclient data distributions. Leveraging inherent character data characteristics,\na balanced dataset is created on the server to mitigate character imbalance. In\nthe personalization phase, the global model is fine-tuned for one epoch to\ncreate a local model. Parameter averaging between local and global models\ncombines personalized and global feature extraction capabilities. Finally, we\nfine-tune only the attention layers to enhance the model's focus on local personalized\nfeatures. The experimental results on three real-world industrial scenario\ndatasets show that the pFedCR algorithm can improve the performance of local\npersonalized models by about 20\\% while also improving their generalization\nperformance on other client data domains. Compared to other state-of-the-art\npersonalized federated learning methods, pFedCR improves performance by 6\\%\n$\\sim$ 8\\%.\n","authors":["Shangchao Su","Haiyang Yu","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2305.05602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16461v1","updated":"2023-08-31T05:05:53Z","published":"2023-08-31T05:05:53Z","title":"Domain Adaptive Synapse Detection with Weak Point Annotations","summary":" The development of learning-based methods has greatly improved the detection\nof synapses from electron microscopy (EM) images. However, training a model for\neach dataset is time-consuming and requires extensive annotations.\nAdditionally, it is difficult to apply a learned model to data from different\nbrain regions due to variations in data distributions. In this paper, we\npresent AdaSyn, a two-stage segmentation-based framework for domain adaptive\nsynapse detection with weak point annotations. In the first stage, we address\nthe detection problem by utilizing a segmentation-based pipeline to obtain\nsynaptic instance masks. In the second stage, we improve model generalizability\non target data by regenerating square masks to get high-quality pseudo labels.\nBenefiting from our high-accuracy detection results, we introduce the distance\nnearest principle to match paired pre-synapses and post-synapses. In the\nWASPSYN challenge at ISBI 2023, our method ranked first.\n","authors":["Qi Chen","Wei Huang","Yueyi Zhang","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16460v1","updated":"2023-08-31T04:58:17Z","published":"2023-08-31T04:58:17Z","title":"Improving Lens Flare Removal with General Purpose Pipeline and Multiple\n Light Sources Recovery","summary":" When taking images against strong light sources, the resulting images often\ncontain heterogeneous flare artifacts.
These artifacts can importantly affect\nimage visual quality and downstream computer vision tasks. While collecting\nreal data pairs of flare-corrupted/flare-free images for training flare removal\nmodels is challenging, current methods utilize the direct-add approach to\nsynthesize data. However, these methods do not consider automatic exposure and\ntone mapping in image signal processing pipeline (ISP), leading to the limited\ngeneralization capability of deep models training using such data. Besides,\nexisting methods struggle to handle multiple light sources due to the different\nsizes, shapes and illuminance of various light sources. In this paper, we\npropose a solution to improve the performance of lens flare removal by\nrevisiting the ISP and remodeling the principle of automatic exposure in the\nsynthesis pipeline and design a more reliable light sources recovery strategy.\nThe new pipeline approaches realistic imaging by discriminating the local and\nglobal illumination through convex combination, avoiding global illumination\nshifting and local over-saturation. Our strategy for recovering multiple light\nsources convexly averages the input and output of the neural network based on\nilluminance levels, thereby avoiding the need for a hard threshold in\nidentifying light sources. We also contribute a new flare removal testing\ndataset containing the flare-corrupted images captured by ten types of consumer\nelectronics. The dataset facilitates the verification of the generalization\ncapability of flare removal methods. Extensive experiments show that our\nsolution can effectively improve the performance of lens flare removal and push\nthe frontier toward more general situations.\n","authors":["Yuyan Zhou","Dong Liang","Songcan Chen","Sheng-Jun Huang","Shuo Yang","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.16460v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.16454v1","updated":"2023-08-31T04:46:12Z","published":"2023-08-31T04:46:12Z","title":"Adversarial Finetuning with Latent Representation Constraint to Mitigate\n Accuracy-Robustness Tradeoff","summary":" This paper addresses the tradeoff between standard accuracy on clean examples\nand robustness against adversarial examples in deep neural networks (DNNs).\nAlthough adversarial training (AT) improves robustness, it degrades the\nstandard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we\npropose a novel AT method called ARREST, which comprises three components: (i)\nadversarial finetuning (AFT), (ii) representation-guided knowledge distillation\n(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples\nby initializing its parameters with a DNN that is standardly pretrained on\nclean examples. RGKD and NR respectively entail a regularization term and an\nalgorithm to preserve latent representations of clean examples during AFT. RGKD\npenalizes the distance between the representations of the standardly pretrained\nand AFT DNNs. NR switches input adversarial examples to nonadversarial ones\nwhen the representation changes significantly during AFT. 
By combining these\ncomponents, ARREST achieves both high standard accuracy and robustness.\nExperimental results demonstrate that ARREST mitigates the tradeoff more\neffectively than previous AT-based methods do.\n","authors":["Satoshi Suzuki","Shin'ya Yamaguchi","Shoichiro Takeda","Sekitoshi Kanai","Naoki Makishima","Atsushi Ando","Ryo Masumura"],"pdf_url":"https://arxiv.org/pdf/2308.16454v1.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2304.06028v2","updated":"2023-08-31T04:36:04Z","published":"2023-04-12T17:59:58Z","title":"RECLIP: Resource-efficient CLIP by Training with Small Images","summary":" We present RECLIP (Resource-efficient CLIP), a simple method that minimizes\ncomputational resource footprint for CLIP (Contrastive Language Image\nPretraining). Inspired by the notion of coarse-to-fine in computer vision, we\nleverage small images to learn from large-scale language supervision\nefficiently, and finetune the model with high-resolution data in the end. Since\nthe complexity of the vision transformer heavily depends on input image size,\nour approach significantly reduces the training resource requirements both in\ntheory and in practice. Using the same batch size and training epoch, RECLIP\nachieves highly competitive zero-shot classification and image-text retrieval\naccuracy with 6 to 8x less computational resources and 7 to 9x fewer FLOPs than\nthe baseline. Compared to the state-of-the-art contrastive learning methods,\nRECLIP demonstrates 5 to 59x training resource savings while maintaining highly\ncompetitive zero-shot classification and retrieval performance. Finally, RECLIP\nmatches the state of the art in transfer learning to open-vocabulary detection\ntasks, achieving 32 APr on LVIS. We hope this work will pave the way for the\nbroader research community to explore language supervised pretraining in\nresource-friendly settings.\n","authors":["Runze Li","Dahun Kim","Bir Bhanu","Weicheng Kuo"],"pdf_url":"https://arxiv.org/pdf/2304.06028v2.pdf","comment":"Published at Transactions on Machine Learning Research"},{"id":"http://arxiv.org/abs/2207.13085v3","updated":"2023-08-31T04:00:18Z","published":"2022-07-26T17:57:58Z","title":"Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment","summary":" Detection transformer (DETR) relies on one-to-one assignment, assigning one\nground-truth object to one prediction, for end-to-end detection without NMS\npost-processing. It is known that one-to-many assignment, assigning one\nground-truth object to multiple predictions, succeeds in detection methods such\nas Faster R-CNN and FCOS. However, the naive one-to-many assignment does not work\nfor DETR, and it remains challenging to apply one-to-many assignment to DETR\ntraining. In this paper, we introduce Group DETR, a simple yet efficient DETR\ntraining approach that introduces a group-wise way for one-to-many assignment.\nThis approach involves using multiple groups of object queries, conducting\none-to-one assignment within each group, and performing decoder self-attention\nseparately. It resembles data augmentation with automatically-learned object\nquery augmentation. It is also equivalent to simultaneously training\nparameter-sharing networks of the same architecture, introducing more\nsupervision and thus improving DETR training. The inference process is the same\nas that of a normally trained DETR and only needs one group of queries without any\narchitecture modification.
Group DETR is versatile and is applicable to various\nDETR variants. The experiments show that Group DETR significantly speeds up the\ntraining convergence and improves the performance of various DETR-based models.\nCode will be available at \\url{https://github.com/Atten4Vis/GroupDETR}.\n","authors":["Qiang Chen","Xiaokang Chen","Jian Wang","Shan Zhang","Kun Yao","Haocheng Feng","Junyu Han","Errui Ding","Gang Zeng","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2207.13085v3.pdf","comment":"ICCV23 camera ready version"},{"id":"http://arxiv.org/abs/2308.16435v1","updated":"2023-08-31T03:49:41Z","published":"2023-08-31T03:49:41Z","title":"Njobvu-AI: An open-source tool for collaborative image labeling and\n implementation of computer vision models","summary":" Practitioners interested in using computer vision models lack user-friendly\nand open-source software that combines features to label training data, allow\nmultiple users, train new algorithms, review output, and implement new models.\nLabeling training data, such as images, is a key step to developing accurate\nobject detection algorithms using computer vision. This step is often not\ncompatible with many cloud-based services for marking or labeling image and\nvideo data due to limited internet bandwidth in many regions of the world.\nDesktop tools are useful for groups working in remote locations, but users\noften do not have the capability to combine projects developed locally by\nmultiple collaborators. Furthermore, many tools offer features for labeling\ndata or using pre-trained models for classification, but few allow researchers\nto combine these steps to create and apply custom models. Free, open-source,\nand user-friendly software that offers a full suite of features (e.g., ability\nto work locally and online, and train custom models) is desirable to field\nresearchers and conservationists that may have limited coding skills. We\ndeveloped Njobvu-AI, a free, open-source tool that can be run on both desktop\nand server hardware using Node.js, allowing users to label data, combine\nprojects for collaboration and review, train custom algorithms, and implement\nnew computer vision models. The name Njobvu-AI (pronounced N-joh-voo AI),\nincorporating the Chichewa word for elephant, is inspired by a wildlife\nmonitoring program in Malawi that was a primary impetus for the development of\nthis tool and references similarities between the powerful memory of elephants\nand properties of computer vision models.\n","authors":["Jonathan S. Koning","Ashwin Subramanian","Mazen Alotaibi","Cara L. Appel","Christopher M. Sullivan","Thon Chao","Lisa Truong","Robyn L. Tanguay","Pankaj Jaiswal","Taal Levi","Damon B. Lesmeister"],"pdf_url":"https://arxiv.org/pdf/2308.16435v1.pdf","comment":"13 pages, 6 figures. For code and documentation, see\n https://github.com/sullichrosu/Njobvu-AI/"},{"id":"http://arxiv.org/abs/2307.07873v5","updated":"2023-08-31T03:47:35Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. 
In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v5.pdf","comment":"IEEE Symposium on Security and Privacy (Oakland) 2024; Extended\n version of camera-ready"},{"id":"http://arxiv.org/abs/2305.07283v3","updated":"2023-08-31T03:47:34Z","published":"2023-05-12T06:56:22Z","title":"Quaternion-valued Correlation Learning for Few-Shot Semantic\n Segmentation","summary":" Few-shot segmentation (FSS) aims to segment unseen classes given only a few\nannotated samples. Encouraging progress has been made for FSS by leveraging\nsemantic features learned from base classes with sufficient training samples to\nrepresent novel classes. The correlation-based methods lack the ability to\nconsider interaction of the two subspace matching scores due to the inherent\nnature of the real-valued 2D convolutions. In this paper, we introduce a\nquaternion perspective on correlation learning and propose a novel\nQuaternion-valued Correlation Learning Network (QCLNet), with the aim to\nalleviate the computational burden of high-dimensional correlation tensor and\nexplore internal latent interaction between query and support images by\nleveraging operations defined by the established quaternion algebra.\nSpecifically, our QCLNet is formulated as a hyper-complex valued network and\nrepresents correlation tensors in the quaternion domain, which uses\nquaternion-valued convolution to explore the external relations of query\nsubspace when considering the hidden relationship of the support sub-dimension\nin the quaternion space. Extensive experiments on the PASCAL-5i and COCO-20i\ndatasets demonstrate that our method outperforms the existing state-of-the-art\nmethods effectively. 
Our code is available at\nhttps://github.com/zwzheng98/QCLNet and our article \"Quaternion-valued\nCorrelation Learning for Few-Shot Semantic Segmentation\" was published in IEEE\nTransactions on Circuits and Systems for Video Technology, vol.\n33,no.5,pp.2102-2115,May 2023,doi: 10.1109/TCSVT.2022.3223150.\n","authors":["Zewen Zheng","Guoheng Huang","Xiaochen Yuan","Chi-Man Pun","Hongrui Liu","Wing-Kuen Ling"],"pdf_url":"https://arxiv.org/pdf/2305.07283v3.pdf","comment":"for associated paper file, see\n https://ieeexplore.ieee.org/document/9954424?source=authoralert"},{"id":"http://arxiv.org/abs/2308.14604v2","updated":"2023-08-31T03:07:03Z","published":"2023-08-28T14:17:16Z","title":"SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space\n Reconstruction","summary":" Segment Anything Model (SAM) has received remarkable attention as it offers a\npowerful and versatile solution for object segmentation in images. However,\nfine-tuning SAM for downstream segmentation tasks under different scenarios\nremains a challenge, as the varied characteristics of different scenarios\nnaturally requires diverse model parameter spaces. Most existing fine-tuning\nmethods attempt to bridge the gaps among different scenarios by introducing a\nset of new parameters to modify SAM's original parameter space. Unlike these\nworks, in this paper, we propose fine-tuning SAM efficiently by parameter space\nreconstruction (SAM-PARSER), which introduce nearly zero trainable parameters\nduring fine-tuning. In SAM-PARSER, we assume that SAM's original parameter\nspace is relatively complete, so that its bases are able to reconstruct the\nparameter space of a new scenario. We obtain the bases by matrix decomposition,\nand fine-tuning the coefficients to reconstruct the parameter space tailored to\nthe new scenario by an optimal linear combination of the bases. Experimental\nresults show that SAM-PARSER exhibits superior segmentation performance across\nvarious scenarios, while reducing the number of trainable parameters by\n$\\approx 290$ times compared with current parameter-efficient fine-tuning\nmethods.\n","authors":["Zelin Peng","Zhengqin Xu","Zhilin Zeng","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.14604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.00780v5","updated":"2023-08-31T02:27:48Z","published":"2022-07-26T10:59:42Z","title":"Visual correspondence-based explanations improve AI robustness and\n human-AI team accuracy","summary":" Explaining artificial intelligence (AI) predictions is increasingly important\nand even imperative in many high-stakes applications where humans are the\nultimate decision-makers. In this work, we propose two novel architectures of\nself-interpretable image classifiers that first explain, and then predict (as\nopposed to post-hoc explanations) by harnessing the visual correspondences\nbetween a query image and exemplars. Our models consistently improve (by 1 to 4\npoints) on out-of-distribution (OOD) datasets while performing marginally worse\n(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest\nneighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB,\nour correspondence-based explanations are found to be more useful to users than\nkNN explanations. Our explanations help users more accurately reject AI's wrong\ndecisions than all other tested methods. 
Interestingly, for the first time, we\nshow that it is possible to achieve complementary human-AI team accuracy (i.e.,\nthat is higher than either AI-alone or human-alone), in ImageNet and CUB image\nclassification tasks.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2208.00780v5.pdf","comment":"NeurIPS 2022 conference paper"},{"id":"http://arxiv.org/abs/2308.15690v2","updated":"2023-08-31T02:21:20Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. (https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v2.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2308.16404v1","updated":"2023-08-31T02:13:15Z","published":"2023-08-31T02:13:15Z","title":"Deformation Robust Text Spotting with Geometric Prior","summary":" The goal of text spotting is to perform text detection and recognition in an\nend-to-end manner. Although the diversity of luminosity and orientation in\nscene texts has been widely studied, the font diversity and shape variance of\nthe same character are ignored in recent works, since most characters in\nnatural images are rendered in standard fonts. To solve this problem, we\npresent a Chinese Artistic Dataset, termed as ARText, which contains 33,000\nartistic images with rich shape deformation and font diversity. Based on this\ndatabase, we develop a deformation robust text spotting method (DR TextSpotter)\nto solve the recognition problem of complex deformation of characters in\ndifferent fonts. Specifically, we propose a geometric prior module to highlight\nthe important features based on the unsupervised landmark detection\nsub-network. A graph convolution network is further constructed to fuse the\ncharacter features and landmark features, and then performs semantic reasoning\nto enhance the discrimination for different characters. The experiments are\nconducted on ARText and IC19-ReCTS datasets. 
Our results demonstrate the\neffectiveness of our proposed method.\n","authors":["Xixuan Hao","Aozhong Zhang","Xianze Meng","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2308.16404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16145v2","updated":"2023-08-31T01:29:35Z","published":"2023-08-30T17:01:01Z","title":"CircleFormer: Circular Nuclei Detection in Whole Slide Images with\n Circle Queries and Attention","summary":" Both CNN-based and Transformer-based object detection with bounding box\nrepresentation have been extensively studied in computer vision and medical\nimage analysis, but circular object detection in medical images is still\nunderexplored. Inspired by the recent anchor free CNN-based circular object\ndetection method (CircleNet) for ball-shape glomeruli detection in renal\npathology, in this paper, we present CircleFormer, a Transformer-based circular\nmedical object detection with dynamic anchor circles. Specifically, queries\nwith circle representation in Transformer decoder iteratively refine the\ncircular object detection results, and a circle cross attention module is\nintroduced to compute the similarity between circular queries and image\nfeatures. A generalized circle IoU (gCIoU) is proposed to serve as a new\nregression loss of circular object detection as well. Moreover, our approach is\neasy to generalize to the segmentation task by adding a simple segmentation\nbranch to CircleFormer. We evaluate our method in circular nuclei detection and\nsegmentation on the public MoNuSeg dataset, and the experimental results show\nthat our method achieves promising performance compared with the\nstate-of-the-art approaches. The effectiveness of each component is validated\nvia ablation studies as well. Our code is released at\nhttps://github.com/zhanghx-iim-ahu/CircleFormer.\n","authors":["Hengxu Zhang","Pengpeng Liang","Zhiyong Sun","Bo Song","Erkang Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16145v2.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.16386v1","updated":"2023-08-31T01:13:01Z","published":"2023-08-31T01:13:01Z","title":"RGB-T Tracking via Multi-Modal Mutual Prompt Learning","summary":" Object tracking based on the fusion of visible and thermal images, known as\nRGB-T tracking, has gained increasing attention from researchers in recent\nyears. How to achieve a more comprehensive fusion of information from the two\nmodalities with fewer computational costs has been a problem that researchers\nhave been exploring. Recently, with the rise of prompt learning in computer\nvision, we can better transfer knowledge from visual large models to downstream\ntasks. Considering the strong complementarity between visible and thermal\nmodalities, we propose a tracking architecture based on mutual prompt learning\nbetween the two modalities. We also design a lightweight prompter that\nincorporates attention mechanisms in two dimensions to transfer information\nfrom one modality to the other with lower computational costs, embedding it\ninto each layer of the backbone.
Extensive experiments have demonstrated that\nour proposed tracking architecture is effective and efficient, achieving\nstate-of-the-art performance while maintaining high running speeds.\n","authors":["Yang Luo","Xiqing Guo","Hui Feng","Lei Ao"],"pdf_url":"https://arxiv.org/pdf/2308.16386v1.pdf","comment":"9 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.16383v1","updated":"2023-08-31T01:00:59Z","published":"2023-08-31T01:00:59Z","title":"Separate and Locate: Rethink the Text in Text-based Visual Question\n Answering","summary":" Text-based Visual Question Answering (TextVQA) aims at answering questions\nabout the text in images. Most works in this field focus on designing network\nstructures or pre-training tasks. All these methods list the OCR texts in\nreading order (from left to right and top to bottom) to form a sequence, which\nis treated as a natural language ``sentence''. However, they ignore the fact\nthat most OCR words in the TextVQA task do not have a semantic contextual\nrelationship. In addition, these approaches use 1-D position embedding to\nconstruct the spatial relation between OCR tokens sequentially, which is not\nreasonable. The 1-D position embedding can only represent the left-right\nsequence relationship between words in a sentence, but not the complex spatial\nposition relationship. To tackle these problems, we propose a novel method\nnamed Separate and Locate (SaL) that explores text contextual cues and designs\nspatial position embedding to construct spatial relations between OCR texts.\nSpecifically, we propose a Text Semantic Separate (TSS) module that helps the\nmodel recognize whether words have semantic contextual relations. Then, we\nintroduce a Spatial Circle Position (SCP) module that helps the model better\nconstruct and reason about the spatial position relationships between OCR texts. Our\nSaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA\nand ST-VQA datasets. Compared with the pre-training state-of-the-art method\npre-trained on 64 million pre-training samples, our method, without any\npre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on\nTextVQA and ST-VQA. Our code and models will be released at\nhttps://github.com/fangbufang/SaL.\n","authors":["Chengyang Fang","Jiangnan Li","Liang Li","Can Ma","Dayong Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16383v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/1912.10122v3","updated":"2023-08-31T00:53:12Z","published":"2019-12-20T22:17:50Z","title":"A Region-based Randers Geodesic Approach for Image Segmentation","summary":" The geodesic model based on the eikonal partial differential equation (PDE)\nhas served as a fundamental tool for the applications of image segmentation and\nboundary detection in the past two decades. However, the existing approaches\ncommonly only exploit the image edge-based features for computing minimal\ngeodesic paths, potentially limiting their performance in complicated\nsegmentation situations. In this paper, we introduce a new variational image\nsegmentation model based on the minimal geodesic path framework and the eikonal\nPDE, where the region-based appearance term that defines the regional\nhomogeneity features can be taken into account for estimating the associated\nminimal geodesic paths. This is done by constructing a Randers geodesic metric\ninterpretation of the region-based active contour energy functional.
As a\nresult, the minimization of the active contour energy functional is transformed\ninto finding the solution to the Randers eikonal PDE.\n We also suggest a practical interactive image segmentation strategy, where\nthe target boundary can be delineated by the concatenation of several piecewise\ngeodesic paths. We invoke the Finsler variant of the fast marching method to\nestimate the geodesic distance map, yielding an efficient implementation of the\nproposed region-based Randers geodesic model for image segmentation.\nExperimental results on both synthetic and real images exhibit that our model\nindeed achieves encouraging segmentation performance.\n","authors":["Da Chen","Jean-Marie Mirebeau","Huazhong Shu","Laurent D. Cohen"],"pdf_url":"https://arxiv.org/pdf/1912.10122v3.pdf","comment":"To Appear in International Journal of Computer Vision"},{"id":"http://arxiv.org/abs/2308.16154v2","updated":"2023-08-31T00:51:45Z","published":"2023-08-30T17:20:46Z","title":"MMVP: Motion-Matrix-based Video Prediction","summary":" A central challenge of video prediction lies where the system has to reason\nthe objects' future motions from image frames while simultaneously maintaining\nthe consistency of their appearances across frames. This work introduces an\nend-to-end trainable two-stream video prediction framework, Motion-Matrix-based\nVideo Prediction (MMVP), to tackle this challenge. Unlike previous methods that\nusually handle motion prediction and appearance maintenance within the same set\nof modules, MMVP decouples motion and appearance information by constructing\nappearance-agnostic motion matrices. The motion matrices represent the temporal\nsimilarity of each and every pair of feature patches in the input frames, and\nare the sole input of the motion prediction module in MMVP. This design\nimproves video prediction in both accuracy and efficiency, and reduces the\nmodel size. Results of extensive experiments demonstrate that MMVP outperforms\nstate-of-the-art systems on public data sets by non-negligible large margins\n(about 1 db in PSNR, UCF Sports) in significantly smaller model sizes (84% the\nsize or smaller).\n","authors":["Yiqi Zhong","Luming Liang","Ilya Zharkov","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16154v2.pdf","comment":"ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.16380v1","updated":"2023-08-31T00:48:05Z","published":"2023-08-31T00:48:05Z","title":"3D vision-based structural masonry damage detection","summary":" The detection of masonry damage is essential for preventing potentially\ndisastrous outcomes. Manual inspection can, however, take a long time and be\nhazardous to human inspectors. Automation of the inspection process using novel\ncomputer vision and machine learning algorithms can be a more efficient and\nsafe solution to prevent further deterioration of the masonry structures. Most\nexisting 2D vision-based methods are limited to qualitative damage\nclassification, 2D localization, and in-plane quantification. In this study, we\npresent a 3D vision-based methodology for accurate masonry damage detection,\nwhich offers a more robust solution with a greater field of view, depth of\nvision, and the ability to detect failures in complex environments. First,\nimages of the masonry specimens are collected to generate a 3D point cloud.\nSecond, 3D point clouds processing methods are developed to evaluate the\nmasonry damage. We demonstrate the effectiveness of our approach through\nexperiments on structural masonry components. 
Our experiments showed the\nproposed system can effectively classify damage states and localize and\nquantify critical damage features. The result showed the proposed method can\nimprove the level of autonomy during the inspection of masonry structures.\n","authors":["Elmira Faraji Zonouz","Xiao Pan","Yu-Cheng Hsu","Tony Yang"],"pdf_url":"https://arxiv.org/pdf/2308.16380v1.pdf","comment":"10 pages, accepted in the Canadian Conference - Pacific Conference on\n Earthquake Engineering 2023, Vancouver, British Columbia"},{"id":"http://arxiv.org/abs/2308.15791v2","updated":"2023-08-31T00:46:47Z","published":"2023-08-30T06:49:34Z","title":"Neural Video Compression with Temporal Layer-Adaptive Hierarchical\n B-frame Coding","summary":" Neural video compression (NVC) is a rapidly evolving video coding research\narea, with some models achieving superior coding efficiency compared to the\nlatest video coding standard Versatile Video Coding (VVC). In conventional\nvideo coding standards, the hierarchical B-frame coding, which utilizes a\nbidirectional prediction structure for higher compression, had been\nwell-studied and exploited. In NVC, however, limited research has investigated\nthe hierarchical B scheme. In this paper, we propose an NVC model exploiting\nhierarchical B-frame coding with temporal layer-adaptive optimization. We first\nextend an existing unidirectional NVC model to a bidirectional model, which\nachieves -21.13% BD-rate gain over the unidirectional baseline model. However,\nthis model faces challenges when applied to sequences with complex or large\nmotions, leading to performance degradation. To address this, we introduce\ntemporal layer-adaptive optimization, incorporating methods such as temporal\nlayer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent\nscaling (TALS). The final model with the proposed methods achieves an\nimpressive BD-rate gain of -39.86% against the baseline. It also resolves the\nchallenges in sequences with large or complex motions with up to -49.13% more\nBD-rate gains than the simple bidirectional extension. This improvement is\nattributed to the allocation of more bits to lower temporal layers, thereby\nenhancing overall reconstruction quality with smaller bits. Since our method\nhas little dependency on a specific NVC model architecture, it can serve as a\ngeneral tool for extending unidirectional NVC models to the ones with\nhierarchical B-frame coding.\n","authors":["Yeongwoong Kim","Suyong Bahk","Seungeon Kim","Won Hee Lee","Dokwan Oh","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16376v1","updated":"2023-08-31T00:36:10Z","published":"2023-08-31T00:36:10Z","title":"Improving Multiple Sclerosis Lesion Segmentation Across Clinical Sites:\n A Federated Learning Approach with Noise-Resilient Training","summary":" Accurately measuring the evolution of Multiple Sclerosis (MS) with magnetic\nresonance imaging (MRI) critically informs understanding of disease progression\nand helps to direct therapeutic strategy. Deep learning models have shown\npromise for automatically segmenting MS lesions, but the scarcity of accurately\nannotated data hinders progress in this area. Obtaining sufficient data from a\nsingle clinical site is challenging and does not address the heterogeneous need\nfor model robustness. Conversely, the collection of data from multiple sites\nintroduces data privacy concerns and potential label noise due to varying\nannotation standards. 
To address this dilemma, we explore the use of the\nfederated learning framework while considering label noise. Our approach\nenables collaboration among multiple clinical sites without compromising data\nprivacy under a federated learning paradigm that incorporates a noise-robust\ntraining strategy based on label correction. Specifically, we introduce a\nDecoupled Hard Label Correction (DHLC) strategy that considers the imbalanced\ndistribution and fuzzy boundaries of MS lesions, enabling the correction of\nfalse annotations based on prediction confidence. We also introduce a Centrally\nEnhanced Label Correction (CELC) strategy, which leverages the aggregated\ncentral model as a correction teacher for all sites, enhancing the reliability\nof the correction process. Extensive experiments conducted on two multi-site\ndatasets demonstrate the effectiveness and robustness of our proposed methods,\nindicating their potential for clinical applications in multi-site\ncollaborations.\n","authors":["Lei Bai","Dongang Wang","Michael Barnett","Mariano Cabezas","Weidong Cai","Fernando Calamante","Kain Kyle","Dongnan Liu","Linda Ly","Aria Nguyen","Chun-Chien Shieh","Ryan Sullivan","Hengrui Wang","Geng Zhan","Wanli Ouyang","Chenyu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16376v1.pdf","comment":"11 pages, 4 figures, journal submission"},{"id":"http://arxiv.org/abs/2309.00174v1","updated":"2023-08-31T23:58:25Z","published":"2023-08-31T23:58:25Z","title":"Typing on Any Surface: A Deep Learning-based Method for Real-Time\n Keystroke Detection in Augmented Reality","summary":" Frustrating text entry interface has been a major obstacle in participating\nin social activities in augmented reality (AR). Popular options, such as\nmid-air keyboard interface, wireless keyboards or voice input, either suffer\nfrom poor ergonomic design, limited accuracy, or are simply embarrassing to use\nin public. This paper proposes and validates a deep-learning based approach\nthat enables AR applications to accurately predict keystrokes from the user\nperspective RGB video stream that can be captured by any AR headset. This\nenables a user to perform typing activities on any flat surface and eliminates\nthe need of a physical or virtual keyboard. A two-stage model, combining an\noff-the-shelf hand landmark extractor and a novel adaptive Convolutional\nRecurrent Neural Network (C-RNN), was trained using our newly built dataset.\nThe final model was capable of adaptively processing user-perspective video\nstreams at ~32 FPS. This base model achieved an overall accuracy of $91.05\\%$\nwhen typing 40 Words per Minute (wpm), which is how fast an average person\ntypes with two hands on a physical keyboard. The Normalised Levenshtein\nDistance also further confirmed the real-world applicability of our\napproach. The promising results highlight the viability of our approach and the\npotential for our method to be integrated into various applications. 
We also\ndiscussed the limitations and future research required to bring such technique\ninto a production system.\n","authors":["Xingyu Fu","Mingze Xi"],"pdf_url":"https://arxiv.org/pdf/2309.00174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05831v3","updated":"2023-08-31T23:57:18Z","published":"2022-05-12T01:54:22Z","title":"Feature Extractor Stacking for Cross-domain Few-shot Meta-learning","summary":" Cross-domain few-shot meta-learning (CDFSML) addresses learning problems\nwhere knowledge needs to be transferred from several source domains into an\ninstance-scarce target domain with an explicitly different distribution.\nRecently published CDFSML methods generally construct a universal model that\ncombines knowledge of multiple source domains into one backbone feature\nextractor. This enables efficient inference but necessitates re-computation of\nthe backbone whenever a new source domain is added. Some of these methods are\nalso incompatible with heterogeneous source domain backbone architectures. We\npropose feature extractor stacking (FES), a new CDFSML method for combining\ninformation from a collection of backbones, which can utilise heterogeneous\npretrained backbones out of the box, and does not maintain a universal model\nthat needs to be re-computed when its backbone collection is updated. We\npresent the basic FES algorithm, which is inspired by the classic stacking\napproach to meta-learning, and also introduce two variants: convolutional FES\n(ConFES) and regularised FES (ReFES). Given a target-domain task, these\nalgorithms fine-tune each backbone independently, use cross-validation to\nextract meta training data from the support set, and learn a simple linear\nmeta-classifier from this data. We evaluate our FES methods on the well-known\nMeta-Dataset benchmark, targeting image classification with convolutional\nneural networks, and show that they can achieve state-of-the-art performance.\n","authors":["Hongyu Wang","Eibe Frank","Bernhard Pfahringer","Michael Mayo","Geoffrey Holmes"],"pdf_url":"https://arxiv.org/pdf/2205.05831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00168v1","updated":"2023-08-31T23:17:44Z","published":"2023-08-31T23:17:44Z","title":"Pose-Graph Attentional Graph Neural Network for Lidar Place Recognition","summary":" This paper proposes a lidar place recognition approach, called P-GAT, to\nincrease the receptive field between point clouds captured over time. Instead\nof comparing pairs of point clouds, we compare the similarity between sets of\npoint clouds to use the maximum spatial and temporal information between\nneighbour clouds utilising the concept of pose-graph SLAM. Leveraging intra-\nand inter-attention and graph neural network, P-GAT relates point clouds\ncaptured in nearby locations in Euclidean space and their embeddings in feature\nspace. Experimental results on the large-scale publically available datasets\ndemonstrate the effectiveness of our approach in recognising scenes lacking\ndistinct features and when training and testing environments have different\ndistributions (domain adaptation). Further, an exhaustive comparison with the\nstate-of-the-art shows improvements in performance gains. 
Code will be\navailable upon acceptance.\n","authors":["Milad Ramezani","Liang Wang","Joshua Knights","Zhibin Li","Pauline Pounds","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2309.00168v1.pdf","comment":"8 pages, 3 figures, 5 tables"},{"id":"http://arxiv.org/abs/2306.11300v2","updated":"2023-08-31T22:33:54Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Foundation Models utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a\nnew framework that includes the Domain Foundation Model (DFM), bridging the gap\nbetween the General Foundation Model (GFM) and domain-specific downstream\ntasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\ntried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the\nDFM. Experimental results show that our proposed dataset are highly effective\nfor various tasks, improving upon the baseline by $8 \\% \\sim 16 \\%$ in\nzero-shot classification tasks, and obtaining good results in both\nVision-Language Retrieval and Semantic Localization tasks.\n\\url{https://github.com/om-ai-lab/RS5M}\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v2.pdf","comment":"RS5M dataset v4"},{"id":"http://arxiv.org/abs/2309.00158v1","updated":"2023-08-31T22:17:48Z","published":"2023-08-31T22:17:48Z","title":"BuilDiff: 3D Building Shape Generation using Single-Image Conditional\n Point Cloud Diffusion Models","summary":" 3D building generation with low data acquisition costs, such as single\nimage-to-3D, becomes increasingly important. However, most of the existing\nsingle image-to-3D building creation works are restricted to those images with\nspecific viewing angles, hence they are difficult to scale to general-view\nimages that commonly appear in practical cases. To fill this gap, we propose a\nnovel 3D building shape generation method exploiting point cloud diffusion\nmodels with image conditioning schemes, which demonstrates flexibility to the\ninput images. By cooperating two conditional diffusion models and introducing a\nregularization strategy during denoising process, our method is able to\nsynthesize building roofs while maintaining the overall structures. 
We validate\nour framework on two newly built datasets and extensive experiments show that\nour method outperforms previous works in terms of building generation quality.\n","authors":["Yao Wei","George Vosselman","Michael Ying Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00158v1.pdf","comment":"10 pages, 6 figures, accepted to ICCVW2023"},{"id":"http://arxiv.org/abs/2306.10720v4","updated":"2023-08-31T22:13:27Z","published":"2023-06-19T06:41:19Z","title":"Exploring the Relationship between Samples and Masks for Robust Defect\n Localization","summary":" Defect detection aims to detect and localize regions out of the normal\ndistribution. Previous approaches model normality and compare it with the input\nto identify defective regions, potentially limiting their generalizability. This\npaper proposes a one-stage framework that detects defective patterns directly\nwithout the modeling process. This ability is adopted through the joint efforts\nof three parties: a generative adversarial network (GAN), a newly proposed\nscaled pattern loss, and a dynamic masked cycle-consistent auxiliary network.\nExplicit information that could indicate the position of defects is\nintentionally excluded to avoid learning any direct mapping. Experimental\nresults on the texture class of the challenging MVTec AD dataset show that the\nproposed method is 2.9% higher than the SOTA methods in F1-Score, while\nsubstantially outperforming SOTA methods in generalizability.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10720v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15068v2","updated":"2023-08-31T22:11:54Z","published":"2023-08-29T07:00:35Z","title":"A Comprehensive Augmentation Framework for Anomaly Detection","summary":" Data augmentation methods are commonly integrated into the training of\nanomaly detection models. Previous approaches have primarily focused on\nreplicating real-world anomalies or enhancing diversity, without considering\nthat the standard of anomaly varies across different classes, potentially\nleading to a biased training distribution. This paper analyzes crucial traits of\nsimulated anomalies that contribute to the training of reconstructive networks\nand condenses them into several methods, thus creating a comprehensive\nframework by selectively utilizing appropriate combinations. Furthermore, we\nintegrate this framework with a reconstruction-based approach and concurrently\npropose a split training strategy that alleviates the issue of overfitting\nwhile avoiding introducing interference to the reconstruction process. The\nevaluations conducted on the MVTec anomaly detection dataset demonstrate that\nour method outperforms the previous state-of-the-art approach, particularly in\nterms of object classes. To evaluate generalizability, we generate a simulated\ndataset comprising anomalies with diverse characteristics since the original\ntest samples only include specific types of anomalies and may lead to biased\nevaluations. 
Experimental results demonstrate that our approach exhibits\npromising potential for generalizing effectively to various unforeseen\nanomalies encountered in real-world scenarios.\n","authors":["Jiang Lin","Yaping Yan"],"pdf_url":"https://arxiv.org/pdf/2308.15068v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00147v1","updated":"2023-08-31T21:42:54Z","published":"2023-08-31T21:42:54Z","title":"Optimized Deep Feature Selection for Pneumonia Detection: A Novel RegNet\n and XOR-Based PSO Approach","summary":" Pneumonia remains a significant cause of child mortality, particularly in\ndeveloping countries where resources and expertise are limited. The automated\ndetection of Pneumonia can greatly assist in addressing this challenge. In this\nresearch, an XOR based Particle Swarm Optimization (PSO) is proposed to select\ndeep features from the second last layer of a RegNet model, aiming to improve\nthe accuracy of the CNN model on Pneumonia detection. The proposed XOR PSO\nalgorithm offers simplicity by incorporating just one hyperparameter for\ninitialization, and each iteration requires minimal computation time. Moreover,\nit achieves a balance between exploration and exploitation, leading to\nconvergence on a suitable solution. By extracting 163 features, an impressive\naccuracy level of 98% was attained which demonstrates comparable accuracy to\nprevious PSO-based methods. The source code of the proposed method is available\nin the GitHub repository.\n","authors":["Fatemehsadat Ghanadi Ladani","Samaneh Hosseini Semnani"],"pdf_url":"https://arxiv.org/pdf/2309.00147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00143v1","updated":"2023-08-31T21:28:46Z","published":"2023-08-31T21:28:46Z","title":"Self-supervised Semantic Segmentation: Consistency over Transformation","summary":" Accurate medical image segmentation is of utmost importance for enabling\nautomated clinical decision procedures. However, prevailing supervised deep\nlearning approaches for medical image segmentation encounter significant\nchallenges due to their heavy dependence on extensive labeled training data. To\ntackle this issue, we propose a novel self-supervised algorithm,\n\\textbf{S$^3$-Net}, which integrates a robust framework based on the proposed\nInception Large Kernel Attention (I-LKA) modules. This architectural\nenhancement makes it possible to comprehensively capture contextual information\nwhile preserving local intricacies, thereby enabling precise semantic\nsegmentation. Furthermore, considering that lesions in medical images often\nexhibit deformations, we leverage deformable convolution as an integral\ncomponent to effectively capture and delineate lesion deformations for superior\nobject boundary definition. Additionally, our self-supervised strategy\nemphasizes the acquisition of invariance to affine transformations, which is\ncommonly encountered in medical scenarios. This emphasis on robustness with\nrespect to geometric distortions significantly enhances the model's ability to\naccurately model and handle such distortions. To enforce spatial consistency\nand promote the grouping of spatially connected image pixels with similar\nfeature representations, we introduce a spatial consistency loss term. This\naids the network in effectively capturing the relationships among neighboring\npixels and enhancing the overall segmentation quality. The S$^3$-Net approach\niteratively learns pixel-level feature representations for image content\nclustering in an end-to-end manner. 
Our experimental results on skin lesion and\nlung organ segmentation tasks show the superior performance of our method\ncompared to the SOTA approaches. https://github.com/mindflow-institue/SSCT\n","authors":["Sanaz Karimijafarbigloo","Reza Azad","Amirhossein Kazerouni","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2309.00143v1.pdf","comment":"Accepted in ICCV 2023 workshop CVAMD"},{"id":"http://arxiv.org/abs/2309.00140v1","updated":"2023-08-31T21:25:57Z","published":"2023-08-31T21:25:57Z","title":"Improving vision-inspired keyword spotting using dynamic module skipping\n in streaming conformer encoder","summary":" Using a vision-inspired keyword spotting framework, we propose an\narchitecture with input-dependent dynamic depth capable of processing streaming\naudio. Specifically, we extend a conformer encoder with trainable binary gates\nthat allow us to dynamically skip network modules according to the input audio.\nOur approach improves detection and localization accuracy on continuous speech\nusing Librispeech top-1000 most frequent words while maintaining a small memory\nfootprint. The inclusion of gates also reduces the average amount of processing\nwithout affecting the overall performance. These benefits are shown to be even\nmore pronounced using the Google speech commands dataset placed over background\nnoise where up to 97% of the processing is skipped on non-speech inputs,\ntherefore making our method particularly interesting for an always-on keyword\nspotter.\n","authors":["Alexandre Bittar","Paul Dixon","Mohammad Samragh","Kumari Nishu","Devang Naik"],"pdf_url":"https://arxiv.org/pdf/2309.00140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09941v2","updated":"2023-08-31T21:06:46Z","published":"2023-04-19T19:35:25Z","title":"A Robust and Interpretable Deep Learning Framework for Multi-modal\n Registration via Keypoints","summary":" We present KeyMorph, a deep learning-based image registration framework that\nrelies on automatically detecting corresponding keypoints. State-of-the-art\ndeep learning methods for registration often are not robust to large\nmisalignments, are not interpretable, and do not incorporate the symmetries of\nthe problem. In addition, most models produce only a single prediction at\ntest-time. Our core insight which addresses these shortcomings is that\ncorresponding keypoints between images can be used to obtain the optimal\ntransformation via a differentiable closed-form expression. We use this\nobservation to drive the end-to-end learning of keypoints tailored for the\nregistration task, and without knowledge of ground-truth keypoints. This\nframework not only leads to substantially more robust registration but also\nyields better interpretability, since the keypoints reveal which parts of the\nimage are driving the final alignment. Moreover, KeyMorph can be designed to be\nequivariant under image translations and/or symmetric with respect to the input\nimage ordering. Finally, we show how multiple deformation fields can be\ncomputed efficiently and in closed-form at test time corresponding to different\ntransformation variants. We demonstrate the proposed framework in solving 3D\naffine and spline-based registration of multi-modal brain MRI scans. In\nparticular, we show registration accuracy that surpasses current\nstate-of-the-art methods, especially in the context of large displacements. Our\ncode is available at https://github.com/alanqrwang/keymorph.\n","authors":["Alan Q. Wang","Evan M. Yu","Adrian V. 
Dalca","Mert R. Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2304.09941v2.pdf","comment":"Accepted to Medical Image Analysis 2023"},{"id":"http://arxiv.org/abs/2309.00133v1","updated":"2023-08-31T21:02:25Z","published":"2023-08-31T21:02:25Z","title":"Distraction-free Embeddings for Robust VQA","summary":" The generation of effective latent representations and their subsequent\nrefinement to incorporate precise information is an essential prerequisite for\nVision-Language Understanding (VLU) tasks such as Video Question Answering\n(VQA). However, most existing methods for VLU focus on sparsely sampling or\nfine-graining the input information (e.g., sampling a sparse set of frames or\ntext tokens), or adding external knowledge. We present a novel \"DRAX:\nDistraction Removal and Attended Cross-Alignment\" method to rid our cross-modal\nrepresentations of distractors in the latent space. We do not exclusively\nconfine the perception of any input information from various modalities but\ninstead use an attention-guided distraction removal method to increase focus on\ntask-relevant information in latent embeddings. DRAX also ensures semantic\nalignment of embeddings during cross-modal fusions. We evaluate our approach on\na challenging benchmark (SUTD-TrafficQA dataset), testing the framework's\nabilities for feature and event queries, temporal relation understanding,\nforecasting, hypothesis, and causal analysis through extensive experiments.\n","authors":["Atharvan Dogra","Deeksha Varshney","Ashwin Kalyan","Ameet Deshpande","Neeraj Kumar"],"pdf_url":"https://arxiv.org/pdf/2309.00133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04476v3","updated":"2023-08-31T20:52:06Z","published":"2023-02-09T07:39:02Z","title":"Towards Geospatial Foundation Models via Continual Pretraining","summary":" Geospatial technologies are becoming increasingly essential in our world for\na wide range of applications, including agriculture, urban planning, and\ndisaster response. To help improve the applicability and performance of deep\nlearning models on these geospatial tasks, various works have begun\ninvestigating foundation models for this domain. Researchers have explored two\nprominent approaches for introducing such models in geospatial applications,\nbut both have drawbacks in terms of limited performance benefit or prohibitive\ntraining cost. Therefore, in this work, we propose a novel paradigm for\nbuilding highly effective geospatial foundation models with minimal resource\ncost and carbon impact. We first construct a compact yet diverse dataset from\nmultiple sources to promote feature diversity, which we term GeoPile. Then, we\ninvestigate the potential of continual pretraining from large-scale\nImageNet-22k models and propose a multi-objective continual pretraining\nparadigm, which leverages the strong representations of ImageNet while\nsimultaneously providing the freedom to learn valuable in-domain features. 
Our\napproach outperforms previous state-of-the-art geospatial pretraining methods\nin an extensive evaluation on seven downstream datasets covering various tasks\nsuch as change detection, classification, multi-label classification, semantic\nsegmentation, and super-resolution.\n","authors":["Matias Mendieta","Boran Han","Xingjian Shi","Yi Zhu","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2302.04476v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.00123v1","updated":"2023-08-31T20:24:14Z","published":"2023-08-31T20:24:14Z","title":"Segmentação e contagem de troncos de madeira utilizando deep\n learning e processamento de imagens","summary":" Counting objects in images is a pattern recognition problem that focuses on\nidentifying an element to determine its incidence and is approached in the\nliterature as Visual Object Counting (VOC). In this work, we propose a\nmethodology to count wood logs. First, wood logs are segmented from the image\nbackground. This first segmentation step is obtained using the Pix2Pix\nframework that implements Conditional Generative Adversarial Networks (CGANs).\nSecond, the clusters are counted using Connected Components. The average\naccuracy of the segmentation exceeds 89% while the average amount of wood logs\nidentified based on total accounted is over 97%.\n","authors":["João V. C. Mazzochin","Gustavo Tiecker","Erick O. Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2309.00123v1.pdf","comment":"in Portuguese language, International Conference on Production\n Engineering - Americas 2022"},{"id":"http://arxiv.org/abs/2309.00121v1","updated":"2023-08-31T20:21:12Z","published":"2023-08-31T20:21:12Z","title":"Beyond Self-Attention: Deformable Large Kernel Attention for Medical\n Image Segmentation","summary":" Medical image segmentation has seen significant improvements with transformer\nmodels, which excel in grasping far-reaching contexts and global contextual\ninformation. However, the increasing computational demands of these models,\nproportional to the squared token count, limit their depth and resolution\ncapabilities. Most current methods process D volumetric image data\nslice-by-slice (called pseudo 3D), missing crucial inter-slice information and\nthus reducing the model's overall performance. To address these challenges, we\nintroduce the concept of \\textbf{Deformable Large Kernel Attention (D-LKA\nAttention)}, a streamlined attention mechanism employing large convolution\nkernels to fully appreciate volumetric context. This mechanism operates within\na receptive field akin to self-attention while sidestepping the computational\noverhead. Additionally, our proposed attention mechanism benefits from\ndeformable convolutions to flexibly warp the sampling grid, enabling the model\nto adapt appropriately to diverse data patterns. We designed both 2D and 3D\nadaptations of the D-LKA Attention, with the latter excelling in cross-depth\ndata understanding. Together, these components shape our novel hierarchical\nVision Transformer architecture, the \\textit{D-LKA Net}. 
Evaluations of our\nmodel against leading methods on popular medical segmentation datasets\n(Synapse, NIH Pancreas, and Skin lesion) demonstrate its superior performance.\nOur code implementation is publicly available at the:\nhttps://github.com/mindflow-institue/deformableLKA\n","authors":["Reza Azad","Leon Niggemeier","Michael Huttemann","Amirhossein Kazerouni","Ehsan Khodapanah Aghdam","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2309.00121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00108v1","updated":"2023-08-31T19:56:14Z","published":"2023-08-31T19:56:14Z","title":"Laplacian-Former: Overcoming the Limitations of Vision Transformers in\n Local Texture Detection","summary":" Vision Transformer (ViT) models have demonstrated a breakthrough in a wide\nrange of computer vision tasks. However, compared to the Convolutional Neural\nNetwork (CNN) models, it has been observed that the ViT models struggle to\ncapture high-frequency components of images, which can limit their ability to\ndetect local textures and edge information. As abnormalities in human tissue,\nsuch as tumors and lesions, may greatly vary in structure, texture, and shape,\nhigh-frequency information such as texture is crucial for effective semantic\nsegmentation tasks. To address this limitation in ViT models, we propose a new\ntechnique, Laplacian-Former, that enhances the self-attention map by adaptively\nre-calibrating the frequency information in a Laplacian pyramid. More\nspecifically, our proposed method utilizes a dual attention mechanism via\nefficient attention and frequency attention while the efficient attention\nmechanism reduces the complexity of self-attention to linear while producing\nthe same output, selectively intensifying the contribution of shape and texture\nfeatures. Furthermore, we introduce a novel efficient enhancement multi-scale\nbridge that effectively transfers spatial information from the encoder to the\ndecoder while preserving the fundamental features. We demonstrate the efficacy\nof Laplacian-former on multi-organ and skin lesion segmentation tasks with\n+1.87\\% and +0.76\\% dice scores compared to SOTA approaches, respectively. Our\nimplementation is publically available at\nhttps://github.com/mindflow-institue/Laplacian-Former\n","authors":["Reza Azad","Amirhossein Kazerouni","Babak Azad","Ehsan Khodapanah Aghdam","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2309.00108v1.pdf","comment":"Accepted in the main conference MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.00107v1","updated":"2023-08-31T19:55:50Z","published":"2023-08-31T19:55:50Z","title":"Unsupervised evaluation of GAN sample quality: Introducing the TTJac\n Score","summary":" Evaluation metrics are essential for assessing the performance of generative\nmodels in image synthesis. However, existing metrics often involve high memory\nand time consumption as they compute the distance between generated samples and\nreal data points. In our study, the new evaluation metric called the \"TTJac\nscore\" is proposed to measure the fidelity of individual synthesized images in\na data-free manner. The study first establishes a theoretical approach to\ndirectly evaluate the generated sample density. Then, a method incorporating\nfeature extractors and discrete function approximation through tensor train is\nintroduced to effectively assess the quality of generated samples. 
Furthermore,\nthe study demonstrates that this new metric can be used to improve the\nfidelity-variability trade-off when applying the truncation trick. The\nexperimental results of applying the proposed metric to StyleGAN 2 and StyleGAN\n2 ADA models on FFHQ, AFHQ-Wild, LSUN-Cars, and LSUN-Horse datasets are\npresented. The code used in this research will be made publicly available\nonline for the research community to access and utilize.\n","authors":["Egor Sevriugov","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2309.00107v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.12114v2","updated":"2023-08-31T19:43:17Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Model sparsification in deep learning promotes simpler, more interpretable\nmodels with fewer parameters. This not only reduces the model's memory\nfootprint and computational needs but also shortens inference time. This work\nfocuses on creating sparse models optimized for multiple tasks with fewer\nparameters. These parsimonious models also possess the potential to match or\noutperform dense models in terms of performance. In this work, we introduce\nchannel-wise l1/l2 group sparsity in the shared convolutional layers parameters\n(or weights) of the multi-task learning model. This approach facilitates the\nremoval of extraneous groups i.e., channels (due to l1 regularization) and also\nimposes a penalty on the weights, further enhancing the learning efficiency for\nall tasks (due to l2 regularization). We analyzed the results of group sparsity\nin both single-task and multi-task settings on two widely-used Multi-Task\nLearning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which\nconsist of three different computer vision tasks each, multi-task models with\napproximately 70% sparsity outperform their dense equivalents. We also\ninvestigate how changing the degree of sparsification influences the model's\nperformance, the overall sparsity percentage, the patterns of sparsity, and the\ninference time.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.00096v1","updated":"2023-08-31T19:34:09Z","published":"2023-08-31T19:34:09Z","title":"Open-Vocabulary Semantic Segmentation via Attribute\n Decomposition-Aggregation","summary":" Open-vocabulary semantic segmentation is a challenging task that requires\nsegmenting novel object categories at inference time. Recent works explore\nvision-language pre-training to handle this task, but suffer from unrealistic\nassumptions in practical scenarios, i.e., low-quality textual category names.\nFor example, this paradigm assumes that new textual categories will be\naccurately and completely provided, and exist in lexicons during pre-training.\nHowever, exceptions often happen when meet with ambiguity for brief or\nincomplete names, new words that are not present in the pre-trained lexicons,\nand difficult-to-describe categories for users. To address these issues, this\nwork proposes a novel decomposition-aggregation framework, inspired by human\ncognition in understanding new concepts. Specifically, in the decomposition\nstage, we decouple class names into diverse attribute descriptions to enrich\nsemantic contexts. 
Two attribute construction strategies are designed: using\nlarge language models for common categories, and involving manually labelling\nfor human-invented categories. In the aggregation stage, we group diverse\nattributes into an integrated global description, to form a discriminative\nclassifier that distinguishes the target object from others. One hierarchical\naggregation is further designed to achieve multi-level alignment and deep\nfusion between vision and text. The final result is obtained by computing the\nembedding similarity between aggregated attributes and images. To evaluate the\neffectiveness, we annotate three datasets with attribute descriptions, and\nconduct extensive experiments and ablation studies. The results show the\nsuperior performance of attribute decomposition-aggregation.\n","authors":["Chaofan Ma","Yuhuan Yang","Chen Ju","Fei Zhang","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00081v1","updated":"2023-08-31T18:38:57Z","published":"2023-08-31T18:38:57Z","title":"Few-shot Diagnosis of Chest x-rays Using an Ensemble of Random\n Discriminative Subspaces","summary":" Due to the scarcity of annotated data in the medical domain, few-shot\nlearning may be useful for medical image analysis tasks. We design a few-shot\nlearning method using an ensemble of random subspaces for the diagnosis of\nchest x-rays (CXRs). Our design is computationally efficient and almost 1.8\ntimes faster than method that uses the popular truncated singular value\ndecomposition (t-SVD) for subspace decomposition. The proposed method is\ntrained by minimizing a novel loss function that helps create well-separated\nclusters of training data in discriminative subspaces. As a result, minimizing\nthe loss maximizes the distance between the subspaces, making them\ndiscriminative and assisting in better classification. Experiments on\nlarge-scale publicly available CXR datasets yield promising results. Code for\nthe project will be available at\nhttps://github.com/Few-shot-Learning-on-chest-x-ray/fsl_subspace.\n","authors":[" Kshitiz","Garvit Garg","Angshuman Paul"],"pdf_url":"https://arxiv.org/pdf/2309.00081v1.pdf","comment":"ICLR MLGH Workshop 2023"},{"id":"http://arxiv.org/abs/2308.11696v2","updated":"2023-08-31T18:18:03Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking (of Language Models)","summary":" The increasing versatility of language models LMs has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs reaching\nthousands of GPU hours per model. However the efficiency aspect of these\nevaluation efforts had raised little discussion in the literature. In this work\nwe present the problem of Efficient Benchmarking namely intelligently reducing\nthe computation costs of LM evaluation without compromising reliability. Using\nthe HELM benchmark as a test case we investigate how different benchmark design\nchoices affect the computation-reliability tradeoff. We propose to evaluate the\nreliability of such decisions by using a new measure Decision Impact on\nReliability DIoR for short. 
We find for example that the current leader on HELM\nmay change by merely removing a low-ranked model from the benchmark and observe\nthat a handful of examples suffice to obtain the correct benchmark ranking.\nConversely a slightly different choice of HELM scenarios varies ranking widely.\nBased on our findings we outline a set of concrete recommendations for more\nefficient benchmark design and utilization practices leading to dramatic cost\nsavings with minimal loss of benchmark reliability often reducing computation\nby x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00066v1","updated":"2023-08-31T18:13:01Z","published":"2023-08-31T18:13:01Z","title":"SoDaCam: Software-defined Cameras via Single-Photon Imaging","summary":" Reinterpretable cameras are defined by their post-processing capabilities\nthat exceed traditional imaging. We present \"SoDaCam\" that provides\nreinterpretable cameras at the granularity of photons, from photon-cubes\nacquired by single-photon devices. Photon-cubes represent the spatio-temporal\ndetections of photons as a sequence of binary frames, at frame-rates as high as\n100 kHz. We show that simple transformations of the photon-cube, or photon-cube\nprojections, provide the functionality of numerous imaging systems including:\nexposure bracketing, flutter shutter cameras, video compressive systems, event\ncameras, and even cameras that move during exposure. Our photon-cube\nprojections offer the flexibility of being software-defined constructs that are\nonly limited by what is computable, and shot-noise. We exploit this flexibility\nto provide new capabilities for the emulated cameras. As an added benefit, our\nprojections provide camera-dependent compression of photon-cubes, which we\ndemonstrate using an implementation of our projections on a novel compute\narchitecture that is designed for single-photon imaging.\n","authors":["Varun Sundar","Andrei Ardelean","Tristan Swedish","Claudio Brusschini","Edoardo Charbon","Mohit Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.00066v1.pdf","comment":"Accepted at ICCV 2023 (oral). Project webpage can be found at\n https://wisionlab.com/project/sodacam/"},{"id":"http://arxiv.org/abs/2309.00059v1","updated":"2023-08-31T18:04:50Z","published":"2023-08-31T18:04:50Z","title":"STint: Self-supervised Temporal Interpolation for Geospatial Data","summary":" Supervised and unsupervised techniques have demonstrated the potential for\ntemporal interpolation of video data. Nevertheless, most prevailing temporal\ninterpolation techniques hinge on optical flow, which encodes the motion of\npixels between video frames. On the other hand, geospatial data exhibits lower\ntemporal resolution while encompassing a spectrum of movements and deformations\nthat challenge several assumptions inherent to optical flow. In this work, we\npropose an unsupervised temporal interpolation technique, which does not rely\non ground truth data or require any motion information like optical flow, thus\noffering a promising alternative for better generalization across geospatial\ndomains. Specifically, we introduce a self-supervised technique of dual cycle\nconsistency. 
Our proposed technique incorporates multiple cycle consistency\nlosses, which result from interpolating two frames between consecutive input\nframes through a series of stages. This dual cycle consistent constraint causes\nthe model to produce intermediate frames in a self-supervised manner. To the\nbest of our knowledge, this is the first attempt at unsupervised temporal\ninterpolation without the explicit use of optical flow. Our experimental\nevaluations across diverse geospatial datasets show that STint significantly\noutperforms existing state-of-the-art methods for unsupervised temporal\ninterpolation.\n","authors":["Nidhin Harilal","Bri-Mathias Hodge","Aneesh Subramanian","Claire Monteleoni"],"pdf_url":"https://arxiv.org/pdf/2309.00059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00058v1","updated":"2023-08-31T18:04:09Z","published":"2023-08-31T18:04:09Z","title":"Bellybutton: Accessible and Customizable Deep-Learning Image\n Segmentation","summary":" The conversion of raw images into quantifiable data can be a major hurdle in\nexperimental research, and typically involves identifying region(s) of\ninterest, a process known as segmentation. Machine learning tools for image\nsegmentation are often specific to a set of tasks, such as tracking cells, or\nrequire substantial compute or coding knowledge to train and use. Here we\nintroduce an easy-to-use (no coding required), image segmentation method, using\na 15-layer convolutional neural network that can be trained on a laptop:\nBellybutton. The algorithm trains on user-provided segmentation of example\nimages, but, as we show, just one or even a portion of one training image can\nbe sufficient in some cases. We detail the machine learning method and give\nthree use cases where Bellybutton correctly segments images despite substantial\nlighting, shape, size, focus, and/or structure variation across the regions(s)\nof interest. Instructions for easy download and use, with further details and\nthe datasets used in this paper are available at\npypi.org/project/Bellybuttonseg.\n","authors":["Sam Dillavou","Jesse M. Hanlan","Anthony T. Chieco","Hongyi Xiao","Sage Fulco","Kevin T. Turner","Douglas J. Durian"],"pdf_url":"https://arxiv.org/pdf/2309.00058v1.pdf","comment":"6 Pages 3 Figures"},{"id":"http://arxiv.org/abs/2309.00035v1","updated":"2023-08-31T17:59:48Z","published":"2023-08-31T17:59:48Z","title":"FACET: Fairness in Computer Vision Evaluation Benchmark","summary":" Computer vision models have known performance disparities across attributes\nsuch as gender and skin tone. This means during tasks such as classification\nand detection, model performance differs for certain classes based on the\ndemographics of the people in the image. These disparities have been shown to\nexist, but until now there has not been a unified approach to measure these\ndifferences for common use-cases of computer vision models. We present a new\nbenchmark named FACET (FAirness in Computer Vision EvaluaTion), a large,\npublicly available evaluation set of 32k images for some of the most common\nvision tasks - image classification, object detection and segmentation. For\nevery image in FACET, we hired expert reviewers to manually annotate\nperson-related attributes such as perceived skin tone and hair type, manually\ndraw bounding boxes and label fine-grained person-related classes such as disk\njockey or guitarist. 
In addition, we use FACET to benchmark state-of-the-art\nvision models and present a deeper understanding of potential performance\ndisparities and challenges across sensitive demographic attributes. With the\nexhaustive annotations collected, we probe models using single demographics\nattributes as well as multiple attributes using an intersectional approach\n(e.g. hair color and perceived skin tone). Our results show that\nclassification, detection, segmentation, and visual grounding models exhibit\nperformance disparities across demographic attributes and intersections of\nattributes. These harms suggest that not all people represented in datasets\nreceive fair and equitable treatment in these vision tasks. We hope current and\nfuture results using our benchmark will contribute to fairer, more robust\nvision models. FACET is available publicly at https://facet.metademolab.com/\n","authors":["Laura Gustafson","Chloe Rolland","Nikhila Ravi","Quentin Duval","Aaron Adcock","Cheng-Yang Fu","Melissa Hall","Candace Ross"],"pdf_url":"https://arxiv.org/pdf/2309.00035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00030v1","updated":"2023-08-31T15:41:40Z","published":"2023-08-31T15:41:40Z","title":"Audio-Driven Dubbing for User Generated Contents via Style-Aware\n Semi-Parametric Synthesis","summary":" Existing automated dubbing methods are usually designed for Professionally\nGenerated Content (PGC) production, which requires massive training data and\ntraining time to learn a person-specific audio-video mapping. In this paper, we\ninvestigate an audio-driven dubbing method that is more feasible for User\nGenerated Content (UGC) production. There are two unique challenges to design a\nmethod for UGC: 1) the appearances of speakers are diverse and arbitrary as the\nmethod needs to generalize across users; 2) the available video data of one\nspeaker are very limited. In order to tackle the above challenges, we first\nintroduce a new Style Translation Network to integrate the speaking style of\nthe target and the speaking content of the source via a cross-modal AdaIN\nmodule. It enables our model to quickly adapt to a new speaker. Then, we\nfurther develop a semi-parametric video renderer, which takes full advantage of\nthe limited training data of the unseen speaker via a video-level\nretrieve-warp-refine pipeline. Finally, we propose a temporal regularization\nfor the semi-parametric renderer, generating more continuous videos. Extensive\nexperiments show that our method generates videos that accurately preserve\nvarious speaking styles, yet with considerably lower amount of training data\nand training time in comparison to existing methods. Besides, our method\nachieves a faster testing speed than most recent methods.\n","authors":["Linsen Song","Wayne Wu","Chaoyou Fu","Chen Change Loy","Ran He"],"pdf_url":"https://arxiv.org/pdf/2309.00030v1.pdf","comment":"TCSVT 2022"},{"id":"http://arxiv.org/abs/2309.00028v1","updated":"2023-08-31T14:58:11Z","published":"2023-08-31T14:58:11Z","title":"Vision-Based Cranberry Crop Ripening Assessment","summary":" Agricultural domains are being transformed by recent advances in AI and\ncomputer vision that support quantitative visual evaluation. Using drone\nimaging, we develop a framework for characterizing the ripening process of\ncranberry crops. 
Our method consists of drone-based time-series collection over\na cranberry growing season, photometric calibration for albedo recovery from\npixels, and berry segmentation with semi-supervised deep learning networks\nusing point-click annotations. By extracting time-series berry albedo\nmeasurements, we evaluate four different varieties of cranberries and provide a\nquantification of their ripening rates. Such quantification has practical\nimplications for 1) assessing real-time overheating risks for cranberry bogs;\n2) large scale comparisons of progeny in crop breeding; 3) detecting disease by\nlooking for ripening pattern outliers. This work is the first of its kind in\nquantitative evaluation of ripening using computer vision methods and has\nimpact beyond cranberry crops including wine grapes, olives, blueberries, and\nmaize.\n","authors":["Faith Johnson","Jack Lowry","Kristin Dana","Peter Oudemans"],"pdf_url":"https://arxiv.org/pdf/2309.00028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00027v1","updated":"2023-08-31T13:47:01Z","published":"2023-08-31T13:47:01Z","title":"A Sequential Framework for Detection and Classification of Abnormal\n Teeth in Panoramic X-rays","summary":" This paper describes our solution for the Dental Enumeration and Diagnosis on\nPanoramic X-rays Challenge at MICCAI 2023. Our approach consists of a\nmulti-step framework tailored to the task of detecting and classifying abnormal\nteeth. The solution includes three sequential stages: dental instance\ndetection, healthy instance filtering, and abnormal instance classification. In\nthe first stage, we employed a Faster-RCNN model for detecting and identifying\nteeth. In subsequent stages, we designed a model that merged the encoding\npathway of a pretrained U-net, optimized for dental lesion detection, with the\nVgg16 architecture. The resulting model was first used for filtering out\nhealthy teeth. Then, any identified abnormal teeth were categorized,\npotentially falling into one or more of the following conditions: embedded,\nperiapical lesion, caries, deep caries. The model performing dental instance\ndetection achieved an AP score of 0.49. The model responsible for identifying\nhealthy teeth attained an F1 score of 0.71. Meanwhile, the model trained for\nmulti-label dental disease classification achieved an F1 score of 0.76. The\ncode is available at\nhttps://github.com/tudordascalu/2d-teeth-detection-challenge.\n","authors":["Tudor Dascalu","Shaqayeq Ramezanzade","Azam Bakhshandeh","Lars Bjorndal","Bulat Ibragimov"],"pdf_url":"https://arxiv.org/pdf/2309.00027v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.16761v1","updated":"2023-08-31T14:29:10Z","published":"2023-08-31T14:29:10Z","title":"Co-evolving Vector Quantization for ID-based Recommendation","summary":" Category information plays a crucial role in enhancing the quality and\npersonalization of recommendations. Nevertheless, the availability of item\ncategory information is not consistently present, particularly in the context\nof ID-based recommendations. In this work, we propose an alternative approach\nto automatically learn and generate entity (i.e., user and item) categorical\ninformation at different levels of granularity, specifically for ID-based\nrecommendation. 
Specifically, we devise a co-evolving vector quantization\nframework, namely COVE, which enables the simultaneous learning and refinement\nof code representation and entity embedding in an end-to-end manner, starting\nfrom the randomly initialized states. With its high adaptability, COVE can be\neasily integrated into existing recommendation models. We validate the\neffectiveness of COVE on various recommendation tasks including list\ncompletion, collaborative filtering, and click-through rate prediction, across\ndifferent recommendation models. We will publish the code and data for other\nresearchers to reproduce our work.\n","authors":["Qijiong Liu","Jiaren Xiao","Lu Fan","Jieming Zhu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.16761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16753v1","updated":"2023-08-31T14:19:50Z","published":"2023-08-31T14:19:50Z","title":"Context Aware Query Rewriting for Text Rankers using LLM","summary":" Query rewriting refers to an established family of approaches that are\napplied to underspecified and ambiguous queries to overcome the vocabulary\nmismatch problem in document ranking. Queries are typically rewritten during\nquery processing time for better query modelling for the downstream ranker.\nWith the advent of large-language models (LLMs), there have been initial\ninvestigations into using generative approaches to generate pseudo documents to\ntackle this inherent vocabulary gap. In this work, we analyze the utility of\nLLMs for improved query rewriting for text ranking tasks. We find that there\nare two inherent limitations of using LLMs as query re-writers -- concept drift\nwhen using only queries as prompts and large inference costs during query\nprocessing. We adopt a simple, yet surprisingly effective, approach called\ncontext aware query rewriting (CAR) to leverage the benefits of LLMs for query\nunderstanding. Firstly, we rewrite ambiguous training queries by context-aware\nprompting of LLMs, where we use only relevant documents as context.Unlike\nexisting approaches, we use LLM-based query rewriting only during the training\nphase. Eventually, a ranker is fine-tuned on the rewritten queries instead of\nthe original queries during training. In our extensive experiments, we find\nthat fine-tuning a ranker using re-written queries offers a significant\nimprovement of up to 33% on the passage ranking task and up to 28% on the\ndocument ranking task when compared to the baseline performance of using\noriginal queries.\n","authors":["Abhijit Anand","Venktesh V","Vinay Setty","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2308.16753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14276v2","updated":"2023-08-31T14:05:51Z","published":"2023-08-28T03:15:37Z","title":"Alleviating Video-Length Effect for Micro-video Recommendation","summary":" Micro-videos platforms such as TikTok are extremely popular nowadays. One\nimportant feature is that users no longer select interested videos from a set,\ninstead they either watch the recommended video or skip to the next one. As a\nresult, the time length of users' watching behavior becomes the most important\nsignal for identifying preferences. However, our empirical data analysis has\nshown a video-length effect that long videos are easier to receive a higher\nvalue of average view time, thus adopting such view-time labels for measuring\nuser preferences can easily induce a biased model that favors the longer\nvideos. 
In this paper, we propose a Video Length Debiasing Recommendation\n(VLDRec) method to alleviate such an effect for micro-video recommendation.\nVLDRec designs a data labeling approach and a sample generation module that\nbetter capture user preferences in a view-time oriented manner. It further\nleverages the multi-task learning technique to jointly optimize the above\nsamples with original biased ones. Extensive experiments show that VLDRec can\nimprove the users' view time by 1.81% and 11.32% on two real-world datasets,\ngiven a recommendation list of a fixed overall video length, compared with the\nbest baseline method. Moreover, VLDRec is also more effective in matching\nusers' interests in terms of the video content.\n","authors":["Yuhan Quan","Jingtao Ding","Chen Gao","Nian Li","Lingling Yi","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2308.14276v2.pdf","comment":"Accepted by TOIS"},{"id":"http://arxiv.org/abs/2305.06566v4","updated":"2023-08-31T13:43:43Z","published":"2023-05-11T04:51:21Z","title":"ONCE: Boosting Content-based Recommendation with Both Open- and\n Closed-source Large Language Models","summary":" Personalized content-based recommender systems have become indispensable\ntools for users to navigate through the vast amount of content available on\nplatforms like daily news websites and book recommendation services. However,\nexisting recommenders face significant challenges in understanding the content\nof items. Large language models (LLMs), which possess deep semantic\ncomprehension and extensive knowledge from pretraining, have proven to be\neffective in various natural language processing tasks. In this study, we\nexplore the potential of leveraging both open- and closed-source LLMs to\nenhance content-based recommendation. With open-source LLMs, we utilize their\ndeep layers as content encoders, enriching the representation of content at the\nembedding level. For closed-source LLMs, we employ prompting techniques to\nenrich the training data at the token level. Through comprehensive experiments,\nwe demonstrate the high effectiveness of both types of LLMs and show the\nsynergistic relationship between them. Notably, we observed a significant\nrelative improvement of up to 19.32% compared to existing state-of-the-art\nrecommendation models. These findings highlight the immense potential of both\nopen- and closed-source LLMs in enhancing content-based recommendation\nsystems. We will make our code and LLM-generated data available for other\nresearchers to reproduce our results.\n","authors":["Qijiong Liu","Nuo Chen","Tetsuya Sakai","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2305.06566v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16708v1","updated":"2023-08-31T13:24:57Z","published":"2023-08-31T13:24:57Z","title":"Concentrating on the Impact: Consequence-based Explanations in\n Recommender Systems","summary":" Recommender systems assist users in decision-making, where the presentation\nof recommended items and their explanations are critical factors for enhancing\nthe overall user experience. Although various methods for generating\nexplanations have been proposed, there is still room for improvement,\nparticularly for users who lack expertise in a specific item domain. In this\nstudy, we introduce the novel concept of \textit{consequence-based\nexplanations}, a type of explanation that emphasizes the individual impact of\nconsuming a recommended item on the user, which makes the effect of following\nrecommendations clearer. 
We conducted an online user study to examine our\nassumption about the appreciation of consequence-based explanations and their\nimpacts on different explanation aims in recommender systems. Our findings\nhighlight the importance of consequence-based explanations, which were\nwell-received by users and effectively improved user satisfaction in\nrecommender systems. These results provide valuable insights for designing\nengaging explanations that can enhance the overall user experience in\ndecision-making.\n","authors":["Sebastian Lubos","Thi Ngoc Trang Tran","Seda Polat Erdeniz","Merfat El Mansi","Alexander Felfernig","Manfred Wundara","Gerhard Leitner"],"pdf_url":"https://arxiv.org/pdf/2308.16708v1.pdf","comment":"Preprint of the paper to be presented at IntRS'23: Joint Workshop on\n Interfaces and Human Decision Making for Recommender Systems, September 18,\n 2023, Singapore. paper will be published in the workshop proceedings"},{"id":"http://arxiv.org/abs/2307.15464v5","updated":"2023-08-31T12:50:59Z","published":"2023-07-28T10:34:47Z","title":"Framework to Automatically Determine the Quality of Open Data Catalogs","summary":" Data catalogs play a crucial role in modern data-driven organizations by\nfacilitating the discovery, understanding, and utilization of diverse data\nassets. However, ensuring their quality and reliability is complex, especially\nin open and large-scale data environments. This paper proposes a framework to\nautomatically determine the quality of open data catalogs, addressing the need\nfor efficient and reliable quality assessment mechanisms. Our framework can\nanalyze various core quality dimensions, such as accuracy, completeness,\nconsistency, scalability, and timeliness, offer several alternatives for the\nassessment of compatibility and similarity across such catalogs as well as the\nimplementation of a set of non-core quality dimensions such as provenance,\nreadability, and licensing. The goal is to empower data-driven organizations to\nmake informed decisions based on trustworthy and well-curated data assets. The\nsource code that illustrates our approach can be downloaded from\nhttps://www.github.com/jorge-martinez-gil/dataq/.\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2307.15464v5.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2302.07669v2","updated":"2023-08-31T11:24:15Z","published":"2023-02-15T14:06:39Z","title":"Unsupervised Hashing with Similarity Distribution Calibration","summary":" Unsupervised hashing methods typically aim to preserve the similarity between\ndata points in a feature space by mapping them to binary hash codes. However,\nthese methods often overlook the fact that the similarity between data points\nin the continuous feature space may not be preserved in the discrete hash code\nspace, due to the limited similarity range of hash codes. The similarity range\nis bounded by the code length and can lead to a problem known as similarity\ncollapse. That is, the positive and negative pairs of data points become less\ndistinguishable from each other in the hash space. To alleviate this problem,\nin this paper a novel Similarity Distribution Calibration (SDC) method is\nintroduced. SDC aligns the hash code similarity distribution towards a\ncalibration distribution (e.g., beta distribution) with sufficient spread\nacross the entire similarity range, thus alleviating the similarity collapse\nproblem. 
Extensive experiments show that our SDC significantly outperforms the\nstate-of-the-art alternatives on coarse category-level and instance-level image\nretrieval. Code is available at https://github.com/kamwoh/sdc.\n","authors":["Kam Woh Ng","Xiatian Zhu","Jiun Tian Hoe","Chee Seng Chan","Tianyu Zhang","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2302.07669v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.16609v1","updated":"2023-08-31T10:12:32Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, aiming at learning the graph-level representations for\neffective class assignments, has achieved outstanding results, which\nheavily rely on high-quality datasets with a balanced class distribution.\nIn fact, most real-world graph data naturally presents a long-tailed form,\nwhere the head classes contain many more samples than the tail classes; it is\nthus essential to study graph-level classification over long-tailed data,\nwhich nevertheless remains largely unexplored. However, most existing long-tailed\nlearning methods in vision fail to jointly optimize the representation\nlearning and classifier training, as well as neglect the mining of the\nhard-to-classify classes. Directly applying existing methods to graphs may lead\nto sub-optimal performance, since the model trained on graphs would be more\nsensitive to the long-tailed distribution due to the complex topological\ncharacteristics. Hence, in this paper, we propose a novel long-tailed\ngraph-level classification framework via Collaborative Multi-expert Learning\n(CoMe) to tackle the problem. To equilibrate the contributions of head and tail\nclasses, we first develop balanced contrastive learning from the view of\nrepresentation learning, and then design an individual-expert classifier\ntraining based on hard class mining. In addition, we execute gated fusion and\ndisentangled knowledge distillation among the multiple experts to promote\ncollaboration in a multi-expert framework. Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v1.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2308.16505v1","updated":"2023-08-31T07:36:44Z","published":"2023-08-31T07:36:44Z","title":"Recommender AI Agent: Integrating Large Language Models for Interactive\n Recommendations","summary":" Recommender models excel at providing domain-specific item recommendations by\nleveraging extensive user behavior data. Despite their ability to act as\nlightweight domain experts, they struggle to perform versatile tasks such as\nproviding explanations and engaging in conversations. On the other hand, large\nlanguage models (LLMs) represent a significant step towards artificial general\nintelligence, showcasing remarkable capabilities in instruction comprehension,\ncommonsense reasoning, and human interaction. 
However, LLMs lack the knowledge\nof domain-specific item catalogs and behavioral patterns, particularly in areas\nthat diverge from general world knowledge, such as online e-commerce.\nFinetuning LLMs for each domain is neither economical nor efficient.\n In this paper, we bridge the gap between recommender models and LLMs,\ncombining their respective strengths to create a versatile and interactive\nrecommender system. We introduce an efficient framework called RecAgent, which\nemploys LLMs as the brain and recommender models as tools. We first outline a\nminimal set of essential tools required to transform LLMs into RecAgent. We\nthen propose an efficient workflow within RecAgent for task execution,\nincorporating key components such as a memory bus, dynamic\ndemonstration-augmented task planning, and reflection. RecAgent enables\ntraditional recommender systems, such as ID-based matrix factorization\nmodels, to become interactive systems with a natural language interface through\nthe integration of LLMs. Experimental results on several public datasets show\nthat RecAgent achieves satisfactory performance as a conversational recommender\nsystem, outperforming general-purpose LLMs.\n","authors":["Xu Huang","Jianxun Lian","Yuxuan Lei","Jing Yao","Defu Lian","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16505v1.pdf","comment":"16 pages, 15 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.16437v1","updated":"2023-08-31T03:52:57Z","published":"2023-08-31T03:52:57Z","title":"AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR\n Prediction","summary":" Click-through rate (CTR) prediction is a crucial issue in recommendation\nsystems. There has been an emergence of various public CTR datasets. However,\nexisting datasets primarily suffer from the following limitations. Firstly,\nusers generally click different types of items from multiple scenarios, and\nmodeling from multiple scenarios can provide a more comprehensive understanding\nof users. Existing datasets only include data for the same type of items from a\nsingle scenario. Secondly, multi-modal features are essential in multi-scenario\nprediction as they address the issue of inconsistent ID encoding between\ndifferent scenarios. The existing datasets are based on ID features and lack\nmulti-modal features. Thirdly, a large-scale dataset can provide a more reliable\nevaluation of models, fully reflecting the performance differences between\nmodels. The scale of existing datasets is around 100 million, which is\nrelatively small compared to real-world CTR prediction. To address these\nlimitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset\nbased on industrial data from Alipay. Specifically, AntM$^{2}$C provides the\nfollowing advantages: 1) It covers CTR data of 5 different types of items,\nproviding insights into the preferences of users for different items, including\nadvertisements, vouchers, mini-programs, contents, and videos. 2) Apart from\nID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text\nand image features, which can effectively establish connections between items\nwith different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200\nfeatures, including 200 million users and 6 million items. It is currently the\nlargest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several\ntypical CTR tasks and provide comparisons with baseline methods. 
The dataset\nhomepage is available at https://www.atecup.cn/home.\n","authors":["Zhaoxin Huan","Ke Ding","Ang Li","Xiaolu Zhang","Xu Min","Yong He","Liang Zhang","Jun Zhou","Linjian Mo","Jinjie Gu","Zhongyi Liu","Wenliang Zhong","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16437v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2307.04726v2","updated":"2023-08-31T17:59:49Z","published":"2023-07-10T17:34:23Z","title":"Diffusion Policies for Out-of-Distribution Generalization in Offline\n Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) methods leverage previous experiences to\nlearn better policies than the behavior policy used for data collection. In\ncontrast to behavior cloning, which assumes the data is collected from expert\ndemonstrations, offline RL can work with non-expert data and multimodal\nbehavior policies. However, offline RL algorithms face challenges in handling\ndistribution shifts and effectively representing policies due to the lack of\nonline interaction during training. Prior work on offline RL uses conditional\ndiffusion models to represent multimodal behavior in the dataset. Nevertheless,\nthese methods are not tailored toward alleviating the out-of-distribution state\ngeneralization. We introduce a novel method, named State Reconstruction for\nDiffusion Policies (SRDP), incorporating state reconstruction feature learning\nin the recent class of diffusion policies to address the out-of-distribution\ngeneralization problem. State reconstruction loss promotes more descriptive\nrepresentation learning of states to alleviate the distribution shift incurred\nby the out-of-distribution (OOD) states. We design a novel 2D Multimodal\nContextual Bandit environment to illustrate the OOD generalization of SRDP\ncompared to prior algorithms. In addition, we assess the performance of our\nmodel on D4RL continuous control benchmarks, namely the navigation of an 8-DoF\nant and forward locomotion of half-cheetah, hopper, and walker2d, achieving\nstate-of-the-art results.\n","authors":["Suzan Ece Ada","Erhan Oztop","Emre Ugur"],"pdf_url":"https://arxiv.org/pdf/2307.04726v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.16904v1","updated":"2023-08-31T17:59:00Z","published":"2023-08-31T17:59:00Z","title":"A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear\n Systems","summary":" Large-scale linear systems, $Ax=b$, frequently arise in practice and demand\neffective iterative solvers. Often, these systems are noisy due to operational\nerrors or faulty data-collection processes. In the past decade, the randomized\nKaczmarz (RK) algorithm has been studied extensively as an efficient iterative\nsolver for such systems. However, the convergence study of RK in the noisy\nregime is limited and considers measurement noise in the right-hand side\nvector, $b$. Unfortunately, in practice, that is not always the case; the\ncoefficient matrix $A$ can also be noisy. In this paper, we analyze the\nconvergence of RK for noisy linear systems when the coefficient matrix, $A$, is\ncorrupted with both additive and multiplicative noise, along with the noisy\nvector, $b$. In our analyses, the quantity $\\tilde R=\\| \\tilde A^{\\dagger}\n\\|_2^2 \\|\\tilde A \\|_F^2$ influences the convergence of RK, where $\\tilde A$\nrepresents a noisy version of $A$. 
We claim that our analysis is robust and\nrealistically applicable, as we do not require information about the noiseless\ncoefficient matrix, $A$, and considering different conditions on noise, we can\ncontrol the convergence of RK. We substantiate our theoretical findings by\nperforming comprehensive numerical experiments.\n","authors":["El Houcine Bergou","Soumia Boucherouite","Aritra Dutta","Xin Li","Anna Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16900v1","updated":"2023-08-31T17:58:28Z","published":"2023-08-31T17:58:28Z","title":"Learning to Taste: A Multimodal Wine Dataset","summary":" We present WineSensed, a large multimodal wine dataset for studying the\nrelations between visual perception, language, and flavor. The dataset\nencompasses 897k images of wine labels and 824k reviews of wines curated from\nthe Vivino platform. It has over 350k unique vintages, annotated with year,\nregion, rating, alcohol percentage, price, and grape composition. We obtained\nfine-grained flavor annotations on a subset by conducting a wine-tasting\nexperiment with 256 participants who were asked to rank wines based on their\nsimilarity in flavor, resulting in more than 5k pairwise flavor distances. We\npropose a low-dimensional concept embedding algorithm that combines human\nexperience with automatic machine similarity kernels. We demonstrate that this\nshared concept embedding space improves upon separate embedding spaces for\ncoarse flavor classification (alcohol percentage, country, grape, price,\nrating) and aligns with the intricate human perception of flavor.\n","authors":["Thoranna Bender","Simon Møe Sørensen","Alireza Kashani","K. Eldjarn Hjorleifsson","Grethe Hyldig","Søren Hauberg","Serge Belongie","Frederik Warburg"],"pdf_url":"https://arxiv.org/pdf/2308.16900v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16898v1","updated":"2023-08-31T17:57:50Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. 
(3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16896v1","updated":"2023-08-31T17:57:17Z","published":"2023-08-31T17:57:17Z","title":"PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic\n Occupancy Prediction","summary":" Semantic segmentation in autonomous driving has been undergoing an evolution\nfrom sparse point segmentation to dense voxel segmentation, where the objective\nis to predict the semantic occupancy of each voxel in the concerned 3D space.\nThe dense nature of the prediction space has rendered existing efficient\n2D-projection-based methods (e.g., bird's eye view, range view, etc.)\nineffective, as they can only describe a subspace of the 3D scene. To address\nthis, we propose a cylindrical tri-perspective view to represent point clouds\neffectively and comprehensively and a PointOcc model to process them\nefficiently. Considering the distance distribution of LiDAR point clouds, we\nconstruct the tri-perspective view in the cylindrical coordinate system for\nmore fine-grained modeling of nearer areas. We employ spatial group pooling to\nmaintain structural details during projection and adopt 2D backbones to\nefficiently process each TPV plane. Finally, we obtain the features of each\npoint by aggregating its projected features on each of the processed TPV planes\nwithout the need for any post-processing. Extensive experiments on both 3D\noccupancy prediction and LiDAR segmentation benchmarks demonstrate that the\nproposed PointOcc achieves state-of-the-art performance with much faster speed.\nSpecifically, despite only using LiDAR, PointOcc significantly outperforms all\nother methods, including multi-modal methods, with a large margin on the\nOpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc.\n","authors":["Sicheng Zuo","Wenzhao Zheng","Yuanhui Huang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2308.16896v1.pdf","comment":"Code is available at https://github.com/wzzheng/PointOcc"},{"id":"http://arxiv.org/abs/2308.16893v1","updated":"2023-08-31T17:56:13Z","published":"2023-08-31T17:56:13Z","title":"Language-Conditioned Path Planning","summary":" Contact is at the core of robotic manipulation. At times, it is desired (e.g.\nmanipulation and grasping), and at times, it is harmful (e.g. when avoiding\nobstacles). However, traditional path planning algorithms focus solely on\ncollision-free paths, limiting their applicability in contact-rich tasks. To\naddress this limitation, we propose the domain of Language-Conditioned Path\nPlanning, where contact-awareness is incorporated into the path planning\nproblem. As a first step in this domain, we propose Language-Conditioned\nCollision Functions (LACO) a novel approach that learns a collision function\nusing only a single-view image, language prompt, and robot configuration. 
LACO\npredicts collisions between the robot and the environment, enabling flexible,\nconditional path planning without the need for manual object annotations, point\ncloud data, or ground-truth object meshes. In both simulation and the real\nworld, we demonstrate that LACO can facilitate complex, nuanced path plans that\nallow for interaction with objects that are safe to collide, rather than\nprohibiting any collision.\n","authors":["Amber Xie","Youngwoon Lee","Pieter Abbeel","Stephen James"],"pdf_url":"https://arxiv.org/pdf/2308.16893v1.pdf","comment":"Conference on Robot Learning, 2023"},{"id":"http://arxiv.org/abs/2308.16891v1","updated":"2023-08-31T17:52:10Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural\nfeature $\\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v1.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2212.02611v2","updated":"2023-08-31T17:51:08Z","published":"2022-12-05T21:52:12Z","title":"StyleGAN as a Utility-Preserving Face De-identification Method","summary":" Face de-identification methods have been proposed to preserve users' privacy\nby obscuring their faces. These methods, however, can degrade the quality of\nphotos, and they usually do not preserve the utility of faces, i.e., their age,\ngender, pose, and facial expression. Recently, GANs, such as StyleGAN, have\nbeen proposed, which generate realistic, high-quality imaginary faces. In this\npaper, we investigate the use of StyleGAN in generating de-identified faces\nthrough style mixing. We examined this de-identification method for preserving\nutility and privacy by implementing several face detection, verification, and\nidentification attacks and conducting a user study. The results from our\nextensive experiments, human evaluation, and comparison with two\nstate-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN\nperforms on par or better than these methods, preserving users' privacy and\nimages' utility. 
In particular, the results of the machine learning-based\nexperiments show that StyleGAN0-4 preserves utility better than CIAGAN and\nDeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves\nutility at the same level while providing more privacy. In this paper, for the\nfirst time, we also performed a carefully designed user study to examine both\nprivacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well\nas CIAGAN and DeepPrivacy from the human observers' perspectives. Our\nstatistical tests showed that participants tend to verify and identify\nStyleGAN0-5 images more easily than DeepPrivacy images. All the methods but\nStyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding\nutility, as expected, StyleGAN0-5 performed significantly better in preserving\nsome attributes. Among all methods, on average, participants believe gender has\nbeen preserved the most while naturalness has been preserved the least.\n","authors":["Seyyed Mohammad Sadegh Moosavi Khorzooghi","Shirin Nilizadeh"],"pdf_url":"https://arxiv.org/pdf/2212.02611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16889v1","updated":"2023-08-31T17:50:54Z","published":"2023-08-31T17:50:54Z","title":"Federated Learning in UAV-Enhanced Networks: Joint Coverage and\n Convergence Time Optimization","summary":" Federated learning (FL) involves several devices that collaboratively train a\nshared model without transferring their local data. FL reduces the\ncommunication overhead, making it a promising learning method in UAV-enhanced\nwireless networks with scarce energy resources. Despite the potential,\nimplementing FL in UAV-enhanced networks is challenging, as conventional UAV\nplacement methods that maximize coverage increase the FL delay significantly.\nMoreover, the uncertainty and lack of a priori information about crucial\nvariables, such as channel quality, exacerbate the problem. In this paper, we\nfirst analyze the statistical characteristics of a UAV-enhanced wireless sensor\nnetwork (WSN) with energy harvesting. We then develop a model and solution\nbased on the multi-objective multi-armed bandit theory to maximize the network\ncoverage while minimizing the FL delay. Besides, we propose another solution\nthat is particularly useful with large action sets and strict energy\nconstraints at the UAVs. Our proposal uses a scalarized best-arm identification\nalgorithm to find the optimal arms that maximize the ratio of the expected\nreward to the expected energy cost by sequentially eliminating one or more arms\nin each round. Then, we derive the upper bound on the error probability of our\nmulti-objective and cost-aware algorithm. Numerical results show the\neffectiveness of our approach.\n","authors":["Mariam Yahya","Setareh Maghsudi","Slawomir Stanczak"],"pdf_url":"https://arxiv.org/pdf/2308.16889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16886v1","updated":"2023-08-31T17:45:34Z","published":"2023-08-31T17:45:34Z","title":"Prediction of Diblock Copolymer Morphology via Machine Learning","summary":" A machine learning approach is presented to accelerate the computation of\nblock polymer morphology evolution for large domains over long timescales. The\nstrategy exploits the separation of characteristic times between coarse-grained\nparticle evolution on the monomer scale and slow morphological evolution over\nmesoscopic scales. 
In contrast to empirical continuum models, the proposed\napproach learns stochastically driven defect annihilation processes directly\nfrom particle-based simulations. A UNet architecture that respects different\nboundary conditions is adopted, thereby allowing periodic and fixed substrate\nboundary conditions of arbitrary shape. Physical concepts are also introduced\nvia the loss function and symmetries are incorporated via data augmentation.\nThe model is validated using three different use cases. Explainable artificial\nintelligence methods are applied to visualize the morphology evolution over\ntime. This approach enables the generation of large system sizes and long\ntrajectories to investigate defect densities and their evolution under\ndifferent types of confinement. As an application, we demonstrate the\nimportance of accessing late-stage morphologies for understanding particle\ndiffusion inside a single block. This work has implications for directed\nself-assembly and materials design in micro-electronics, battery materials, and\nmembranes.\n","authors":["Hyun Park","Boyuan Yu","Juhae Park","Ge Sun","Emad Tajkhorshid","Juan J. de Pablo","Ludwig Schneider"],"pdf_url":"https://arxiv.org/pdf/2308.16886v1.pdf","comment":"51 page, 11 Figures and 5 figures in the SI"},{"id":"http://arxiv.org/abs/2308.16884v1","updated":"2023-08-31T17:43:08Z","published":"2023-08-31T17:43:08Z","title":"The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122\n Language Variants","summary":" We present Belebele, a multiple-choice machine reading comprehension (MRC)\ndataset spanning 122 language variants. Significantly expanding the language\ncoverage of natural language understanding (NLU) benchmarks, this dataset\nenables the evaluation of text models in high-, medium-, and low-resource\nlanguages. Each question is based on a short passage from the Flores-200\ndataset and has four multiple-choice answers. The questions were carefully\ncurated to discriminate between models with different levels of general\nlanguage comprehension. The English dataset on its own proves difficult enough\nto challenge state-of-the-art language models. Being fully parallel, this\ndataset enables direct comparison of model performance across all languages. We\nuse this dataset to evaluate the capabilities of multilingual masked language\nmodels (MLMs) and large language models (LLMs). We present extensive results\nand find that despite significant cross-lingual transfer in English-centric\nLLMs, much smaller MLMs pretrained on balanced multilingual data still\nunderstand far more languages. We also observe that larger vocabulary size and\nconscious vocabulary construction correlate with better performance on\nlow-resource languages. 
Overall, Belebele opens up new avenues for evaluating\nand analyzing the multilingual capabilities of NLP systems.\n","authors":["Lucas Bandarkar","Davis Liang","Benjamin Muller","Mikel Artetxe","Satya Narayan Shukla","Donald Husa","Naman Goyal","Abhinandan Krishnan","Luke Zettlemoyer","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2308.16884v1.pdf","comment":"27 pages, 13 figures"},{"id":"http://arxiv.org/abs/2209.03450v2","updated":"2023-08-31T17:38:19Z","published":"2022-09-07T20:11:17Z","title":"Seeking Interpretability and Explainability in Binary Activated Neural\n Networks","summary":" We study the use of binary activated neural networks as interpretable and\nexplainable predictors in the context of regression tasks on tabular data; more\nspecifically, we provide guarantees on their expressiveness, present an\napproach based on the efficient computation of SHAP values for quantifying the\nrelative importance of the features, hidden neurons and even weights. As the\nmodel's simplicity is instrumental in achieving interpretability, we propose a\ngreedy algorithm for building compact binary activated networks. This approach\ndoesn't need to fix an architecture for the network in advance: it is built one\nlayer at a time, one neuron at a time, leading to predictors that aren't\nneedlessly complex for a given task.\n","authors":["Benjamin Leblanc","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2209.03450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.02373v2","updated":"2023-08-31T17:12:16Z","published":"2022-10-05T16:30:35Z","title":"Dynamical systems' based neural networks","summary":" Neural networks have gained much interest because of their effectiveness in\nmany applications. However, their mathematical properties are generally not\nwell understood. If there is some underlying geometric structure inherent to\nthe data or to the function to approximate, it is often desirable to take this\ninto account in the design of the neural network. In this work, we start with a\nnon-autonomous ODE and build neural networks using a suitable,\nstructure-preserving, numerical time-discretisation. The structure of the\nneural network is then inferred from the properties of the ODE vector field.\nBesides injecting more structure into the network architectures, this modelling\nprocedure allows a better theoretical understanding of their behaviour. We\npresent two universal approximation results and demonstrate how to impose some\nparticular properties on the neural networks. A particular focus is on\n1-Lipschitz architectures including layers that are not 1-Lipschitz. These\nnetworks are expressive and robust against adversarial attacks, as shown for\nthe CIFAR-10 and CIFAR-100 datasets.\n","authors":["Elena Celledoni","Davide Murari","Brynjulf Owren","Carola-Bibiane Schönlieb","Ferdia Sherry"],"pdf_url":"https://arxiv.org/pdf/2210.02373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16859v1","updated":"2023-08-31T17:03:34Z","published":"2023-08-31T17:03:34Z","title":"Information Theoretically Optimal Sample Complexity of Learning\n Dynamical Directed Acyclic Graphs","summary":" In this article, the optimal sample complexity of learning the underlying\ninteraction/dependencies of a Linear Dynamical System (LDS) over a Directed\nAcyclic Graph (DAG) is studied. The sample complexity of learning a DAG's\nstructure is well-studied for static systems, where the samples of nodal states\nare independent and identically distributed (i.i.d.). 
However, such a study is\nless explored for DAGs with dynamical systems, where the nodal states are\ntemporally correlated. We call such a DAG underlying an LDS a \emph{dynamical}\nDAG (DDAG). In particular, we consider a DDAG where the nodal dynamics are\ndriven by unobserved exogenous noise sources that are wide-sense stationary\n(WSS) in time but are mutually uncorrelated, and have the same {power spectral\ndensity (PSD)}. Inspired by the static settings, a metric and an algorithm\nbased on the PSD matrix of the observed time series are proposed to reconstruct\nthe DDAG. The equal noise PSD assumption can be relaxed such that\nidentifiability conditions for DDAG reconstruction are not violated. For the\nLDS with WSS (sub) Gaussian exogenous noise sources, it is shown that the\noptimal sample complexity (or length of state trajectory) needed to learn the\nDDAG is $n=\Theta(q\log(p/q))$, where $p$ is the number of nodes and $q$ is the\nmaximum number of parents per node. To prove the sample complexity upper bound,\na concentration bound for the PSD estimation is derived, under two different\nsampling strategies. A matching min-max lower bound using generalized Fano's\ninequality is also provided, thus showing the order optimality of the proposed\nalgorithm.\n","authors":["Mishfad Shaikh Veedu","Deepjyoti Deka","Murti V. Salapaka"],"pdf_url":"https://arxiv.org/pdf/2308.16859v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2308.16858v1","updated":"2023-08-31T17:03:16Z","published":"2023-08-31T17:03:16Z","title":"Majorization-Minimization for sparse SVMs","summary":" Several decades ago, Support Vector Machines (SVMs) were introduced for\nperforming binary classification tasks, under a supervised framework. Nowadays,\nthey often outperform other supervised methods and remain one of the most\npopular approaches in the machine learning arena. In this work, we investigate\nthe training of SVMs through a smooth sparse-promoting-regularized squared\nhinge loss minimization. This choice paves the way to the application of quick\ntraining methods built on majorization-minimization approaches, benefiting from\nthe Lipschitz differentiability of the loss function. Moreover, the proposed\napproach allows us to handle sparsity-preserving regularizers promoting the\nselection of the most significant features, thus enhancing the performance.\nNumerical tests and comparisons conducted on three different datasets\ndemonstrate the good performance of the proposed methodology in terms of\nqualitative metrics (accuracy, precision, recall, and F1 score) as well as\ncomputational cost.\n","authors":["Alessandro Benfenati","Emilie Chouzenoux","Giorgia Franchini","Salla Latva-Aijo","Dominik Narnhofer","Jean-Christophe Pesquet","Sebastian J. Scott","Mahsa Yousefi"],"pdf_url":"https://arxiv.org/pdf/2308.16858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14172v2","updated":"2023-08-31T16:57:35Z","published":"2023-08-27T18:28:58Z","title":"Hypergraph Structure Inference From Data Under Smoothness Prior","summary":" Hypergraphs are important for processing data with higher-order relationships\ninvolving more than two entities. In scenarios where explicit hypergraphs are\nnot readily available, it is desirable to infer a meaningful hypergraph\nstructure from the node features to capture the intrinsic relations within the\ndata. 
However, existing methods either adopt simple pre-defined rules that fail\nto precisely capture the distribution of the potential hypergraph structure, or\nlearn a mapping between hypergraph structures and node features but require a\nlarge amount of labelled data, i.e., pre-existing hypergraph structures, for\ntraining. Both restrict their applications in practical scenarios. To fill this\ngap, we propose a novel smoothness prior that enables us to design a method to\ninfer the probability for each potential hyperedge without labelled data as\nsupervision. The proposed prior indicates features of nodes in a hyperedge are\nhighly correlated by the features of the hyperedge containing them. We use this\nprior to derive the relation between the hypergraph structure and the node\nfeatures via probabilistic modelling. This allows us to develop an unsupervised\ninference method to estimate the probability for each potential hyperedge via\nsolving an optimisation problem that has an analytical solution. Experiments on\nboth synthetic and real-world data demonstrate that our method can learn\nmeaningful hypergraph structures from data more efficiently than existing\nhypergraph structure inference methods.\n","authors":["Bohan Tang","Siheng Chen","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.14172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15034v2","updated":"2023-08-31T16:37:28Z","published":"2023-07-27T17:42:06Z","title":"Speeding up Fourier Neural Operators via Mixed Precision","summary":" The Fourier neural operator (FNO) is a powerful technique for learning\nsurrogate maps for partial differential equation (PDE) solution operators. For\nmany real-world applications, which often require high-resolution data points,\ntraining time and memory usage are significant bottlenecks. While there are\nmixed-precision training techniques for standard neural networks, those work\nfor real-valued datatypes on finite dimensions and therefore cannot be directly\napplied to FNO, which crucially operates in the (complex-valued) Fourier domain\nand in function spaces. On the other hand, since the Fourier transform is\nalready an approximation (due to discretization error), we do not need to\nperform the operation at full precision. In this work, we (i) profile memory\nand runtime for FNO with full and mixed-precision training, (ii) conduct a\nstudy on the numerical stability of mixed-precision training of FNO, and (iii)\ndevise a training routine which substantially decreases training time and\nmemory usage (up to 34%), with little or no reduction in accuracy, on the\nNavier-Stokes and Darcy flow equations. Combined with the recently proposed\ntensorized FNO (Kossaifi et al., 2023), the resulting model has far better\nperformance while also being significantly faster than the original FNO.\n","authors":["Colin White","Renbo Tu","Jean Kossaifi","Gennady Pekhimenko","Kamyar Azizzadenesheli","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2307.15034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00752v3","updated":"2023-08-31T16:28:50Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. 
Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v3.pdf","comment":"Submitted to IEEE Transactions on Machine Learning in Communications\n and Networking"},{"id":"http://arxiv.org/abs/2308.16848v1","updated":"2023-08-31T16:27:08Z","published":"2023-08-31T16:27:08Z","title":"Natural Quantum Monte Carlo Computation of Excited States","summary":" We present a variational Monte Carlo algorithm for estimating the lowest\nexcited states of a quantum system which is a natural generalization of the\nestimation of ground states. The method has no free parameters and requires no\nexplicit orthogonalization of the different states, instead transforming the\nproblem of finding excited states of a given system into that of finding the\nground state of an expanded system. Expected values of arbitrary observables\ncan be calculated, including off-diagonal expectations between different states\nsuch as the transition dipole moment. Although the method is entirely general,\nit works particularly well in conjunction with recent work on using neural\nnetworks as variational Ansatze for many-electron systems, and we show that by\ncombining this method with the FermiNet and Psiformer Ansatze we can accurately\nrecover vertical excitation energies and oscillator strengths on molecules as\nlarge as benzene. Beyond the examples on molecules presented here, we expect\nthis technique will be of great interest for applications of variational\nquantum Monte Carlo to atomic, nuclear and condensed matter physics.\n","authors":["David Pfau","Simon Axelrod","Halvard Sutterud","Ingrid von Glehn","James S. 
Spencer"],"pdf_url":"https://arxiv.org/pdf/2308.16848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16847v1","updated":"2023-08-31T16:26:17Z","published":"2023-08-31T16:26:17Z","title":"Diffusion Models for Interferometric Satellite Aperture Radar","summary":" Probabilistic Diffusion Models (PDMs) have recently emerged as a very\npromising class of generative models, achieving high performance in natural\nimage generation. However, their performance relative to non-natural images,\nlike radar-based satellite data, remains largely unknown. Generating large\namounts of synthetic (and especially labelled) satellite data is crucial to\nimplement deep-learning approaches for the processing and analysis of\n(interferometric) satellite aperture radar data. Here, we leverage PDMs to\ngenerate several radar-based satellite image datasets. We show that PDMs\nsucceed in generating images with complex and realistic structures, but that\nsampling time remains an issue. Indeed, accelerated sampling strategies, which\nwork well on simple image datasets like MNIST, fail on our radar datasets. We\nprovide a simple and versatile open-source\nhttps://github.com/thomaskerdreux/PDM_SAR_InSAR_generation to train, sample and\nevaluate PDMs using any dataset on a single GPU.\n","authors":["Alexandre Tuel","Thomas Kerdreux","Claudia Hulbert","Bertrand Rouet-Leduc"],"pdf_url":"https://arxiv.org/pdf/2308.16847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08761v3","updated":"2023-08-31T16:21:10Z","published":"2023-02-17T08:56:07Z","title":"Metropolitan Segment Traffic Speeds from Massive Floating Car Data in 10\n Cities","summary":" Traffic analysis is crucial for urban operations and planning, while the\navailability of dense urban traffic data beyond loop detectors is still scarce.\nWe present a large-scale floating vehicle dataset of per-street segment traffic\ninformation, Metropolitan Segment Traffic Speeds from Massive Floating Car Data\nin 10 Cities (MeTS-10), available for 10 global cities with a 15-minute\nresolution for collection periods ranging between 108 and 361 days in 2019-2021\nand covering more than 1500 square kilometers per metropolitan area. MeTS-10\nfeatures traffic speed information at all street levels from main arterials to\nlocal streets for Antwerp, Bangkok, Barcelona, Berlin, Chicago, Istanbul,\nLondon, Madrid, Melbourne and Moscow. The dataset leverages the\nindustrial-scale floating vehicle Traffic4cast data with speeds and vehicle\ncounts provided in a privacy-preserving spatio-temporal aggregation. We detail\nthe efficient matching approach mapping the data to the OpenStreetMap road\ngraph. We evaluate the dataset by comparing it with publicly available\nstationary vehicle detector data (for Berlin, London, and Madrid) and the Uber\ntraffic speed dataset (for Barcelona, Berlin, and London). The comparison\nhighlights the differences across datasets in spatio-temporal coverage and\nvariations in the reported traffic caused by the binning method. MeTS-10\nenables novel, city-wide analysis of mobility and traffic patterns for ten\nmajor world cities, overcoming current limitations of spatially sparse vehicle\ndetector data. 
The large spatial and temporal coverage offers an opportunity\nfor joining the MeTS-10 with other datasets, such as traffic surveys in traffic\nplanning studies or vehicle detector data in traffic control settings.\n","authors":["Moritz Neun","Christian Eichenberger","Yanan Xin","Cheng Fu","Nina Wiedemann","Henry Martin","Martin Tomko","Lukas Ambühl","Luca Hermes","Michael Kopp"],"pdf_url":"https://arxiv.org/pdf/2302.08761v3.pdf","comment":"Accepted by IEEE Transactions on Intelligent Transportation Systems\n (T-ITS), DOI: https://doi.org/10.1109/TITS.2023.3291737"},{"id":"http://arxiv.org/abs/2306.08149v3","updated":"2023-08-31T16:14:05Z","published":"2023-06-13T21:47:30Z","title":"Neural Mixed Effects for Nonlinear Personalized Predictions","summary":" Personalized prediction is a machine learning approach that predicts a\nperson's future observations based on their past labeled observations and is\ntypically used for sequential tasks, e.g., to predict daily mood ratings. When\nmaking personalized predictions, a model can combine two types of trends: (a)\ntrends shared across people, i.e., person-generic trends, such as being happier\non weekends, and (b) unique trends for each person, i.e., person-specific\ntrends, such as a stressful weekly meeting. Mixed effect models are popular\nstatistical models to study both trends by combining person-generic and\nperson-specific parameters. Though linear mixed effect models are gaining\npopularity in machine learning by integrating them with neural networks, these\nintegrations are currently limited to linear person-specific parameters: ruling\nout nonlinear person-specific trends. In this paper, we propose Neural Mixed\nEffect (NME) models to optimize nonlinear person-specific parameters anywhere\nin a neural network in a scalable manner. NME combines the efficiency of neural\nnetwork optimization with nonlinear mixed effects modeling. Empirically, we\nobserve that NME improves performance across six unimodal and multimodal\ndatasets, including a smartphone dataset to predict daily mood and a\nmother-adolescent dataset to predict affective state sequences where half the\nmothers experience at least moderate symptoms of depression. Furthermore, we\nevaluate NME for two model architectures, including for neural conditional\nrandom fields (CRF) to predict affective state sequences where the CRF learns\nnonlinear person-specific temporal transitions between affective states.\nAnalysis of these person-specific transitions on the mother-adolescent dataset\nshows interpretable trends related to the mother's depression symptoms.\n","authors":["Torsten Wörtwein","Nicholas Allen","Lisa B. Sheeber","Randy P. Auerbach","Jeffrey F. Cohn","Louis-Philippe Morency"],"pdf_url":"https://arxiv.org/pdf/2306.08149v3.pdf","comment":"camera-ready version"},{"id":"http://arxiv.org/abs/2308.16835v1","updated":"2023-08-31T16:10:22Z","published":"2023-08-31T16:10:22Z","title":"FedDD: Toward Communication-efficient Federated Learning with\n Differential Parameter Dropout","summary":" Federated Learning (FL) requires frequent exchange of model parameters, which\nleads to long communication delay, especially when the network environments of\nclients vary greatly. Moreover, the parameter server needs to wait for the\nslowest client (i.e., straggler, which may have the largest model size, lowest\ncomputing capability or worst network condition) to upload parameters, which\nmay significantly degrade the communication efficiency. 
Commonly-used client\nselection methods such as partial client selection would lead to the waste of\ncomputing resources and weaken the generalization of the global model. To\ntackle this problem, along a different line, in this paper, we advocate the\napproach of model parameter dropout instead of client selection, and\naccordingly propose a novel Federated learning scheme with\nDifferential parameter Dropout (FedDD). FedDD consists of two key modules:\ndropout rate allocation and uploaded parameter selection, which will optimize\nthe model parameter uploading ratios tailored to different clients'\nheterogeneous conditions and also select the proper set of important model\nparameters for uploading subject to clients' dropout rate constraints.\nSpecifically, the dropout rate allocation is formulated as a convex\noptimization problem, taking system heterogeneity, data heterogeneity, and\nmodel heterogeneity among clients into consideration. The uploaded parameter\nselection strategy prioritizes eliciting important parameters for uploading\nto speed up convergence. Furthermore, we theoretically analyze the convergence\nof the proposed FedDD scheme. Extensive performance evaluations demonstrate\nthat the proposed FedDD scheme can achieve outstanding performance in both\ncommunication efficiency and model convergence, and also possesses a strong\ngeneralization capability to data of rare classes.\n","authors":["Zhiying Feng","Xu Chen","Qiong Wu","Wen Wu","Xiaoxi Zhang","Qianyi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.16835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11737v2","updated":"2023-08-31T15:57:37Z","published":"2023-06-14T18:27:39Z","title":"Neural ShDF: Reviving an Efficient and Consistent Mesh Segmentation\n Method","summary":" Partitioning a polygonal mesh into meaningful parts can be challenging. Many\napplications require decomposing such structures for further processing in\ncomputer graphics. In the last decade, several methods were proposed to tackle\nthis problem, at the cost of intensive computational times. Recently, machine\nlearning has proven to be effective for the segmentation task on 3D structures.\nNevertheless, these state-of-the-art methods are often hardly generalizable and\nrequire dividing the learned model into several specific classes of objects to\navoid overfitting. We present a data-driven approach leveraging deep learning\nto encode a mapping function prior to mesh segmentation for multiple\napplications. Our network reproduces a neighborhood map using our knowledge of\nthe \textsl{Shape Diameter Function} (SDF) method using similarities among\nvertex neighborhoods. Our approach is resolution-agnostic as we downsample the\ninput meshes and query the full-resolution structure solely for neighborhood\ncontributions. Using our predicted SDF values, we can inject the resulting\nstructure into a graph-cut algorithm to generate an efficient and robust mesh\nsegmentation while considerably reducing the required computation times.\n","authors":["Bruno Roy"],"pdf_url":"https://arxiv.org/pdf/2306.11737v2.pdf","comment":"9 pages, 13 figures, and 3 tables. 
Short paper and poster published\n and presented at SIGGRAPH 2023"},{"id":"http://arxiv.org/abs/2308.16822v1","updated":"2023-08-31T15:52:35Z","published":"2023-08-31T15:52:35Z","title":"Latent Variable Multi-output Gaussian Processes for Hierarchical\n Datasets","summary":" Multi-output Gaussian processes (MOGPs) have been introduced to deal with\nmultiple tasks by exploiting the correlations between different outputs.\nGenerally, MOGPs models assume a flat correlation structure between the\noutputs. However, such a formulation does not account for more elaborate\nrelationships, for instance, if several replicates were observed for each\noutput (which is a typical setting in biological experiments). This paper\nproposes an extension of MOGPs for hierarchical datasets (i.e. datasets for\nwhich the relationships between observations can be represented within a tree\nstructure). Our model defines a tailored kernel function accounting for\nhierarchical structures in the data to capture different levels of correlations\nwhile leveraging the introduction of latent variables to express the underlying\ndependencies between outputs through a dedicated kernel. This latter feature is\nexpected to significantly improve scalability as the number of tasks increases.\nAn extensive experimental study involving both synthetic and real-world data\nfrom genomics and motion capture is proposed to support our claims.\n","authors":["Chunchao Ma","Arthur Leroy","Mauricio Alvarez"],"pdf_url":"https://arxiv.org/pdf/2308.16822v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.16818v1","updated":"2023-08-31T15:49:21Z","published":"2023-08-31T15:49:21Z","title":"Irregular Traffic Time Series Forecasting Based on Asynchronous\n Spatio-Temporal Graph Convolutional Network","summary":" Accurate traffic forecasting at intersections governed by intelligent traffic\nsignals is critical for the advancement of an effective intelligent traffic\nsignal control system. However, due to the irregular traffic time series\nproduced by intelligent intersections, the traffic forecasting task becomes\nmuch more intractable and imposes three major new challenges: 1) asynchronous\nspatial dependency, 2) irregular temporal dependency among traffic data, and 3)\nvariable-length sequence to be predicted, which severely impede the performance\nof current traffic forecasting methods. To this end, we propose an Asynchronous\nSpatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the traffic\nstates of the lanes entering intelligent intersections in a future time window.\nSpecifically, by linking lanes via a traffic diffusion graph, we first propose\nan Asynchronous Graph Diffusion Network to model the asynchronous spatial\ndependency between the time-misaligned traffic state measurements of lanes.\nAfter that, to capture the temporal dependency within irregular traffic state\nsequence, a learnable personalized time encoding is devised to embed the\ncontinuous time for each lane. Then we propose a Transformable Time-aware\nConvolution Network that learns meta-filters to derive time-aware convolution\nfilters with transformable filter sizes for efficient temporal convolution on\nthe irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network\nconsisting of a state evolution unit and a semiautoregressive predictor is\ndesigned to effectively and efficiently predict variable-length traffic state\nsequences. 
Extensive experiments on two real-world datasets demonstrate the\neffectiveness of ASeer in six metrics.\n","authors":["Weijia Zhang","Le Zhang","Jindong Han","Hao Liu","Jingbo Zhou","Yu Mei","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16800v1","updated":"2023-08-31T15:22:31Z","published":"2023-08-31T15:22:31Z","title":"Rank Collapse Causes Over-Smoothing and Over-Correlation in Graph Neural\n Networks","summary":" Our study reveals new theoretical insights into over-smoothing and feature\nover-correlation in deep graph neural networks. We show the prevalence of\ninvariant subspaces, demonstrating a fixed relative behavior that is unaffected\nby feature transformations. Our work clarifies recent observations related to\nconvergence to a constant state and a potential over-separation of node states,\nas the amplification of subspaces only depends on the spectrum of the\naggregation function. In linear scenarios, this leads to node representations\nbeing dominated by a low-dimensional subspace with an asymptotic convergence\nrate independent of the feature transformations. This causes a rank collapse of\nthe node representations, resulting in over-smoothing when smooth vectors span\nthis subspace, and over-correlation even when over-smoothing is avoided. 
Guided\nby our theory, we propose a sum of Kronecker products as a beneficial property\nthat can provably prevent over-smoothing, over-correlation, and rank collapse.\nWe empirically extend our insights to the non-linear case, demonstrating the\ninability of existing models to capture linearly independent features.\n","authors":["Andreas Roth","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2308.16800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14388v2","updated":"2023-08-31T15:13:28Z","published":"2023-08-28T08:07:57Z","title":"Biclustering Methods via Sparse Penalty","summary":" In this paper, we first reviewed several biclustering methods that are used\nto identify the most significant clusters in gene expression data. Here we\nmainly focused on the SSVD (sparse SVD) method and tried a new sparse penalty\nnamed \"Prenet penalty\", which had previously been used only in factor analysis\nto induce sparsity. Then, in the simulation study, we tried different types of\ngenerated datasets (with different sparsity and dimension), first with a 1-layer\napproximation and then with k layers, which shows that the mixed Prenet penalty\nis very effective for non-overlapping data. Finally, we used some real gene\nexpression data to show the behavior of our methods.\n","authors":["Jiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14388v2.pdf","comment":"This research is still in progress and some issues still need to be fixed"},{"id":"http://arxiv.org/abs/2106.14052v2","updated":"2023-08-31T15:10:45Z","published":"2021-06-26T16:05:44Z","title":"Combining Inductive and Deductive Reasoning for Query Answering over\n Incomplete Knowledge Graphs","summary":" Current methods for embedding-based query answering over incomplete Knowledge\nGraphs (KGs) only focus on inductive reasoning, i.e., predicting answers by\nlearning patterns from the data, and lack the complementary ability to do\ndeductive reasoning, which requires the application of domain knowledge to\ninfer further information. To address this shortcoming, we investigate the\nproblem of incorporating ontologies into embedding-based query answering models\nby defining the task of embedding-based ontology-mediated query answering. We\npropose various integration strategies into prominent representatives of\nembedding models that involve (1) different ontology-driven data augmentation\ntechniques and (2) adaptation of the loss function to enforce the ontology\naxioms. We design novel benchmarks for the considered task based on the LUBM\nand the NELL KGs and evaluate our methods on them. The achieved improvements in\nthe setting that requires both inductive and deductive reasoning are from 20%\nto 55% in HITS@3.\n","authors":["Medina Andresel","Trung-Kien Tran","Csaba Domokos","Pasquale Minervini","Daria Stepanova"],"pdf_url":"https://arxiv.org/pdf/2106.14052v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16789v1","updated":"2023-08-31T15:04:28Z","published":"2023-08-31T15:04:28Z","title":"Joint Semantic-Native Communication and Inference via Minimal Simplicial\n Structures","summary":" In this work, we study the problem of semantic communication and inference,\nin which a student agent (i.e. mobile device) queries a teacher agent (i.e.\ncloud server) to generate higher-order data semantics living in a simplicial\ncomplex. Specifically, the teacher first maps its data into a k-order\nsimplicial complex and learns its high-order correlations. 
For effective\ncommunication and inference, the teacher seeks minimally sufficient and\ninvariant semantic structures prior to conveying information. These minimal\nsimplicial structures are found via judiciously removing simplices selected by\nthe Hodge Laplacians without compromising the inference query accuracy.\nSubsequently, the student locally runs its own set of queries based on a masked\nsimplicial convolutional autoencoder (SCAE) leveraging both local and remote\nteacher's knowledge. Numerical results corroborate the effectiveness of the\nproposed approach in terms of improving inference query accuracy under\ndifferent channel conditions and simplicial structures. Experiments on a\ncoauthorship dataset show that removing simplices by ranking the Laplacian\nvalues yields an 85% reduction in payload size without sacrificing accuracy.\nJoint semantic communication and inference by masked SCAE improves query\naccuracy by 25% compared to local student-based query and 15% compared to\nremote teacher-based query. Finally, incorporating channel semantics is shown\nto effectively improve inference accuracy, notably at low SNR values.\n","authors":["Qiyang Zhao","Hang Zou","Mehdi Bennis","Merouane Debbah","Ebtesam Almazrouei","Faouzi Bader"],"pdf_url":"https://arxiv.org/pdf/2308.16789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16781v1","updated":"2023-08-31T14:59:32Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification for Low-resource Medication\n Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. Medication\nrecommendation, as a sub-domain, aims to amalgamate longitudinal patient\nhistory with medical knowledge, assisting physicians in prescribing safer and\nmore accurate medication combinations. Existing methods overlook the inherent\nlong-tail distribution in medical data, lacking balanced representation between\nhead and tail data, which leads to sub-optimal model performance. To address\nthis challenge, we introduce StratMed, a model that incorporates an innovative\nrelevance stratification mechanism. It harmonizes discrepancies in data\nlong-tail distribution and strikes a balance between the safety and accuracy of\nmedication combinations. Specifically, we first construct a pre-training method\nusing deep learning networks to obtain entity representation. After that, we\ndesign a pyramid-like data stratification method to obtain more generalized\nentity relationships by reinforcing the features of unpopular entities. Based\non this relationship, we design two graph structures to express medication\nprecision and safety at the same level to obtain visit representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. Experiments on the\nMIMIC-III dataset demonstrate that our method outperforms current\nstate-of-the-art methods in four evaluation metrics (including safety and\naccuracy).\n","authors":["Xiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16775v1","updated":"2023-08-31T14:54:06Z","published":"2023-08-31T14:54:06Z","title":"Efficacy of Neural Prediction-Based NAS for Zero-Shot NAS Paradigm","summary":" In prediction-based Neural Architecture Search (NAS), performance indicators\nderived from graph convolutional networks have shown significant success. 
These\nindicators, achieved by representing feed-forward structures as component\ngraphs through one-hot encoding, face a limitation: their inability to evaluate\narchitecture performance across varying search spaces. In contrast, handcrafted\nperformance indicators (zero-shot NAS), which use the same architecture with\nrandom initialization, can generalize across multiple search spaces. Addressing\nthis limitation, we propose a novel approach for zero-shot NAS using deep\nlearning. Our method employs Fourier sum of sines encoding for convolutional\nkernels, enabling the construction of a computational feed-forward graph with a\nstructure similar to the architecture under evaluation. These encodings are\nlearnable and offer a comprehensive view of the architecture's topological\ninformation. An accompanying multi-layer perceptron (MLP) then ranks these\narchitectures based on their encodings. Experimental results show that our\napproach surpasses previous methods using graph convolutional networks in terms\nof correlation on the NAS-Bench-201 dataset and exhibits a higher convergence\nrate. Moreover, our extracted feature representation trained on each\nNAS-Benchmark is transferable to other NAS-Benchmarks, showing promising\ngeneralizability across multiple search spaces. The code is available at:\nhttps://github.com/minh1409/DFT-NPZS-NAS\n","authors":["Minh Le","Nhan Nguyen","Ngoc Hoang Luong"],"pdf_url":"https://arxiv.org/pdf/2308.16775v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.17670v2","updated":"2023-08-31T14:53:15Z","published":"2023-06-30T14:01:53Z","title":"Learning Delays in Spiking Neural Networks using Dilated Convolutions\n with Learnable Spacings","summary":" Spiking Neural Networks (SNNs) are a promising research direction for\nbuilding power-efficient information processing systems, especially for\ntemporal tasks such as speech recognition. In SNNs, delays refer to the time\nneeded for one spike to travel from one neuron to another. These delays matter\nbecause they influence the spike arrival times, and it is well-known that\nspiking neurons respond more strongly to coincident input spikes. More\nformally, it has been shown theoretically that plastic delays greatly increase\nthe expressivity in SNNs. Yet, efficient algorithms to learn these delays have\nbeen lacking. Here, we propose a new discrete-time algorithm that addresses\nthis issue in deep feedforward SNNs using backpropagation, in an offline\nmanner. To simulate delays between consecutive layers, we use 1D convolutions\nacross time. The kernels contain only a few non-zero weights - one per synapse\n- whose positions correspond to the delays. These positions are learned\ntogether with the weights using the recently proposed Dilated Convolution with\nLearnable Spacings (DCLS). We evaluated our method on three datasets: the\nSpiking Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its\nnon-spiking version Google Speech Commands v0.02 (GSC) benchmarks, which\nrequire detecting temporal patterns. We used feedforward SNNs with two or three\nhidden fully connected layers, and vanilla leaky integrate-and fire neurons. 
We\nshowed that fixed random delays help and that learning them helps even more.\nFurthermore, our method outperformed the state-of-the-art in the three datasets\nwithout using recurrent connections and with substantially fewer parameters.\nOur work demonstrates the potential of delay learning in developing accurate\nand precise models for temporal data processing. Our code is based on PyTorch /\nSpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays\n","authors":["Ilyass Hammouamri","Ismail Khalfaoui-Hassani","Timothée Masquelier"],"pdf_url":"https://arxiv.org/pdf/2306.17670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00049v3","updated":"2023-08-31T14:38:57Z","published":"2023-01-31T19:33:14Z","title":"Transformers Meet Directed Graphs","summary":" Transformers were originally proposed as a sequence-to-sequence model for\ntext but have become vital for a wide range of modalities, including images,\naudio, video, and undirected graphs. However, transformers for directed graphs\nare a surprisingly underexplored topic, despite their applicability to\nubiquitous domains, including source code and logic circuits. In this work, we\npropose two direction- and structure-aware positional encodings for directed\ngraphs: (1) the eigenvectors of the Magnetic Laplacian - a direction-aware\ngeneralization of the combinatorial Laplacian; (2) directional random walk\nencodings. Empirically, we show that the extra directionality information is\nuseful in various downstream tasks, including correctness testing of sorting\nnetworks and source code understanding. Together with a data-flow-centric graph\nconstruction, our model outperforms the prior state of the art on the Open\nGraph Benchmark Code2 relatively by 14.7%.\n","authors":["Simon Geisler","Yujia Li","Daniel Mankowitz","Ali Taylan Cemgil","Stephan Günnemann","Cosmin Paduraru"],"pdf_url":"https://arxiv.org/pdf/2302.00049v3.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2308.16759v1","updated":"2023-08-31T14:27:36Z","published":"2023-08-31T14:27:36Z","title":"Constructing Indoor Region-based Radio Map without Location Labels","summary":" Radio map construction requires a large amount of radio measurement data with\nlocation labels, which imposes a high deployment cost. This paper develops a\nregion-based radio map from received signal strength (RSS) measurements without\nlocation labels. The construction is based on a set of blindly collected RSS\nmeasurement data from a device that visits each region in an indoor area\nexactly once, where the footprints and timestamps are not recorded. The main\nchallenge is to cluster the RSS data and match clusters with the physical\nregions. Classical clustering algorithms fail to work as the RSS data naturally\nappears as non-clustered due to multipaths and noise. In this paper, a signal\nsubspace model with a sequential prior is constructed for the RSS data, and an\nintegrated segmentation and clustering algorithm is developed, which is shown\nto find the globally optimal solution in a special case. Furthermore, the\nclustered data is matched with the physical regions using a graph-based\napproach. 
Based on real measurements from an office space, the proposed scheme\nreduces the region localization error by roughly 50% compared to a weighted\ncentroid localization (WCL) baseline, and it even outperforms some supervised\nlocalization schemes, including k-nearest neighbor (KNN), support vector\nmachine (SVM), and deep neural network (DNN), which require labeled data for\ntraining.\n","authors":["Zheng Xing","Junting Chen"],"pdf_url":"https://arxiv.org/pdf/2308.16759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16754v1","updated":"2023-08-31T14:21:40Z","published":"2023-08-31T14:21:40Z","title":"Training Neural Networks Using Reproducing Kernel Space Interpolation\n and Model Reduction","summary":" We introduce and study the theory of training neural networks using\ninterpolation techniques from reproducing kernel Hilbert space theory. We\ngeneralize the method to Krein spaces, and show that widely-used neural network\narchitectures are subsets of reproducing kernel Krein spaces (RKKS). We study\nthe concept of \"associated Hilbert spaces\" of RKKS and develop techniques to\nimprove upon the expressivity of various activation functions. Next, using\nconcepts from the theory of functions of several complex variables, we prove a\ncomputationally applicable, multidimensional generalization of the celebrated\nAdamjan- Arov-Krein (AAK) theorem. The theorem yields a novel class of neural\nnetworks, called Prolongation Neural Networks (PNN). We demonstrate that, by\napplying the multidimensional AAK theorem to gain a PNN, one can gain\nperformance superior to both our interpolatory methods and current\nstate-of-the-art methods in noisy environments. We provide useful illustrations\nof our methods in practice.\n","authors":["Eric Arthur Werneburg"],"pdf_url":"https://arxiv.org/pdf/2308.16754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16752v1","updated":"2023-08-31T14:16:30Z","published":"2023-08-31T14:16:30Z","title":"Moreau Envelope ADMM for Decentralized Weakly Convex Optimization","summary":" This paper proposes a proximal variant of the alternating direction method of\nmultipliers (ADMM) for distributed optimization. Although the current versions\nof ADMM algorithm provide promising numerical results in producing solutions\nthat are close to optimal for many convex and non-convex optimization problems,\nit remains unclear if they can converge to a stationary point for weakly convex\nand locally non-smooth functions. Through our analysis using the Moreau\nenvelope function, we demonstrate that MADM can indeed converge to a stationary\npoint under mild conditions. Our analysis also includes computing the bounds on\nthe amount of change in the dual variable update step by relating the gradient\nof the Moreau envelope function to the proximal function. Furthermore, the\nresults of our numerical experiments indicate that our method is faster and\nmore robust than widely-used approaches.\n","authors":["Reza Mirzaeifard","Naveen K. D. Venkategowda","Alexander Jung","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2308.16752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16738v1","updated":"2023-08-31T13:54:57Z","published":"2023-08-31T13:54:57Z","title":"US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for\n Cervical Lymph Node Lesions Diagnoses in Ultrasound Images","summary":" Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph\nnode lesions. 
However, the diagnoses of these images largely hinge on the\nexpertise of medical practitioners, rendering the process susceptible to\nmisdiagnoses. Although rapidly developing deep learning has substantially\nimproved the diagnoses of diverse ultrasound images, there remains a\nconspicuous research gap concerning cervical lymph nodes. The objective of our\nwork is to accurately diagnose cervical lymph node lesions by leveraging a deep\nlearning model. To this end, we first collected 3392 images containing normal\nlymph nodes, benign lymph node lesions, malignant primary lymph node lesions,\nand malignant metastatic lymph node lesions. Given that ultrasound images are\ngenerated by the reflection and scattering of sound waves across varied bodily\ntissues, we proposed the Conv-FFT Block. It integrates convolutional operations\nwith the fast Fourier transform to more astutely model the images. Building\nupon this foundation, we designed a novel architecture, named US-SFNet. This\narchitecture not only discerns variances in ultrasound images from the spatial\ndomain but also adeptly captures microstructural alterations across various\nlesions in the frequency domain. To ascertain the potential of US-SFNet, we\nbenchmarked it against 12 popular architectures through five-fold\ncross-validation. The results show that US-SFNet is SOTA and can achieve 92.89%\naccuracy, 90.46% precision, 89.95% sensitivity and 97.49% specificity,\nrespectively.\n","authors":["Yubiao Yue","Jun Xue","Haihua Liang","Bingchun Luo","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.16738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16737v1","updated":"2023-08-31T13:54:37Z","published":"2023-08-31T13:54:37Z","title":"Robust Networked Federated Learning for Localization","summary":" This paper addresses the problem of localization, which is inherently\nnon-convex and non-smooth in a federated setting where the data is distributed\nacross a multitude of devices. Due to the decentralized nature of federated\nenvironments, distributed learning becomes essential for scalability and\nadaptability. Moreover, these environments are often plagued by outlier data,\nwhich presents substantial challenges to conventional methods, particularly in\nmaintaining estimation accuracy and ensuring algorithm convergence. To mitigate\nthese challenges, we propose a method that adopts an $L_1$-norm robust\nformulation within a distributed sub-gradient framework, explicitly designed to\nhandle these obstacles. Our approach addresses the problem in its original\nform, without resorting to iterative simplifications or approximations,\nresulting in enhanced computational efficiency and improved estimation\naccuracy. We demonstrate that our method converges to a stationary point,\nhighlighting its effectiveness and reliability. Through numerical simulations,\nwe confirm the superior performance of our approach, notably in outlier-rich\nenvironments, which surpasses existing state-of-the-art localization methods.\n","authors":["Reza Mirzaeifard","Naveen K. D. 
Venkategowda","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2308.16737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10283v2","updated":"2023-08-31T13:47:57Z","published":"2023-08-20T14:36:45Z","title":"Adaptive Uncertainty-Guided Model Selection for Data-Driven PDE\n Discovery","summary":" We propose a new parameter-adaptive uncertainty-penalized Bayesian\ninformation criterion (UBIC) to prioritize the parsimonious partial\ndifferential equation (PDE) that sufficiently governs noisy spatial-temporal\nobserved data with few reliable terms. Since the naive use of the BIC for model\nselection has been known to yield an undesirable overfitted PDE, the UBIC\npenalizes the found PDE not only by its complexity but also the quantified\nuncertainty, derived from the model supports' coefficient of variation in a\nprobabilistic view. We also introduce physics-informed neural network learning\nas a simulation-based approach to further validate the selected PDE flexibly\nagainst the other discovered PDE. Numerical results affirm the successful\napplication of the UBIC in identifying the true governing PDE. Additionally, we\nreveal an interesting effect of denoising the observed data on improving the\ntrade-off between the BIC score and model complexity. Code is available at\nhttps://github.com/Pongpisit-Thanasutives/UBIC.\n","authors":["Pongpisit Thanasutives","Takashi Morita","Masayuki Numao","Ken-ichi Fukui"],"pdf_url":"https://arxiv.org/pdf/2308.10283v2.pdf","comment":"17 pages, 15 figures"},{"id":"http://arxiv.org/abs/2308.16718v1","updated":"2023-08-31T13:37:28Z","published":"2023-08-31T13:37:28Z","title":"Robust Representation Learning for Unreliable Partial Label Learning","summary":" Partial Label Learning (PLL) is a type of weakly supervised learning where\neach training instance is assigned a set of candidate labels, but only one\nlabel is the ground-truth. However, this idealistic assumption may not always\nhold due to potential annotation inaccuracies, meaning the ground-truth may not\nbe present in the candidate label set. This is known as Unreliable Partial\nLabel Learning (UPLL) that introduces an additional complexity due to the\ninherent unreliability and ambiguity of partial labels, often resulting in a\nsub-optimal performance with existing methods. To address this challenge, we\npropose the Unreliability-Robust Representation Learning framework (URRL) that\nleverages unreliability-robust contrastive learning to help the model fortify\nagainst unreliable partial labels effectively. Concurrently, we propose a dual\nstrategy that combines KNN-based candidate label set correction and\nconsistency-regularization-based label disambiguation to refine label quality\nand enhance the ability of representation learning within the URRL framework.\nExtensive experiments demonstrate that the proposed method outperforms\nstate-of-the-art PLL methods on various datasets with diverse degrees of\nunreliability and ambiguity. Furthermore, we provide a theoretical analysis of\nour approach from the perspective of the expectation maximization (EM)\nalgorithm. 
Upon acceptance, we pledge to make the code publicly accessible.\n","authors":["Yu Shi","Dong-Dong Wu","Xin Geng","Min-Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00241v2","updated":"2023-08-31T13:36:21Z","published":"2023-04-29T11:46:53Z","title":"When Deep Learning Meets Polyhedral Theory: A Survey","summary":" In the past decade, deep learning became the prevalent methodology for\npredictive modeling thanks to the remarkable accuracy of deep neural networks\nin tasks such as computer vision and natural language processing. Meanwhile,\nthe structure of neural networks converged back to simpler representations\nbased on piecewise constant and piecewise linear functions such as the\nRectified Linear Unit (ReLU), which became the most commonly used type of\nactivation function in neural networks. That made certain types of network\nstructure $\\unicode{x2014}$such as the typical fully-connected feedforward\nneural network$\\unicode{x2014}$ amenable to analysis through polyhedral theory\nand to the application of methodologies such as Linear Programming (LP) and\nMixed-Integer Linear Programming (MILP) for a variety of purposes. In this\npaper, we survey the main topics emerging from this fast-paced area of work,\nwhich bring a fresh perspective to understanding neural networks in more detail\nas well as to applying linear optimization techniques to train, verify, and\nreduce the size of such networks.\n","authors":["Joey Huchette","Gonzalo Muñoz","Thiago Serra","Calvin Tsay"],"pdf_url":"https://arxiv.org/pdf/2305.00241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14505v2","updated":"2023-08-31T13:10:04Z","published":"2023-04-03T11:45:27Z","title":"Transformer-based interpretable multi-modal data fusion for skin lesion\n classification","summary":" A lot of deep learning (DL) research these days is mainly focused on\nimproving quantitative metrics regardless of other factors. In human-centered\napplications, like skin lesion classification in dermatology, DL-driven\nclinical decision support systems are still in their infancy due to the limited\ntransparency of their decision-making process. Moreover, the lack of procedures\nthat can explain the behavior of trained DL algorithms leads to almost no trust\nfrom clinical physicians. To diagnose skin lesions, dermatologists rely on\nvisual assessment of the disease and the data gathered from the patient's\nanamnesis. Data-driven algorithms dealing with multi-modal data are limited by\nthe separation of feature-level and decision-level fusion procedures required\nby convolutional architectures. To address this issue, we enable single-stage\nmulti-modal data fusion via the attention mechanism of transformer-based\narchitectures to aid in diagnosing skin diseases. Our method beats other\nstate-of-the-art single- and multi-modal DL architectures in image-rich and\npatient-data-rich environments. 
Additionally, the choice of the architecture\nenables native interpretability support for the classification task both in the\nimage and metadata domain with no additional modifications necessary.\n","authors":["Theodor Cheslerean-Boghiu","Melia-Evelina Fleischmann","Theresa Willem","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2304.14505v2.pdf","comment":"Submitted to IEEE JBHI in July 2023"},{"id":"http://arxiv.org/abs/2212.14424v2","updated":"2023-08-31T13:06:03Z","published":"2022-12-29T18:55:00Z","title":"Invertible normalizing flow neural networks by JKO scheme","summary":" Normalizing flow is a class of deep generative models for efficient sampling\nand density estimation. In practice, the flow often appears as a chain of\ninvertible neural network blocks; to facilitate training, existing works have\nregularized flow trajectories and designed special network architectures. The\ncurrent paper develops a neural ODE flow network inspired by the\nJordan-Kinderleherer-Otto (JKO) scheme, which allows efficient block-wise\ntraining of the residual blocks without sampling SDE trajectories or inner\nloops of score matching or variational learning. As the JKO scheme unfolds the\ndynamic of gradient flow, the proposed model naturally stacks residual network\nblocks one by one, reducing the memory load and difficulty in performing\nend-to-end deep flow network training. We also develop adaptive time\nreparameterization of the flow network with a progressive refinement of the\ntrajectory in probability space, which improves the model training efficiency\nand accuracy in practice. Using numerical experiments with synthetic and real\ndata, we show that the proposed JKO-iFlow model achieves similar or better\nperformance in generating new samples compared with the existing flow and\ndiffusion models at a significantly reduced computational and memory cost.\n","authors":["Chen Xu","Xiuyuan Cheng","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2212.14424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02329v3","updated":"2023-08-31T12:44:58Z","published":"2023-07-05T14:39:47Z","title":"Data-driven Predictive Latency for 5G: A Theoretical and Experimental\n Analysis Using Network Measurements","summary":" The advent of novel 5G services and applications with binding latency\nrequirements and guaranteed Quality of Service (QoS) hastened the need to\nincorporate autonomous and proactive decision-making in network management\nprocedures. The objective of our study is to provide a thorough analysis of\npredictive latency within 5G networks by utilizing real-world network data that\nis accessible to mobile network operators (MNOs). In particular, (i) we present\nan analytical formulation of the user-plane latency as a Hypoexponential\ndistribution, which is validated by means of a comparative analysis with\nempirical measurements, and (ii) we conduct experimental results of\nprobabilistic regression, anomaly detection, and predictive forecasting\nleveraging on emerging domains in Machine Learning (ML), such as Bayesian\nLearning (BL) and Machine Learning on Graphs (GML). We test our predictive\nframework using data gathered from scenarios of vehicular mobility, dense-urban\ntraffic, and social gathering events. 
Our results provide valuable insights\ninto the efficacy of predictive algorithms in practical applications.\n","authors":["Marco Skocaj","Francesca Conserva","Nicol Sarcone Grande","Andrea Orsi","Davide Micheli","Giorgio Ghinamo","Simone Bizzarri","Roberto Verdone"],"pdf_url":"https://arxiv.org/pdf/2307.02329v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11463v2","updated":"2023-08-31T12:42:43Z","published":"2023-05-19T06:33:57Z","title":"Generative Sliced MMD Flows with Riesz Kernels","summary":" Maximum mean discrepancy (MMD) flows suffer from high computational costs in\nlarge scale computations. In this paper, we show that MMD flows with Riesz\nkernels $K(x,y) = - \\Vert x-y\\Vert^r$, $r \\in (0,2)$ have exceptional\nproperties which allow their efficient computation. We prove that the MMD of\nRiesz kernels coincides with the MMD of their sliced version. As a consequence,\nthe computation of gradients of MMDs can be performed in the one-dimensional\nsetting. Here, for $r=1$, a simple sorting algorithm can be applied to reduce\nthe complexity from $O(MN+N^2)$ to $O((M+N)\\log(M+N))$ for two measures with\n$M$ and $N$ support points. As another interesting follow-up result, the MMD of\ncompactly supported measures can be estimated from above and below by the\nWasserstein-1 distance. For the implementations we approximate the gradient of\nthe sliced MMD by using only a finite number $P$ of slices. We show that the\nresulting error has complexity $O(\\sqrt{d/P})$, where $d$ is the data\ndimension. These results enable us to train generative models by approximating\nMMD gradient flows by neural networks even for image applications. We\ndemonstrate the efficiency of our model by image generation on MNIST,\nFashionMNIST and CIFAR10.\n","authors":["Johannes Hertrich","Christian Wald","Fabian Altekrüger","Paul Hagemann"],"pdf_url":"https://arxiv.org/pdf/2305.11463v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.08060v2","updated":"2023-08-31T12:41:13Z","published":"2021-12-15T11:55:11Z","title":"Leveraging Image-based Generative Adversarial Networks for Time Series\n Generation","summary":" Generative models for images have gained significant attention in computer\nvision and natural language processing due to their ability to generate\nrealistic samples from complex data distributions. To leverage the advances of\nimage-based generative models for the time series domain, we propose a\ntwo-dimensional image representation for time series, the Extended\nIntertemporal Return Plot (XIRP). Our approach captures the intertemporal time\nseries dynamics in a scale-invariant and invertible way, reducing training time\nand improving sample quality. We benchmark synthetic XIRPs obtained by an\noff-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image\nrepresentations and models regarding similarity and predictive ability metrics.\nOur novel, validated image representation for time series consistently and\nsignificantly outperforms a state-of-the-art RNN-based generative model\nregarding predictive ability. 
Further, we introduce an improved stochastic\ninversion to substantially improve simulation quality regardless of the\nrepresentation and provide the prospect of transfer potentials in other\ndomains.\n","authors":["Justin Hellermann","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2112.08060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16684v1","updated":"2023-08-31T12:38:29Z","published":"2023-08-31T12:38:29Z","title":"Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor\n Attack","summary":" The vulnerabilities to backdoor attacks have recently threatened the\ntrustworthiness of machine learning models in practical applications.\nConventional wisdom suggests that not everyone can be an attacker since the\nprocess of designing the trigger generation algorithm often involves\nsignificant effort and extensive experimentation to ensure the attack's\nstealthiness and effectiveness. Alternatively, this paper shows that there\nexists a more severe backdoor threat: anyone can exploit an easily-accessible\nalgorithm for silent backdoor attacks. Specifically, this attacker can employ\nthe widely-used lossy image compression from a plethora of compression tools to\neffortlessly inject a trigger pattern into an image without leaving any\nnoticeable trace; i.e., the generated triggers are natural artifacts. One does\nnot require extensive knowledge to click on the \"convert\" or \"save as\" button\nwhile using tools for lossy image compression. Via this attack, the adversary\ndoes not need to design a trigger generator as seen in prior works and only\nrequires poisoning the data. Empirically, the proposed attack consistently\nachieves 100% attack success rate in several benchmark datasets such as MNIST,\nCIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still\nachieve almost 100% attack success rate with very small (approximately 10%)\npoisoning rates in the clean label setting. The generated trigger of the\nproposed attack using one lossy compression algorithm is also transferable\nacross other related compression algorithms, exacerbating the severity of this\nbackdoor threat. This work takes another crucial step toward understanding the\nextensive risks of backdoor attacks in practice, urging practitioners to\ninvestigate similar attacks and relevant backdoor mitigation methods.\n","authors":["Sze Jue Yang","Quang Nguyen","Chee Seng Chan","Khoa Doan"],"pdf_url":"https://arxiv.org/pdf/2308.16684v1.pdf","comment":"14 pages. This paper shows everyone can mount a powerful and stealthy\n backdoor attack with the widely-used lossy image compression"},{"id":"http://arxiv.org/abs/2308.16681v1","updated":"2023-08-31T12:32:43Z","published":"2023-08-31T12:32:43Z","title":"Everything, Everywhere All in One Evaluation: Using Multiverse Analysis\n to Evaluate the Influence of Model Design Decisions on Algorithmic Fairness","summary":" A vast number of systems across the world use algorithmic decision making\n(ADM) to (partially) automate decisions that have previously been made by\nhumans. When designed well, these systems promise more objective decisions\nwhile saving large amounts of resources and freeing up human time. However,\nwhen ADM systems are not designed well, they can lead to unfair decisions which\ndiscriminate against societal groups. The downstream effects of ADMs critically\ndepend on the decisions made during the systems' design and implementation, as\nbiases in data can be mitigated or reinforced along the modeling pipeline. 
Many\nof these design decisions are made implicitly, without knowing exactly how they\nwill influence the final system. It is therefore important to make explicit the\ndecisions made during the design of ADM systems and understand how these\ndecisions affect the fairness of the resulting system.\n To study this issue, we draw on insights from the field of psychology and\nintroduce the method of multiverse analysis for algorithmic fairness. In our\nproposed method, we turn implicit design decisions into explicit ones and\ndemonstrate their fairness implications. By combining decisions, we create a\ngrid of all possible \"universes\" of decision combinations. For each of these\nuniverses, we compute metrics of fairness and performance. Using the resulting\ndataset, one can see how and which decisions impact fairness. We demonstrate\nhow multiverse analyses can be used to better understand variability and\nrobustness of algorithmic fairness using an exemplary case study of predicting\npublic health coverage of vulnerable populations for potential interventions.\nOur results illustrate how decisions during the design of a machine learning\nsystem can have surprising effects on its fairness and how to detect these\neffects using multiverse analysis.\n","authors":["Jan Simson","Florian Pfisterer","Christoph Kern"],"pdf_url":"https://arxiv.org/pdf/2308.16681v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16680v1","updated":"2023-08-31T12:32:34Z","published":"2023-08-31T12:32:34Z","title":"Branches of a Tree: Taking Derivatives of Programs with Discrete and\n Branching Randomness in High Energy Physics","summary":" We propose to apply several gradient estimation techniques to enable the\ndifferentiation of programs with discrete randomness in High Energy Physics.\nSuch programs are common in High Energy Physics due to the presence of\nbranching processes and clustering-based analysis. Thus differentiating such\nprograms can open the way for gradient based optimization in the context of\ndetector design optimization, simulator tuning, or data analysis and\nreconstruction optimization. We discuss several possible gradient estimation\nstrategies, including the recent Stochastic AD method, and compare them in\nsimplified detector design experiments. In doing so we develop, to the best of\nour knowledge, the first fully differentiable branching program.\n","authors":["Michael Kagan","Lukas Heinrich"],"pdf_url":"https://arxiv.org/pdf/2308.16680v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.16678v1","updated":"2023-08-31T12:29:24Z","published":"2023-08-31T12:29:24Z","title":"Dynamic nsNet2: Efficient Deep Noise Suppression with Early Exiting","summary":" Although deep learning has made strides in the field of deep noise\nsuppression, leveraging deep architectures on resource-constrained devices\nstill proved challenging. Therefore, we present an early-exiting model based on\nnsNet2 that provides several levels of accuracy and resource savings by halting\ncomputations at different stages. 
Moreover, we adapt the original architecture\nby splitting the information flow to take into account the injected dynamism.\nWe show the trade-offs between performance and computational complexity based\non established metrics.\n","authors":["Riccardo Miccini","Alaa Zniber","Clément Laroche","Tobias Piechowiak","Martin Schoeberl","Luca Pezzarossa","Ouassim Karrakchou","Jens Sparsø","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2308.16678v1.pdf","comment":"Accepted at the MLSP 2023"},{"id":"http://arxiv.org/abs/2308.16671v1","updated":"2023-08-31T12:22:40Z","published":"2023-08-31T12:22:40Z","title":"Communication-Efficient Decentralized Federated Learning via One-Bit\n Compressive Sensing","summary":" Decentralized federated learning (DFL) has gained popularity due to its\npracticality across various applications. Compared to the centralized version,\ntraining a shared model among a large number of nodes in DFL is more\nchallenging, as there is no central server to coordinate the training process.\nEspecially when distributed nodes suffer from limitations in communication or\ncomputational resources, DFL will experience extremely inefficient and unstable\ntraining. Motivated by these challenges, in this paper, we develop a novel\nalgorithm based on the framework of the inexact alternating direction method\n(iADM). On one hand, our goal is to train a shared model with a sparsity\nconstraint. This constraint enables us to leverage one-bit compressive sensing\n(1BCS), allowing transmission of one-bit information among neighbour nodes. On\nthe other hand, communication between neighbour nodes occurs only at certain\nsteps, reducing the number of communication rounds. Therefore, the algorithm\nexhibits notable communication efficiency. Additionally, as each node selects\nonly a subset of neighbours to participate in the training, the algorithm is\nrobust against stragglers. Moreover, complex items are computed only once\nfor several consecutive steps and subproblems are solved inexactly using\nclosed-form solutions, resulting in high computational efficiency. Finally,\nnumerical experiments showcase the algorithm's effectiveness in both\ncommunication and computation.\n","authors":["Shenglong Zhou","Kaidi Xu","Geoffrey Ye Li"],"pdf_url":"https://arxiv.org/pdf/2308.16671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.09379v2","updated":"2023-08-31T12:22:15Z","published":"2022-06-19T11:12:30Z","title":"0/1 Deep Neural Networks via Block Coordinate Descent","summary":" The step function is one of the simplest and most natural activation\nfunctions for deep neural networks (DNNs). As it counts 1 for positive\nvariables and 0 for others, its intrinsic characteristics (e.g., discontinuity\nand no viable information of subgradients) have impeded its development for several\ndecades. Even though there is an impressive body of work on designing DNNs with\ncontinuous activation functions that can be deemed as surrogates of the step\nfunction, the step function still possesses some advantageous properties, such\nas complete robustness to outliers and being capable of attaining the best\nlearning-theoretic guarantee of predictive accuracy. Hence, in this paper, we\naim to train DNNs with the step function used as an activation function (dubbed\nas 0/1 DNNs). We first reformulate 0/1 DNNs as an unconstrained optimization\nproblem and then solve it by a block coordinate descent (BCD) method. Moreover,\nwe acquire closed-form solutions for sub-problems of BCD as well as its\nconvergence properties. 
Furthermore, we integrate\n$\\ell_{2,0}$-regularization into 0/1 DNN to accelerate the training process and\ncompress the network scale. As a result, the proposed algorithm achieves a\ndesirable performance on classifying the MNIST, FashionMNIST, Cifar10, and\nCifar100 datasets.\n","authors":["Hui Zhang","Shenglong Zhou","Geoffrey Ye Li","Naihua Xiu"],"pdf_url":"https://arxiv.org/pdf/2206.09379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16664v1","updated":"2023-08-31T12:12:56Z","published":"2023-08-31T12:12:56Z","title":"What can we learn from quantum convolutional neural networks?","summary":" We can learn from analyzing quantum convolutional neural networks (QCNNs)\nthat: 1) working with quantum data can be perceived as embedding physical\nsystem parameters through a hidden feature map; 2) their high performance for\nquantum phase recognition can be attributed to generation of a very suitable\nbasis set during the ground state embedding, where quantum criticality of spin\nmodels leads to basis functions with rapidly changing features; 3) pooling\nlayers of QCNNs are responsible for picking those basis functions that can\ncontribute to forming a high-performing decision boundary, and the learning\nprocess corresponds to adapting the measurement such that few-qubit operators\nare mapped to full-register observables; 4) generalization of QCNN models\nstrongly depends on the embedding type, and that rotation-based feature maps\nwith the Fourier basis require careful feature engineering; 5) accuracy and\ngeneralization of QCNNs with readout based on a limited number of shots favor\nthe ground state embeddings and associated physics-informed models. We\ndemonstrate these points in simulation, where our results shed light on\nclassification for physical processes, relevant for applications in sensing.\nFinally, we show that QCNNs with properly chosen ground state embeddings can be\nused for fluid dynamics problems, expressing shock wave solutions with good\ngeneralization and proven trainability.\n","authors":["Chukwudubem Umeano","Annie E. Paine","Vincent E. Elfving","Oleksandr Kyriienko"],"pdf_url":"https://arxiv.org/pdf/2308.16664v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.16659v1","updated":"2023-08-31T11:58:13Z","published":"2023-08-31T11:58:13Z","title":"Autoencoder-based Online Data Quality Monitoring for the CMS\n Electromagnetic Calorimeter","summary":" The online Data Quality Monitoring system (DQM) of the CMS electromagnetic\ncalorimeter (ECAL) is a crucial operational tool that allows ECAL experts to\nquickly identify, localize, and diagnose a broad range of detector issues that\nwould otherwise hinder physics-quality data taking. Although the existing ECAL\nDQM system has been continuously updated to respond to new problems, it remains\none step behind newer and unforeseen issues. Using unsupervised deep learning,\na real-time autoencoder-based anomaly detection system is developed that is\nable to detect ECAL anomalies unseen in past data. After accounting for spatial\nvariations in the response of the ECAL and the temporal evolution of anomalies,\nthe new system is able to efficiently detect anomalies while maintaining an\nestimated false discovery rate between $10^{-2}$ and $10^{-4}$, beating existing\nbenchmarks by about two orders of magnitude. 
The real-world performance of the\nsystem is validated using anomalies found in 2018 and 2022 LHC collision data.\nAdditionally, first results from deploying the autoencoder-based system in the\nCMS online DQM workflow for the ECAL barrel during Run 3 of the LHC are\npresented, showing its promising performance in detecting obscure issues that\ncould have been missed in the existing DQM system.\n","authors":["Abhirami Harilal","Kyungmin Park","Michael Andrews","Manfred Paulini"],"pdf_url":"https://arxiv.org/pdf/2308.16659v1.pdf","comment":"Submitted to the Proceedings of 21st International Workshop on\n Advanced Computing and Analysis Techniques in Physics Research ACAT 2022\n conference"},{"id":"http://arxiv.org/abs/2210.09134v2","updated":"2023-08-31T11:49:17Z","published":"2022-10-17T14:34:42Z","title":"Principled Pruning of Bayesian Neural Networks through Variational Free\n Energy Minimization","summary":" Bayesian model reduction provides an efficient approach for comparing the\nperformance of all nested sub-models of a model, without re-evaluating any of\nthese sub-models. Until now, Bayesian model reduction has been applied mainly\nin the computational neuroscience community on simple models. In this paper, we\nformulate and apply Bayesian model reduction to perform principled pruning of\nBayesian neural networks, based on variational free energy minimization. Direct\napplication of Bayesian model reduction, however, gives rise to approximation\nerrors. Therefore, a novel iterative pruning algorithm is presented to\nalleviate the problems arising with naive Bayesian model reduction, as\nsupported experimentally on the publicly available UCI datasets for different\ninference algorithms. This novel parameter pruning scheme solves the\nshortcomings of current state-of-the-art pruning methods that are used by the\nsignal processing community. The proposed approach has a clear stopping\ncriterion and minimizes the same objective that is used during training. Next\nto these benefits, our experiments indicate better model performance in\ncomparison to state-of-the-art pruning schemes.\n","authors":["Jim Beckers","Bart van Erp","Ziyue Zhao","Kirill Kondrashov","Bert de Vries"],"pdf_url":"https://arxiv.org/pdf/2210.09134v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16648v1","updated":"2023-08-31T11:44:40Z","published":"2023-08-31T11:44:40Z","title":"Generate Your Own Scotland: Satellite Image Generation Conditioned on\n Maps","summary":" Despite recent advancements in image generation, diffusion models still\nremain largely underexplored in Earth Observation. In this paper we show that\nstate-of-the-art pretrained diffusion models can be conditioned on cartographic\ndata to generate realistic satellite images. We provide two large datasets of\npaired OpenStreetMap images and satellite views over the region of Mainland\nScotland and the Central Belt. We train a ControlNet model and qualitatively\nevaluate the results, demonstrating that both image quality and map fidelity\nare possible. Finally, we provide some insights on the opportunities and\nchallenges of applying these models for remote sensing. Our model weights and\ncode for creating the dataset are publicly available at\nhttps://github.com/miquel-espinosa/map-sat.\n","authors":["Miguel Espinosa","Elliot J. Crowley"],"pdf_url":"https://arxiv.org/pdf/2308.16648v1.pdf","comment":"13 pages, 6 figures. 
preprint"},{"id":"http://arxiv.org/abs/2306.05727v2","updated":"2023-08-31T10:54:50Z","published":"2023-06-09T07:48:36Z","title":"The Role of Diverse Replay for Generalisation in Reinforcement Learning","summary":" In reinforcement learning (RL), key components of many algorithms are the\nexploration strategy and replay buffer. These strategies regulate what\nenvironment data is collected and trained on and have been extensively studied\nin the RL literature. In this paper, we investigate the impact of these\ncomponents in the context of generalisation in multi-task RL. We investigate\nthe hypothesis that collecting and training on more diverse data from the\ntraining environments will improve zero-shot generalisation to new tasks. We\nmotivate mathematically and show empirically that generalisation to tasks that\nare \"reachable'' during training is improved by increasing the diversity of\ntransitions in the replay buffer. Furthermore, we show empirically that this\nsame strategy also shows improvement for generalisation to similar but\n\"unreachable'' tasks which could be due to improved generalisation of the\nlearned latent representations.\n","authors":["Max Weltevrede","Matthijs T. J. Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2306.05727v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2306.05109v2","updated":"2023-08-31T10:13:12Z","published":"2023-06-08T11:16:20Z","title":"Yet Another ICU Benchmark: A Flexible Multi-Center Framework for\n Clinical ML","summary":" Medical applications of machine learning (ML) have experienced a surge in\npopularity in recent years. The intensive care unit (ICU) is a natural habitat\nfor ML given the abundance of available data from electronic health records.\nModels have been proposed to address numerous ICU prediction tasks like the\nearly detection of complications. While authors frequently report\nstate-of-the-art performance, it is challenging to verify claims of\nsuperiority. Datasets and code are not always published, and cohort\ndefinitions, preprocessing pipelines, and training setups are difficult to\nreproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular\nframework that allows researchers to define reproducible and comparable\nclinical ML experiments; we offer an end-to-end solution from cohort definition\nto model evaluation. The framework natively supports most open-access ICU\ndatasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future\nICU datasets. Combined with a transparent preprocessing pipeline and extensible\ntraining code for multiple ML and deep learning models, YAIB enables unified\nmodel development. Our benchmark comes with five predefined established\nprediction tasks (mortality, acute kidney injury, sepsis, kidney function, and\nlength of stay) developed in collaboration with clinicians. Adding further\ntasks is straightforward by design. Using YAIB, we demonstrate that the choice\nof dataset, cohort definition, and preprocessing have a major impact on the\nprediction performance - often more so than model class - indicating an urgent\nneed for YAIB as a holistic benchmarking tool. We provide our work to the\nclinical ML community to accelerate method development and enable real-world\nclinical implementations. 
Software Repository:\nhttps://github.com/rvandewater/YAIB.\n","authors":["Robin van de Water","Hendrik Schmidt","Paul Elbers","Patrick Thoral","Bert Arnrich","Patrick Rockenschaub"],"pdf_url":"https://arxiv.org/pdf/2306.05109v2.pdf","comment":"Main benchmark: https://github.com/rvandewater/YAIB, Cohort\n generation: https://github.com/rvandewater/YAIB-cohorts, Models:\n https://github.com/rvandewater/YAIB-models"},{"id":"http://arxiv.org/abs/2308.16609v1","updated":"2023-08-31T10:12:32Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, aiming at learning graph-level representations for\neffective class assignments, has achieved outstanding results, but it heavily\nrelies on high-quality datasets with balanced class distributions.\nIn fact, most real-world graph data naturally presents a long-tailed form,\nwhere the head classes occupy many more samples than the tail classes; it is\nthus essential to study graph-level classification over long-tailed data, which\nnevertheless remains largely unexplored. However, most existing long-tailed\nlearning methods in vision fail to jointly optimize the representation\nlearning and classifier training, and neglect the mining of the\nhard-to-classify classes. Directly applying existing methods to graphs may lead\nto sub-optimal performance, since the model trained on graphs would be more\nsensitive to the long-tailed distribution due to the complex topological\ncharacteristics. Hence, in this paper, we propose a novel long-tailed\ngraph-level classification framework via Collaborative Multi-expert Learning\n(CoMe) to tackle the problem. To equilibrate the contributions of head and tail\nclasses, we first develop balanced contrastive learning from the view of\nrepresentation learning, and then design an individual-expert classifier\ntraining based on hard class mining. In addition, we execute gated fusion and\ndisentangled knowledge distillation among the multiple experts to promote\ncollaboration in the multi-expert framework. Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v1.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2308.16599v1","updated":"2023-08-31T09:57:52Z","published":"2023-08-31T09:57:52Z","title":"A Causal Discovery Approach To Learn How Urban Form Shapes Sustainable\n Mobility Across Continents","summary":" Global sustainability requires low-carbon urban transport systems, shaped by\nadequate infrastructure, deployment of low-carbon transport modes and shifts in\ntravel behavior. To adequately implement alterations in infrastructure, it is\nessential to grasp the location-specific cause-and-effect mechanisms that the\nconstructed environment has on travel. Yet, current research falls short in\nrepresenting causal relationships between the 6D urban form variables and\ntravel, generalizing across different regions, and modeling urban form effects\nat high spatial resolution. 
Here, we address all three gaps by utilizing a\ncausal discovery and an explainable machine learning framework to detect urban\nform effects on intra-city travel based on high-resolution mobility data of six\ncities across three continents. We show that both distance to city center,\ndemographics and density indirectly affect other urban form features. By\nconsidering the causal relationships, we find that location-specific influences\nalign across cities, yet vary in magnitude. In addition, the spread of the city\nand the coverage of jobs across the city are the strongest determinants of\ntravel-related emissions, highlighting the benefits of compact development and\nassociated benefits. Differences in urban form effects across the cities call\nfor a more holistic definition of 6D measures. Our work is a starting point for\nlocation-specific analysis of urban form effects on mobility behavior using\ncausal discovery approaches, which is highly relevant for city planners and\nmunicipalities across continents.\n","authors":["Felix Wagner","Florian Nachtigall","Lukas Franken","Nikola Milojevic-Dupont","Rafael H. M. Pereira","Nicolas Koch","Jakob Runge","Marta Gonzalez","Felix Creutzig"],"pdf_url":"https://arxiv.org/pdf/2308.16599v1.pdf","comment":"22 pages, 13 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.16598v1","updated":"2023-08-31T09:57:27Z","published":"2023-08-31T09:57:27Z","title":"Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation","summary":" Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential\nrole in the early diagnosis and treatment of liver cancer. Deep learning models\nbackboned by fully convolutional neural networks (FCNNs) have become the\ndominant model for segmenting 3D computerized tomography (CT) scans. However,\nsince their convolution layers suffer from limited kernel size, they are not\nable to capture long-range dependencies and global context. To tackle this\nrestriction, vision transformers have been introduced to solve FCNN's locality\nof receptive fields. Although transformers can capture long-range features,\ntheir segmentation performance decreases with various tumor sizes due to the\nmodel sensitivity to the input patch size. While finding an optimal patch size\nimproves the performance of vision transformer-based models on segmentation\ntasks, it is a time-consuming and challenging procedure. This paper proposes a\ntechnique to select the vision transformer's optimal input multi-resolution\nimage patch size based on the average volume size of metastasis lesions. We\nfurther validated our suggested framework using a transfer-learning technique,\ndemonstrating that the highest Dice similarity coefficient (DSC) performance\nwas obtained by pre-training on training data with a larger tumour volume using\nthe suggested ideal patch size and then training with a smaller one. We\nexperimentally evaluate this idea through pre-training our model on a\nmulti-resolution public dataset. Our model showed consistent and improved\nresults when applied to our private multi-resolution mCRC dataset with a\nsmaller average tumor volume. This study lays the groundwork for optimizing\nsemantic segmentation of small objects using vision transformers. The\nimplementation source code is available\nat:https://github.com/Ramtin-Mojtahedi/OVTPS.\n","authors":["Ramtin Mojtahedi","Mohammad Hamghalam","Richard K. G. Do","Amber L. 
Simpson"],"pdf_url":"https://arxiv.org/pdf/2308.16598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16593v1","updated":"2023-08-31T09:50:33Z","published":"2023-08-31T09:50:33Z","title":"Towards Spontaneous Style Modeling with Semi-supervised Pre-training for\n Conversational Text-to-Speech Synthesis","summary":" The spontaneous behavior that often occurs in conversations makes speech more\nhuman-like compared to reading-style. However, synthesizing spontaneous-style\nspeech is challenging due to the lack of high-quality spontaneous datasets and\nthe high cost of labeling spontaneous behavior. In this paper, we propose a\nsemi-supervised pre-training method to increase the amount of spontaneous-style\nspeech and spontaneous behavioral labels. In the process of semi-supervised\nlearning, both text and speech information are considered for detecting\nspontaneous behaviors labels in speech. Moreover, a linguistic-aware encoder is\nused to model the relationship between each sentence in the conversation.\nExperimental results indicate that our proposed method achieves superior\nexpressive speech synthesis performance with the ability to model spontaneous\nbehavior in spontaneous-style speech and predict reasonable spontaneous\nbehavior from text.\n","authors":["Weiqin Li","Shun Lei","Qiaochu Huang","Yixuan Zhou","Zhiyong Wu","Shiyin Kang","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2308.16593v1.pdf","comment":"Accepted by INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2305.08396v4","updated":"2023-08-31T09:43:37Z","published":"2023-05-15T07:23:54Z","title":"MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation","summary":" In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision\ntransformer (CNN-Transformer) for medical image segmentation. The proposed\nHybrid Decoder, based on MaxViT-block, is designed to harness the power of both\nthe convolution and self-attention mechanisms at each decoding stage with a\nnominal memory and computational burden. The inclusion of multi-axis\nself-attention, within each decoder stage, significantly enhances the\ndiscriminating capacity between the object and background regions, thereby\nhelping in improving the segmentation efficiency. In the Hybrid Decoder block,\nthe fusion process commences by integrating the upsampled lower-level decoder\nfeatures, obtained through transpose convolution, with the skip-connection\nfeatures derived from the hybrid encoder. Subsequently, the fused features\nundergo refinement through the utilization of a multi-axis attention mechanism.\nThe proposed decoder block is repeated multiple times to progressively segment\nthe nuclei regions. Experimental results on MoNuSeg18 and MoNuSAC20 dataset\ndemonstrates the effectiveness of the proposed technique. Our MaxViT-UNet\noutperformed the previous CNN-based (UNet) and Transformer-based (Swin-UNet)\ntechniques by a considerable margin on both of the standard datasets. 
The following GitHub\nrepository (https://github.com/PRLAB21/MaxViT-UNet) contains the\nimplementation and trained weights.\n","authors":["Abdul Rehman Khan","Asifullah Khan"],"pdf_url":"https://arxiv.org/pdf/2305.08396v4.pdf","comment":"17 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.16585v1","updated":"2023-08-31T09:30:06Z","published":"2023-08-31T09:30:06Z","title":"Development and validation of an interpretable machine learning-based\n calculator for predicting 5-year weight trajectories after bariatric surgery:\n a multinational retrospective cohort SOPHIA study","summary":" Background: Weight loss trajectories after bariatric surgery vary widely\nbetween individuals, and predicting weight loss before the operation remains\nchallenging. We aimed to develop a model using machine learning to provide\nindividual preoperative prediction of 5-year weight loss trajectories after\nsurgery. Methods: In this multinational retrospective observational study we\nenrolled adult participants (aged $\\ge$18 years) from ten prospective cohorts\n(including ABOS [NCT01129297], BAREVAL [NCT02310178], the Swedish Obese\nSubjects study, and a large cohort from the Dutch Obesity Clinic [Nederlandse\nObesitas Kliniek]) and two randomised trials (SleevePass [NCT00793143] and\nSM-BOSS [NCT00356213]) in Europe, the Americas, and Asia, with a 5-year\nfollow-up after Roux-en-Y gastric bypass, sleeve gastrectomy, or gastric band.\nPatients with a previous history of bariatric surgery or large delays between\nscheduled and actual visits were excluded. The training cohort comprised\npatients from two centres in France (ABOS and BAREVAL). The primary outcome was\nBMI at 5 years. A model was developed using least absolute shrinkage and\nselection operator to select variables and the classification and regression\ntrees algorithm to build interpretable regression trees. The performance of\nthe model was assessed through the median absolute deviation (MAD) and root\nmean squared error (RMSE) of BMI. Findings: 10 231 patients from 12 centres in\nten countries were included in the analysis, corresponding to 30 602\npatient-years. Among participants in all 12 cohorts, 7701 (75.3%) were\nfemale, 2530 (24.7%) were male. 
Among 434 baseline attributes available\nin the training cohort, seven variables were selected: height, weight,\nintervention type, age, diabetes status, diabetes duration, and smoking status.\nAt 5 years, across external testing cohorts the overall mean MAD BMI was\n2.8 kg/m${}^2$ (95% CI 2.6-3.0) and mean RMSE BMI was\n4.7 kg/m${}^2$ (4.4-5.0), and the mean difference\nbetween predicted and observed BMI was -0.3 kg/m${}^2$ (SD 4.7).\nThis model is incorporated in an easy-to-use and interpretable web-based\nprediction tool to help inform clinical decisions before surgery.\nInterpretation: We developed a machine learning-based model, which is\ninternationally validated, for predicting individual 5-year weight loss\ntrajectories after three common bariatric interventions.\n","authors":["Patrick Saux","Pierre Bauvin","Violeta Raverdy","Julien Teigny","Hélène Verkindt","Tomy Soumphonphakdy","Maxence Debert","Anne Jacobs","Daan Jacobs","Valerie Monpellier","Phong Ching Lee","Chin Hong Lim","Johanna C Andersson-Assarsson","Lena Carlsson","Per-Arne Svensson","Florence Galtier","Guelareh Dezfoulian","Mihaela Moldovanu","Severine Andrieux","Julien Couster","Marie Lepage","Erminia Lembo","Ornella Verrastro","Maud Robert","Paulina Salminen","Geltrude Mingrone","Ralph Peterli","Ricardo V Cohen","Carlos Zerrweck","David Nocca","Carel W Le Roux","Robert Caiazzo","Philippe Preux","François Pattou"],"pdf_url":"https://arxiv.org/pdf/2308.16585v1.pdf","comment":"The Lancet Digital Health, 2023"},{"id":"http://arxiv.org/abs/2308.16572v1","updated":"2023-08-31T09:13:30Z","published":"2023-08-31T09:13:30Z","title":"CL-MAE: Curriculum-Learned Masked Autoencoders","summary":" Masked image modeling has been demonstrated as a powerful pretext task for\ngenerating robust representations that can be effectively generalized across\nmultiple downstream tasks. Typically, this approach involves randomly masking\npatches (tokens) in input images, with the masking strategy remaining unchanged\nduring training. In this paper, we propose a curriculum learning approach that\nupdates the masking strategy to continually increase the complexity of the\nself-supervised reconstruction task. We conjecture that, by gradually\nincreasing the task complexity, the model can learn more sophisticated and\ntransferable representations. To facilitate this, we introduce a novel\nlearnable masking module that possesses the capability to generate masks of\ndifferent complexities, and integrate the proposed module into masked\nautoencoders (MAE). Our module is jointly trained with the MAE, while adjusting\nits behavior during training, transitioning from a partner to the MAE\n(optimizing the same reconstruction loss) to an adversary (optimizing the\nopposite loss), while passing through a neutral state. The transition between\nthese behaviors is smooth, being regulated by a factor that is multiplied with\nthe reconstruction loss of the masking module. The resulting training procedure\ngenerates an easy-to-hard curriculum. We train our Curriculum-Learned Masked\nAutoencoder (CL-MAE) on ImageNet and show that it exhibits superior\nrepresentation learning capabilities compared to MAE. The empirical results on\nfive downstream tasks confirm our conjecture, demonstrating that curriculum\nlearning can be successfully used to self-supervise masked autoencoders.\n","authors":["Neelu Madan","Nicolae-Catalin Ristea","Kamal Nasrollahi","Thomas B. 
Moeslund","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2308.16572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16571v1","updated":"2023-08-31T09:12:34Z","published":"2023-08-31T09:12:34Z","title":"Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based\n Approach","summary":" In the rapidly evolving digital era, the analysis of document layouts plays a\npivotal role in automated information extraction and interpretation. In our\nwork, we have trained MViTv2 transformer model architecture with cascaded mask\nR-CNN on BaDLAD dataset to extract text box, paragraphs, images and tables from\na document. After training on 20365 document images for 36 epochs in a 3 phase\ncycle, we achieved a training loss of 0.2125 and a mask loss of 0.19. Our work\nextends beyond training, delving into the exploration of potential enhancement\navenues. We investigate the impact of rotation and flip augmentation, the\neffectiveness of slicing input images pre-inference, the implications of\nvarying the resolution of the transformer backbone, and the potential of\nemploying a dual-pass inference to uncover missed text-boxes. Through these\nexplorations, we observe a spectrum of outcomes, where some modifications\nresult in tangible performance improvements, while others offer unique insights\nfor future endeavors.\n","authors":["Ashrafur Rahman Khan","Asif Azad"],"pdf_url":"https://arxiv.org/pdf/2308.16571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16570v1","updated":"2023-08-31T09:12:30Z","published":"2023-08-31T09:12:30Z","title":"MONDEO: Multistage Botnet Detection","summary":" Mobile devices have widespread to become the most used piece of technology.\nDue to their characteristics, they have become major targets for botnet-related\nmalware. FluBot is one example of botnet malware that infects mobile devices.\nIn particular, FluBot is a DNS-based botnet that uses Domain Generation\nAlgorithms (DGA) to establish communication with the Command and Control Server\n(C2). MONDEO is a multistage mechanism with a flexible design to detect\nDNS-based botnet malware. MONDEO is lightweight and can be deployed without\nrequiring the deployment of software, agents, or configuration in mobile\ndevices, allowing easy integration in core networks. MONDEO comprises four\ndetection stages: Blacklisting/Whitelisting, Query rate analysis, DGA analysis,\nand Machine learning evaluation. It was created with the goal of processing\nstreams of packets to identify attacks with high efficiency, in the distinct\nphases. MONDEO was tested against several datasets to measure its efficiency\nand performance, being able to achieve high performance with RandomForest\nclassifiers. The implementation is available at github.\n","authors":["Duarte Dias","Bruno Sousa","Nuno Antunes"],"pdf_url":"https://arxiv.org/pdf/2308.16570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.00262v2","updated":"2023-08-31T09:01:35Z","published":"2023-03-01T06:35:42Z","title":"Collage Diffusion","summary":" We seek to give users precise control over diffusion-based image generation\nby modeling complex scenes as sequences of layers, which define the desired\nspatial arrangement and visual attributes of objects in the scene. Collage\nDiffusion harmonizes the input layers to make objects fit together -- the key\nchallenge involves minimizing changes in the positions and key visual\nattributes of the input layers while allowing other attributes to change in the\nharmonization process. 
We ensure that objects are generated in the correct\nlocations by modifying text-image cross-attention with the layers' alpha masks.\nWe preserve key visual attributes of input layers by learning specialized text\nrepresentations per layer and by extending ControlNet to operate on layers.\nLayer input allows users to control the extent of image harmonization on a\nper-object basis, and users can even iteratively edit individual objects in\ngenerated images while keeping other objects fixed. By leveraging the rich\ninformation present in layer input, Collage Diffusion generates globally\nharmonized images that maintain desired object characteristics better than\nprior approaches.\n","authors":["Vishnu Sarukkai","Linden Li","Arden Ma","Christopher Ré","Kayvon Fatahalian"],"pdf_url":"https://arxiv.org/pdf/2303.00262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15487v3","updated":"2023-08-31T08:58:17Z","published":"2023-03-27T07:53:43Z","title":"Knowledge Enhanced Graph Neural Networks for Graph Completion","summary":" Graph data is omnipresent and has a wide variety of applications, such as in\nnatural science, social networks, or the semantic web. However, while being\nrich in information, graphs are often noisy and incomplete. As a result, graph\ncompletion tasks, such as node classification or link prediction, have gained\nattention. On one hand, neural methods, such as graph neural networks, have\nproven to be robust tools for learning rich representations of noisy graphs. On\nthe other hand, symbolic methods enable exact reasoning on graphs. We propose\nKnowledge Enhanced Graph Neural Networks (KeGNN), a neuro-symbolic framework\nfor graph completion that combines both paradigms as it allows for the\nintegration of prior knowledge into a graph neural network model. Essentially,\nKeGNN consists of a graph neural network as a base upon which knowledge\nenhancement layers are stacked with the goal of refining predictions with\nrespect to prior knowledge. We instantiate KeGNN in conjunction with two\nstate-of-the-art graph neural networks, Graph Convolutional Networks and Graph\nAttention Networks, and evaluate KeGNN on multiple benchmark datasets for node\nclassification.\n","authors":["Luisa Werner","Nabil Layaïda","Pierre Genevès","Sarah Chlyah"],"pdf_url":"https://arxiv.org/pdf/2303.15487v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05102v2","updated":"2023-08-31T08:43:17Z","published":"2023-03-09T08:21:50Z","title":"StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent\n Disentangled Space","summary":" One major challenge in machine learning applications is coping with\nmismatches between the datasets used in the development and those obtained in\nreal-world applications. These mismatches may lead to inaccurate predictions\nand errors, resulting in poor product quality and unreliable systems. In this\nstudy, we propose StyleDiff to inform developers of the differences between the\ntwo datasets for the steady development of machine learning systems. Using\ndisentangled image spaces obtained from recently proposed generative models,\nStyleDiff compares the two datasets by focusing on attributes in the images and\nprovides an easy-to-understand analysis of the differences between the\ndatasets. The proposed StyleDiff performs in $O(dN\\log N)$, where $N$ is the\nsize of the datasets and $d$ is the number of attributes, enabling the\napplication to large datasets. 
We demonstrate that StyleDiff accurately detects\ndifferences between datasets and presents them in an understandable format\nusing, for example, driving scenes datasets.\n","authors":["Keisuke Kawano","Takuro Kutsuna","Ryoko Tokuhisa","Akihiro Nakamura","Yasushi Esaki"],"pdf_url":"https://arxiv.org/pdf/2303.05102v2.pdf","comment":"25 pages, 17 figures, Image and Vision Computing"},{"id":"http://arxiv.org/abs/2308.16544v1","updated":"2023-08-31T08:34:20Z","published":"2023-08-31T08:34:20Z","title":"Forecasting Emergency Department Crowding with Advanced Machine Learning\n Models and Multivariable Input","summary":" Emergency department (ED) crowding is a significant threat to patient safety\nand it has been repeatedly associated with increased mortality. Forecasting\nfuture service demand has the potential to improve patient outcomes. Despite active\nresearch on the subject, several gaps remain: 1) proposed forecasting models\nhave become outdated due to the quick influx of advanced machine learning (ML)\nmodels, 2) the amount of multivariable input data has been limited, and 3) discrete\nperformance metrics have rarely been reported. In this study, we document the\nperformance of a set of advanced ML models in forecasting ED occupancy 24 hours\nahead. We use electronic health record data from a large, combined ED with an\nextensive set of explanatory variables, including the availability of beds in\ncatchment area hospitals, traffic data from local observation stations, weather\nvariables, etc. We show that N-BEATS and LightGBM outperform benchmarks with\nrespective improvements of 11% and 9%, and that DeepAR predicts next-day crowding\nwith an AUC of 0.76 (95% CI 0.69-0.84). To the best of our knowledge, this is\nthe first study to document the superiority of LightGBM and N-BEATS over\nstatistical benchmarks in the context of ED forecasting.\n","authors":["Jalmari Tuominen","Eetu Pulkkinen","Jaakko Peltonen","Juho Kanniainen","Niku Oksala","Ari Palomäki","Antti Roine"],"pdf_url":"https://arxiv.org/pdf/2308.16544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16541v1","updated":"2023-08-31T08:30:26Z","published":"2023-08-31T08:30:26Z","title":"Scalable Incomplete Multi-View Clustering with Structure Alignment","summary":" The success of existing multi-view clustering (MVC) relies on the assumption\nthat all views are complete. However, samples are usually partially available\ndue to data corruption or sensor malfunction, which has given rise to research on\nincomplete multi-view clustering (IMVC). Although several anchor-based IMVC\nmethods have been proposed to process large-scale incomplete data, they\nstill suffer from the following drawbacks: i) Most existing approaches neglect\nthe inter-view discrepancy and enforce cross-view representation to be\nconsistent, which would corrupt the representation capability of the model; ii)\nDue to the sample disparity between different views, the learned anchors might\nbe misaligned, which we refer to as the Anchor-Unaligned Problem for Incomplete\ndata (AUP-ID). The AUP-ID causes inaccurate graph fusion and degrades\nclustering performance. To tackle these issues, we propose a novel incomplete\nanchor graph learning framework termed Scalable Incomplete Multi-View\nClustering with Structure Alignment (SIMVC-SA). Specifically, we construct the\nview-specific anchor graph to capture the complementary information from\ndifferent views. In order to solve the AUP-ID, we propose a novel structure\nalignment module to refine the cross-view anchor correspondence. 
Meanwhile, the\nanchor graph construction and alignment are jointly optimized in our unified\nframework to enhance clustering quality. Through anchor graph construction\ninstead of full graphs, the time and space complexity of the proposed SIMVC-SA\nis proven to be linearly correlated with the number of samples. Extensive\nexperiments on seven incomplete benchmark datasets demonstrate the\neffectiveness and efficiency of our proposed method. Our code is publicly\navailable at https://github.com/wy1019/SIMVC-SA.\n","authors":["Yi Wen","Siwei Wang","Ke Liang","Weixuan Liang","Xinhang Wan","Xinwang Liu","Suyuan Liu","Jiyuan Liu","En Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.16541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16539v1","updated":"2023-08-31T08:30:11Z","published":"2023-08-31T08:30:11Z","title":"On a Connection between Differential Games, Optimal Control, and\n Energy-based Models for Multi-Agent Interactions","summary":" Game theory offers an interpretable mathematical framework for modeling\nmulti-agent interactions. However, its applicability in real-world robotics\napplications is hindered by several challenges, such as unknown agents'\npreferences and goals. To address these challenges, we show a connection\nbetween differential games, optimal control, and energy-based models and\ndemonstrate how existing approaches can be unified under our proposed\nEnergy-based Potential Game formulation. Building upon this formulation, this\nwork introduces a new end-to-end learning application that combines neural\nnetworks for game-parameter inference with a differentiable game-theoretic\noptimization layer, acting as an inductive bias. The experiments using\nsimulated mobile robot pedestrian interactions and real-world automated driving\ndata provide empirical evidence that the game-theoretic layer improves the\npredictive performance of various neural network backbones.\n","authors":["Christopher Diehl","Tobias Klosek","Martin Krüger","Nils Murzyn","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2308.16539v1.pdf","comment":"International Conference on Machine Learning, Workshop on New\n Frontiers in Learning, Control, and Dynamical Systems (ICML 2023\n Frontiers4LCD)"},{"id":"http://arxiv.org/abs/2308.16534v1","updated":"2023-08-31T08:25:47Z","published":"2023-08-31T08:25:47Z","title":"Conditioning Score-Based Generative Models by Neuro-Symbolic Constraints","summary":" Score-based and diffusion models have emerged as effective approaches for\nboth conditional and unconditional generation. Still conditional generation is\nbased on either a specific training of a conditional model or classifier\nguidance, which requires training a noise-dependent classifier, even when the\nclassifier for uncorrupted data is given. We propose an approach to sample from\nunconditional score-based generative models enforcing arbitrary logical\nconstraints, without any additional training. Firstly, we show how to\nmanipulate the learned score in order to sample from an un-normalized\ndistribution conditional on a user-defined constraint. Then, we define a\nflexible and numerically stable neuro-symbolic framework for encoding soft\nlogical constraints. Combining these two ingredients we obtain a general, but\napproximate, conditional sampling algorithm. We further developed effective\nheuristics aimed at improving the approximation. 
Finally, we show the\neffectiveness of our approach for various types of constraints and data:\ntabular data, images and time series.\n","authors":["Davide Scassola","Sebastiano Saccani","Ginevra Carbone","Luca Bortolussi"],"pdf_url":"https://arxiv.org/pdf/2308.16534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16528v1","updated":"2023-08-31T08:19:26Z","published":"2023-08-31T08:19:26Z","title":"SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded\n Objects","summary":" To enable meaningful robotic manipulation of objects in the real-world, 6D\npose estimation is one of the critical aspects. Most existing approaches have\ndifficulties to extend predictions to scenarios where novel object instances\nare continuously introduced, especially with heavy occlusions. In this work, we\npropose a few-shot pose estimation (FSPE) approach called SA6D, which uses a\nself-adaptive segmentation module to identify the novel target object and\nconstruct a point cloud model of the target object using only a small number of\ncluttered reference images. Unlike existing methods, SA6D does not require\nobject-centric reference images or any additional object information, making it\na more generalizable and scalable solution across categories. We evaluate SA6D\non real-world tabletop object datasets and demonstrate that SA6D outperforms\nexisting FSPE methods, particularly in cluttered scenes with occlusions, while\nrequiring fewer reference images.\n","authors":["Ning Gao","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.16528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08566v2","updated":"2023-08-31T08:17:57Z","published":"2023-03-15T12:34:24Z","title":"Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning","summary":" Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful\nalternative for full fine-tuning so as to adapt pre-trained vision models to\ndownstream tasks, which only tunes a small number of parameters while freezing\nthe vast majority ones to ease storage burden and optimization difficulty.\nHowever, existing PEFT methods introduce trainable parameters to the same\npositions across different tasks depending solely on human heuristics and\nneglect the domain gaps. To this end, we study where to introduce and how to\nallocate trainable parameters by proposing a novel Sensitivity-aware visual\nParameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates\ntrainable parameters to task-specific important positions given a desired\ntunable parameter budget. Specifically, our SPT first quickly identifies the\nsensitive parameters that require tuning for a given task in a data-dependent\nway. Next, our SPT further boosts the representational capability for the\nweight matrices whose number of sensitive parameters exceeds a pre-defined\nthreshold by utilizing existing structured tuning methods, e.g., LoRA [23] or\nAdapter [22], to replace directly tuning the selected sensitive parameters\n(unstructured tuning) under the budget. Extensive experiments on a wide range\nof downstream recognition tasks show that our SPT is complementary to the\nexisting PEFT methods and largely boosts their performance, e.g., SPT improves\nAdapter with supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean\nTop-1 accuracy, reaching SOTA performance on FGVC and VTAB-1k benchmarks,\nrespectively. 
Source code is at https://github.com/ziplab/SPT\n","authors":["Haoyu He","Jianfei Cai","Jing Zhang","Dacheng Tao","Bohan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2303.08566v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2305.19979v2","updated":"2023-08-31T08:02:35Z","published":"2023-05-31T16:04:25Z","title":"Knowledge Graph Embeddings in the Biomedical Domain: Are They Useful? A\n Look at Link Prediction, Rule Learning, and Downstream Polypharmacy Tasks","summary":" Knowledge graphs are powerful tools for representing and organising complex\nbiomedical data. Several knowledge graph embedding algorithms have been\nproposed to learn from and complete knowledge graphs. However, a recent study\ndemonstrates the limited efficacy of these embedding algorithms when applied to\nbiomedical knowledge graphs, raising the question of whether knowledge graph\nembeddings have limitations in biomedical settings. This study aims to apply\nstate-of-the-art knowledge graph embedding models in the context of a recent\nbiomedical knowledge graph, BioKG, and evaluate their performance and potential\ndownstream uses. We achieve a three-fold improvement in terms of performance\nbased on the HITS@10 score over previous work on the same biomedical knowledge\ngraph. Additionally, we provide interpretable predictions through a rule-based\nmethod. We demonstrate that knowledge graph embedding models are applicable in\npractice by evaluating the best-performing model on four tasks that represent\nreal-life polypharmacy situations. Results suggest that knowledge learnt from\nlarge biomedical knowledge graphs can be transferred to such downstream use\ncases. Our code is available at https://github.com/aryopg/biokge.\n","authors":["Aryo Pradipta Gema","Dominik Grabarczyk","Wolf De Wulf","Piyush Borole","Javier Antonio Alfaro","Pasquale Minervini","Antonio Vergari","Ajitha Rajan"],"pdf_url":"https://arxiv.org/pdf/2305.19979v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16516v1","updated":"2023-08-31T08:00:08Z","published":"2023-08-31T08:00:08Z","title":"Curvature-based Pooling within Graph Neural Networks","summary":" Over-squashing and over-smoothing are two critical issues, that limit the\ncapabilities of graph neural networks (GNNs). While over-smoothing eliminates\nthe differences between nodes making them indistinguishable, over-squashing\nrefers to the inability of GNNs to propagate information over long distances,\nas exponentially many node states are squashed into fixed-size representations.\nBoth phenomena share similar causes, as both are largely induced by the graph\ntopology. To mitigate these problems in graph classification tasks, we propose\nCurvPool, a novel pooling method. CurvPool exploits the notion of curvature of\na graph to adaptively identify structures responsible for both over-smoothing\nand over-squashing. By clustering nodes based on the Balanced Forman curvature,\nCurvPool constructs a graph with a more suitable structure, allowing deeper\nmodels and the combination of distant information. We compare it to other\nstate-of-the-art pooling approaches and establish its competitiveness in terms\nof classification accuracy, computational complexity, and flexibility. CurvPool\noutperforms several comparable methods across all considered tasks. 
The most\nconsistent results are achieved by pooling densely connected clusters using the\nsum aggregation, as this allows additional information about the size of each\npool.\n","authors":["Cedric Sanders","Andreas Roth","Thomas Liebig"],"pdf_url":"https://arxiv.org/pdf/2308.16516v1.pdf","comment":"ECMLPKDD 2023 - Workshop on Mining and Learning with Graphs"},{"id":"http://arxiv.org/abs/2308.16139v2","updated":"2023-08-31T07:26:50Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrich Ferndinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Kuestner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Loeffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Andreas Nuernberger","Joao Pedrosa","Carlos Ferreira","Guilherme Aresta","Antonio Cunha","Aurelio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Roehrig","Frank Hoelzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","Andre Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hoerst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dzenan Zukic","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v2.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2305.15777v2","updated":"2023-08-31T07:20:34Z","published":"2023-05-25T06:44:43Z","title":"Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation","summary":" Medical image data are often limited due to the expensive acquisition and\nannotation process. Hence, training a deep-learning model with only raw data\ncan easily lead to overfitting. One solution to this problem is to augment the\nraw data with various transformations, improving the model's ability to\ngeneralize to new data. However, manually configuring a generic augmentation\ncombination and parameters for different datasets is non-trivial due to\ninconsistent acquisition approaches and data distributions. Therefore,\nautomatic data augmentation is proposed to learn favorable augmentation\nstrategies for different datasets while incurring large GPU overhead. To this\nend, we present a novel method, called Dynamic Data Augmentation (DDAug), which\nis efficient and has negligible computation cost. Our DDAug develops a\nhierarchical tree structure to represent various augmentations and utilizes an\nefficient Monte-Carlo tree searching algorithm to update, prune, and sample the\ntree. As a result, the augmentation pipeline can be optimized for each dataset\nautomatically. Experiments on multiple Prostate MRI datasets show that our\nmethod outperforms the current state-of-the-art data augmentation strategies.\n","authors":["Xinyue Xu","Yuhan Hsi","Haonan Wang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2305.15777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06777v3","updated":"2023-08-31T07:03:19Z","published":"2023-06-11T21:14:29Z","title":"Improving the Validity of Decision Trees as Explanations","summary":" In classification and forecasting with tabular data, one often utilizes\ntree-based models. Those can be competitive with deep neural networks on\ntabular data [cf. Grinsztajn et al., NeurIPS 2022, arXiv:2207.08815] and, under\nsome conditions, explainable. The explainability depends on the depth of the\ntree and the accuracy in each leaf of the tree. 
Decision trees containing\nleaves with unbalanced accuracy can provide misleading explanations.\nLow-accuracy leaves give less valid explanations, which could be interpreted as\nunfairness among explanations. Here, we train a shallow tree with the objective\nof minimizing the maximum misclassification error across each leaf node. Then,\nwe extend each leaf with a separate tree-based model. The shallow tree provides\na global explanation, while the overall statistical performance of the shallow\ntree with extended leaves improves upon decision trees of unlimited depth\ntrained using classical methods (e.g., CART) and is comparable to\nstate-of-the-art methods (e.g., well-tuned XGBoost).\n","authors":["Jiri Nemecek","Tomas Pevny","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2306.06777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16491v1","updated":"2023-08-31T06:53:22Z","published":"2023-08-31T06:53:22Z","title":"In-class Data Analysis Replications: Teaching Students while Testing\n Science","summary":" Science is facing a reproducibility crisis. Previous work has proposed\nincorporating data analysis replications into classrooms as a potential\nsolution. However, despite the potential benefits, it is unclear whether this\napproach is feasible, and if so, what the involved stakeholders-students,\neducators, and scientists-should expect from it. Can students perform a data\nanalysis replication over the course of a class? What are the costs and\nbenefits for educators? And how can this solution help benchmark and improve\nthe state of science?\n In the present study, we incorporated data analysis replications in the\nproject component of the Applied Data Analysis course (CS-401) taught at EPFL\n(N=354 students). Here we report pre-registered findings based on surveys\nadministered throughout the course. First, we demonstrate that students can\nreplicate previously published scientific papers, most of them qualitatively\nand some exactly. We find discrepancies between what students expect of data\nanalysis replications and what they experience by doing them along with changes\nin expectations about reproducibility, which together serve as evidence of\nattitude shifts to foster students' critical thinking. Second, we provide\ninformation for educators about how much overhead is needed to incorporate\nreplications into the classroom and identify concerns that replications bring\nas compared to more traditional assignments. Third, we identify tangible\nbenefits of the in-class data analysis replications for scientific communities,\nsuch as a collection of replication reports and insights about replication\nbarriers in scientific work that should be avoided going forward.\n Overall, we demonstrate that incorporating replication tasks into a large\ndata science class can increase the reproducibility of scientific work as a\nby-product of data science instruction, thus benefiting both science and\nstudents.\n","authors":["Kristina Gligoric","Tiziano Piccardi","Jake Hofman","Robert West"],"pdf_url":"https://arxiv.org/pdf/2308.16491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16490v1","updated":"2023-08-31T06:52:43Z","published":"2023-08-31T06:52:43Z","title":"Latent Painter","summary":" Latent diffusers revolutionized the generative AI and inspired creative art.\nWhen denoising the latent, the predicted original image at each step\ncollectively animates the formation. 
However, the animation is limited by the\ndenoising nature of the diffuser, and only renders a sharpening process. This\nwork presents Latent Painter, which uses the latent as the canvas, and the\ndiffuser predictions as the plan, to generate painting animation. Latent\nPainter also transitions one generated image to another, which can happen between\nimages from two different sets of checkpoints.\n","authors":["Shih-Chieh Su"],"pdf_url":"https://arxiv.org/pdf/2308.16490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11594v2","updated":"2023-08-31T06:48:29Z","published":"2023-08-20T05:03:31Z","title":"Quantization-based Optimization with Perspective of Quantum Mechanics","summary":" Statistical and stochastic analysis based on thermodynamics has been the main\nanalysis framework for stochastic global optimization. Recently, with the\nappearance of quantum annealing and quantum tunneling algorithms for global\noptimization, a new research framework for global optimization algorithms is\nrequired. In this paper, we provide an analysis of quantization-based\noptimization based on the Schr\\\"odinger equation to reveal what property in\nquantum mechanics enables global optimization. We show that the tunneling\neffect derived from the Schr\\\"odinger equation in quantization-based\noptimization enables escape from a local minimum. Additionally, we confirm that\nthis tunneling effect is the same property included in quantum mechanics-based\nglobal optimization. Experiments with standard multi-modal benchmark functions\nshow that the proposed analysis is valid.\n","authors":["Jinwuk Seok","Changsik Cho"],"pdf_url":"https://arxiv.org/pdf/2308.11594v2.pdf","comment":"Preprint for ICTC conference (First Revision)"},{"id":"http://arxiv.org/abs/2308.16484v1","updated":"2023-08-31T06:44:59Z","published":"2023-08-31T06:44:59Z","title":"Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning","summary":" Affordable 3D scanners often produce sparse and non-uniform point clouds that\nnegatively impact downstream applications in robotic systems. While existing\npoint cloud upsampling architectures have demonstrated promising results on\nstandard benchmarks, they tend to experience significant performance drops when\nthe test data have different distributions from the training data. To address\nthis issue, this paper proposes a test-time adaptation approach to enhance model\ngenerality of point cloud upsampling. The proposed approach leverages\nmeta-learning to explicitly learn network parameters for test-time adaptation.\nOur method does not require any prior information about the test data. During\nmeta-training, the model parameters are learned from a collection of\ninstance-level tasks, each of which consists of a sparse-dense pair of point\nclouds from the training data. During meta-testing, the trained model is\nfine-tuned with a few gradient updates to produce a unique set of network\nparameters for each test instance. The updated model is then used for the final\nprediction. Our framework is generic and can be applied in a plug-and-play\nmanner with existing backbone networks in point cloud upsampling. 
Extensive\nexperiments demonstrate that our approach improves the performance of\nstate-of-the-art models.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16483v1","updated":"2023-08-31T06:44:42Z","published":"2023-08-31T06:44:42Z","title":"Echocardiographic View Classification with Integrated\n Out-of-Distribution Detection for Enhanced Automatic Echocardiographic\n Analysis","summary":" In the rapidly evolving field of automatic echocardiographic analysis and\ninterpretation, automatic view classification is a critical yet challenging\ntask, owing to the inherent complexity and variability of echocardiographic\ndata. This study presents ECHOcardiography VIew Classification with\nOut-of-Distribution dEtection (ECHO-VICODE), a novel deep learning-based\nframework that effectively addresses this challenge by training to classify 31\nclasses, surpassing previous studies and demonstrating its capacity to handle a\nwide range of echocardiographic views. Furthermore, ECHO-VICODE incorporates an\nintegrated out-of-distribution (OOD) detection function, leveraging the\nrelative Mahalanobis distance to effectively identify 'near-OOD' instances\ncommonly encountered in echocardiographic data. Through extensive\nexperimentation, we demonstrated the outstanding performance of ECHO-VICODE in\nterms of view classification and OOD detection, significantly reducing the\npotential for errors in echocardiographic analyses. This pioneering study\nsignificantly advances the domain of automated echocardiography analysis and\nexhibits promising prospects for substantial applications in extensive clinical\nresearch and practice.\n","authors":["Jaeik Jeon","Seongmin Ha","Yeonyee E. Yoon","Jiyeon Kim","Hyunseok Jeong","Dawun Jeong","Yeonggul Jang","Youngtaek Hong","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2308.16483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06620v2","updated":"2023-08-31T06:35:36Z","published":"2023-07-13T08:36:15Z","title":"Online Distributed Learning with Quantized Finite-Time Coordination","summary":" In this paper we consider online distributed learning problems. Online\ndistributed learning refers to the process of training learning models on\ndistributed data sources. In our setting a set of agents need to cooperatively\ntrain a learning model from streaming data. Differently from federated\nlearning, the proposed approach does not rely on a central server but only on\npeer-to-peer communications among the agents. This approach is often used in\nscenarios where data cannot be moved to a centralized location due to privacy,\nsecurity, or cost reasons. In order to overcome the absence of a central\nserver, we propose a distributed algorithm that relies on a quantized,\nfinite-time coordination protocol to aggregate the locally trained models.\nFurthermore, our algorithm allows for the use of stochastic gradients during\nlocal training. Stochastic gradients are computed using a randomly sampled\nsubset of the local training data, which makes the proposed algorithm more\nefficient and scalable than traditional gradient descent. In our paper, we\nanalyze the performance of the proposed algorithm in terms of the mean distance\nfrom the online solution. Finally, we present numerical results for a logistic\nregression task.\n","authors":["Nicola Bastianello","Apostolos I. Rikos","Karl H. 
Johansson"],"pdf_url":"https://arxiv.org/pdf/2307.06620v2.pdf","comment":"To be presented at IEEE CDC'23"},{"id":"http://arxiv.org/abs/2308.16481v1","updated":"2023-08-31T06:32:11Z","published":"2023-08-31T06:32:11Z","title":"Point-TTA: Test-Time Adaptation for Point Cloud Registration Using\n Multitask Meta-Auxiliary Learning","summary":" We present Point-TTA, a novel test-time adaptation framework for point cloud\nregistration (PCR) that improves the generalization and the performance of\nregistration models. While learning-based approaches have achieved impressive\nprogress, generalization to unknown testing environments remains a major\nchallenge due to the variations in 3D scans. Existing methods typically train a\ngeneric model and the same trained model is applied on each instance during\ntesting. This could be sub-optimal since it is difficult for the same model to\nhandle all the variations during testing. In this paper, we propose a test-time\nadaptation approach for PCR. Our model can adapt to unseen distributions at\ntest-time without requiring any prior knowledge of the test data. Concretely,\nwe design three self-supervised auxiliary tasks that are optimized jointly with\nthe primary PCR task. Given a test instance, we adapt our model using these\nauxiliary tasks and the updated model is used to perform the inference. During\ntraining, our model is trained using a meta-auxiliary learning approach, such\nthat the adapted model via auxiliary tasks improves the accuracy of the primary\ntask. Experimental results demonstrate the effectiveness of our approach in\nimproving generalization of point cloud registration and outperforming other\nstate-of-the-art approaches.\n","authors":["Ahmed Hatem","Yiming Qian","Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11656v4","updated":"2023-08-31T06:11:13Z","published":"2022-11-21T17:15:46Z","title":"Sequential Informed Federated Unlearning: Efficient and Provable Client\n Unlearning in Federated Optimization","summary":" The aim of Machine Unlearning (MU) is to provide theoretical guarantees on\nthe removal of the contribution of a given data point from a training\nprocedure. Federated Unlearning (FU) consists in extending MU to unlearn a\ngiven client's contribution from a federated training routine. Current FU\napproaches are generally not scalable, and do not come with sound theoretical\nquantification of the effectiveness of unlearning. In this work we present\nInformed Federated Unlearning (IFU), a novel efficient and quantifiable FU\napproach. Upon unlearning request from a given client, IFU identifies the\noptimal FL iteration from which FL has to be reinitialized, with unlearning\nguarantees obtained through a randomized perturbation mechanism. 
The theory of\nIFU is also extended to account for sequential unlearning requests.\nExperimental results on different tasks and dataset show that IFU leads to more\nefficient unlearning procedures as compared to basic re-training and\nstate-of-the-art FU approaches.\n","authors":["Yann Fraboni","Martin Van Waerebeke","Kevin Scaman","Richard Vidal","Laetitia Kameni","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2211.11656v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16471v1","updated":"2023-08-31T05:26:14Z","published":"2023-08-31T05:26:14Z","title":"A Policy Adaptation Method for Implicit Multitask Reinforcement Learning\n Problems","summary":" In dynamic motion generation tasks, including contact and collisions, small\nchanges in policy parameters can lead to extremely different returns. For\nexample, in soccer, the ball can fly in completely different directions with a\nsimilar heading motion by slightly changing the hitting position or the force\napplied to the ball or when the friction of the ball varies. However, it is\ndifficult to imagine that completely different skills are needed for heading a\nball in different directions. In this study, we proposed a multitask\nreinforcement learning algorithm for adapting a policy to implicit changes in\ngoals or environments in a single motion category with different reward\nfunctions or physical parameters of the environment. We evaluated the proposed\nmethod on the ball heading task using a monopod robot model. The results showed\nthat the proposed method can adapt to implicit changes in the goal positions or\nthe coefficients of restitution of the ball, whereas the standard domain\nrandomization approach cannot cope with different task settings.\n","authors":["Satoshi Yamamori","Jun Morimoto"],"pdf_url":"https://arxiv.org/pdf/2308.16471v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.16470v1","updated":"2023-08-31T05:26:08Z","published":"2023-08-31T05:26:08Z","title":"Domain-adaptive Message Passing Graph Neural Network","summary":" Cross-network node classification (CNNC), which aims to classify nodes in a\nlabel-deficient target network by transferring the knowledge from a source\nnetwork with abundant labels, draws increasing attention recently. To address\nCNNC, we propose a domain-adaptive message passing graph neural network\n(DM-GNN), which integrates graph neural network (GNN) with conditional\nadversarial domain adaptation. DM-GNN is capable of learning informative\nrepresentations for node classification that are also transferrable across\nnetworks. Firstly, a GNN encoder is constructed by dual feature extractors to\nseparate ego-embedding learning from neighbor-embedding learning so as to\njointly capture commonality and discrimination between connected nodes.\nSecondly, a label propagation node classifier is proposed to refine each node's\nlabel prediction by combining its own prediction and its neighbors' prediction.\nIn addition, a label-aware propagation scheme is devised for the labeled source\nnetwork to promote intra-class propagation while avoiding inter-class\npropagation, thus yielding label-discriminative source embeddings. Thirdly,\nconditional adversarial domain adaptation is performed to take the\nneighborhood-refined class-label information into account during adversarial\ndomain adaptation, so that the class-conditional distributions across networks\ncan be better matched. 
Comparisons with eleven state-of-the-art methods\ndemonstrate the effectiveness of the proposed DM-GNN.\n","authors":["Xiao Shen","Shirui Pan","Kup-Sze Choi","Xi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16468v1","updated":"2023-08-31T05:22:51Z","published":"2023-08-31T05:22:51Z","title":"Computing excited states of molecules using normalizing flows","summary":" We present a new nonlinear variational framework for simultaneously computing\nground and excited states of quantum systems. Our approach is based on\napproximating wavefunctions in the linear span of basis functions that are\naugmented and optimized \\emph{via} composition with normalizing flows. The\naccuracy and efficiency of our approach are demonstrated in the calculations of\na large number of vibrational states of the triatomic H$_2$S molecule as well\nas ground and several excited electronic states of prototypical one-electron\nsystems including the hydrogen atom, the molecular hydrogen ion, and a carbon\natom in a single-active-electron approximation. The results demonstrate\nsignificant improvements in the accuracy of energy predictions and accelerated\nbasis-set convergence even when using normalizing flows with a small number of\nparameters. The present approach can be also seen as the optimization of a set\nof intrinsic coordinates that best capture the underlying physics within the\ngiven basis set.\n","authors":["Yahya Saleh","Álvaro Fernández Corral","Armin Iske","Jochen Küpper","Andrey Yachmenev"],"pdf_url":"https://arxiv.org/pdf/2308.16468v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07864v2","updated":"2023-08-31T05:11:10Z","published":"2022-11-15T03:10:05Z","title":"Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning","summary":" Federated learning (FL) enables multiple clients to collaboratively train a\nglobal model without disclosing their data. Previous researches often require\ntraining the complete model parameters. However, the emergence of powerful\npre-trained models makes it possible to achieve higher performance with fewer\nlearnable parameters in FL. In this paper, we propose a federated adaptive\nprompt tuning algorithm, FedAPT, for multi-domain collaborative image\nclassification with powerful foundation models, like CLIP. Compared with direct\nfederated prompt tuning, our core idea is to adaptively unlock specific domain\nknowledge for each test sample in order to provide them with personalized\nprompts. To implement this idea, we design an adaptive prompt tuning module,\nwhich consists of a meta prompt, an adaptive network, and some keys. The server\nrandomly generates a set of keys and assigns a unique key to each client. Then\nall clients cooperatively train the global adaptive network and meta prompt\nwith the local datasets and the frozen keys. Ultimately, the global aggregation\nmodel can assign a personalized prompt to CLIP based on the domain features of\neach test sample. We perform extensive experiments on two multi-domain image\nclassification datasets across two different settings - supervised and\nunsupervised. 
The results show that FedAPT can achieve better performance with\nless than 10\\% of the number of parameters of the fully trained model, and the\nglobal model can perform well in diverse client domains simultaneously.\n","authors":["Shangchao Su","Mingzhao Yang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2211.07864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v1","updated":"2023-08-31T04:52:58Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16456v1","updated":"2023-08-31T04:48:59Z","published":"2023-08-31T04:48:59Z","title":"Least Squares Maximum and Weighted Generalization-Memorization Machines","summary":" In this paper, we propose a new way of remembering by introducing a memory\ninfluence mechanism for the least squares support vector machine (LSSVM).\nWithout changing the equation constraints of the original LSSVM, this\nmechanism, allows an accurate partitioning of the training set without\noverfitting. The maximum memory impact model (MIMM) and the weighted impact\nmemory model (WIMM) are then proposed. It is demonstrated that these models can\nbe degraded to the LSSVM. Furthermore, we propose some different memory impact\nfunctions for the MIMM and WIMM. 
The experimental results show that our\nMIMM and WIMM have better generalization performance compared to the LSSVM and\na significant advantage in time cost compared to other memory models.\n","authors":["Shuai Wang","Zhen Wang","Yuan-Hai Shao"],"pdf_url":"https://arxiv.org/pdf/2308.16456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16454v1","updated":"2023-08-31T04:46:12Z","published":"2023-08-31T04:46:12Z","title":"Adversarial Finetuning with Latent Representation Constraint to Mitigate\n Accuracy-Robustness Tradeoff","summary":" This paper addresses the tradeoff between standard accuracy on clean examples\nand robustness against adversarial examples in deep neural networks (DNNs).\nAlthough adversarial training (AT) improves robustness, it degrades the\nstandard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we\npropose a novel AT method called ARREST, which comprises three components: (i)\nadversarial finetuning (AFT), (ii) representation-guided knowledge distillation\n(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples\nby initializing its parameters with a DNN that is standardly pretrained on\nclean examples. RGKD and NR respectively entail a regularization term and an\nalgorithm to preserve latent representations of clean examples during AFT. RGKD\npenalizes the distance between the representations of the standardly pretrained\nand AFT DNNs. NR switches input adversarial examples to nonadversarial ones\nwhen the representation changes significantly during AFT. By combining these\ncomponents, ARREST achieves both high standard accuracy and robustness.\nExperimental results demonstrate that ARREST mitigates the tradeoff more\neffectively than previous AT-based methods do.\n","authors":["Satoshi Suzuki","Shin'ya Yamaguchi","Shoichiro Takeda","Sekitoshi Kanai","Naoki Makishima","Atsushi Ando","Ryo Masumura"],"pdf_url":"https://arxiv.org/pdf/2308.16454v1.pdf","comment":"Accepted by International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.16453v1","updated":"2023-08-31T04:45:44Z","published":"2023-08-31T04:45:44Z","title":"Listen to Minority: Encrypted Traffic Classification for Class Imbalance\n with Contrastive Pre-Training","summary":" Mobile Internet has profoundly reshaped modern lifestyles in various aspects.\nEncrypted Traffic Classification (ETC) naturally plays a crucial role in\nmanaging mobile Internet, especially with the explosive growth of mobile apps\nusing encrypted communication. Despite some existing learning-based ETC methods\nshowing promising results, three-fold limitations still remain in real-world\nnetwork environments: 1) label bias caused by traffic class imbalance, 2)\ntraffic homogeneity caused by component sharing, and 3) training with reliance\non sufficient labeled traffic. None of the existing ETC methods can address all\nthese limitations. In this paper, we propose a novel Pre-trAining\nSemi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the\noriginal train dataset and perform contrastive pre-training without using\nindividual app labels directly to avoid label bias issues caused by class\nimbalance, while obtaining a robust feature representation to differentiate\noverlapping homogeneous traffic by pulling positive traffic pairs closer and\npushing negative pairs away. 
Meanwhile, PASS designs a semi-supervised\noptimization strategy based on pseudo-label iteration and dynamic loss\nweighting algorithms in order to effectively utilize massive unlabeled traffic\ndata and alleviate the manual train dataset annotation workload. PASS outperforms\nstate-of-the-art ETC methods and generic sampling approaches on four public\ndatasets with significant class imbalance and traffic homogeneity, remarkably\npushing the F1 of Cross-Platform215 up by 1.31% and of ISCX-17 by 9.12%.\nFurthermore, we validate the generality of the contrastive pre-training and\npseudo-label iteration components of PASS, which can adaptively benefit ETC\nmethods with diverse feature extractors.\n","authors":["Xiang Li","Juncheng Guo","Qige Song","Jiang Xie","Yafei Sang","Shuyuan Zhao","Yongzheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16453v1.pdf","comment":"Accepted by 2023 IEEE SECON, 9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.11029v2","updated":"2023-08-31T04:36:30Z","published":"2023-08-18T11:29:12Z","title":"RBA-GCN: Relational Bilevel Aggregation Graph Convolutional Network for\n Emotion Recognition","summary":" Emotion recognition in conversation (ERC) has received increasing attention\nfrom researchers due to its wide range of applications. As conversation has a\nnatural graph structure, numerous approaches used to model ERC based on graph\nconvolutional networks (GCNs) have yielded significant results. However, the\naggregation approach of traditional GCNs suffers from the node information\nredundancy problem, leading to node discriminant information\nloss. Additionally, single-layer GCNs lack the capacity to capture long-range\ncontextual information from the graph. Furthermore, the majority of approaches\nare based on textual modality or stitching together different modalities,\nresulting in a weak ability to capture interactions between modalities. To\naddress these problems, we present the relational bilevel aggregation graph\nconvolutional network (RBA-GCN), which consists of three modules: the graph\ngeneration module (GGM), similarity-based cluster building module (SCBM) and\nbilevel aggregation module (BiAM). First, GGM constructs a novel graph to\nreduce the redundancy of target node information. Then, SCBM calculates the node\nsimilarity in the target node and its structural neighborhood, where noisy\ninformation with low similarity is filtered out to preserve the discriminant\ninformation of the node. Meanwhile, BiAM is a novel aggregation method that can\npreserve the information of nodes during the aggregation process. This module\ncan construct the interaction between different modalities and capture\nlong-range contextual information based on similarity clusters. On both the\nIEMOCAP and MELD datasets, the weighted average F1 score of RBA-GCN has a\n2.17$\sim$5.21\% improvement over that of the most advanced method. Our code is\navailable at https://github.com/luftmenscher/RBA-GCN and our article with the\nsame name has been published in IEEE/ACM Transactions on Audio, Speech, and\nLanguage Processing, vol. 31, 2023.\n","authors":["Lin Yuan","Guoheng Huang","Fenghuan Li","Xiaochen Yuan","Chi-Man Pun","Guo Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.11029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16437v1","updated":"2023-08-31T03:52:57Z","published":"2023-08-31T03:52:57Z","title":"AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR\n Prediction","summary":" Click-through rate (CTR) prediction is a crucial issue in recommendation\nsystems. 
There has been an emergence of various public CTR datasets. However,\nexisting datasets primarily suffer from the following limitations. Firstly,\nusers generally click different types of items from multiple scenarios, and\nmodeling from multiple scenarios can provide a more comprehensive understanding\nof users. Existing datasets only include data for the same type of items from a\nsingle scenario. Secondly, multi-modal features are essential in multi-scenario\nprediction as they address the issue of inconsistent ID encoding between\ndifferent scenarios. The existing datasets are based on ID features and lack\nmulti-modal features. Third, a large-scale dataset can provide a more reliable\nevaluation of models, fully reflecting the performance differences between\nmodels. The scale of existing datasets is around 100 million, which is\nrelatively small compared to the real-world CTR prediction. To address these\nlimitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset\nbased on industrial data from Alipay. Specifically, AntM$^{2}$C provides the\nfollowing advantages: 1) It covers CTR data of 5 different types of items,\nproviding insights into the preferences of users for different items, including\nadvertisements, vouchers, mini-programs, contents, and videos. 2) Apart from\nID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text\nand image features, which can effectively establish connections between items\nwith different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200\nfeatures, including 200 million users and 6 million items. It is currently the\nlargest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several\ntypical CTR tasks and provide comparisons with baseline methods. The dataset\nhomepage is available at https://www.atecup.cn/home.\n","authors":["Zhaoxin Huan","Ke Ding","Ang Li","Xiaolu Zhang","Xu Min","Yong He","Liang Zhang","Jun Zhou","Linjian Mo","Jinjie Gu","Zhongyi Liu","Wenliang Zhong","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07873v5","updated":"2023-08-31T03:47:35Z","published":"2023-07-15T19:20:49Z","title":"Why Does Little Robustness Help? Understanding and Improving Adversarial\n Transferability from Surrogate Training","summary":" Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs\nthat successfully fool white-box surrogate models can also deceive other\nblack-box models with different architectures. Although a bunch of empirical\nstudies have provided guidance on generating highly transferable AEs, many of\nthese findings lack explanations and even lead to inconsistent advice. In this\npaper, we take a further step towards understanding adversarial\ntransferability, with a particular focus on surrogate aspects. Starting from\nthe intriguing little robustness phenomenon, where models adversarially trained\nwith mildly perturbed adversarial samples can serve as better surrogates, we\nattribute it to a trade-off between two predominant factors: model smoothness\nand gradient similarity. Our investigations focus on their joint effects,\nrather than their separate correlations with transferability. 
Through a series\nof theoretical and empirical analyses, we conjecture that the data distribution\nshift in adversarial training explains the degradation of gradient similarity.\nBuilding on these insights, we explore the impacts of data augmentation and\ngradient regularization on transferability and identify that the trade-off\ngenerally exists in the various training mechanisms, thus building a\ncomprehensive blueprint for the regulation mechanism behind transferability.\nFinally, we provide a general route for constructing better surrogates to boost\ntransferability which optimizes both model smoothness and gradient similarity\nsimultaneously, e.g., the combination of input gradient regularization and\nsharpness-aware minimization (SAM), validated by extensive experiments. In\nsummary, we call for attention to the united impacts of these two factors for\nlaunching effective transfer attacks, rather than optimizing one while ignoring\nthe other, and emphasize the crucial role of manipulating surrogate models.\n","authors":["Yechao Zhang","Shengshan Hu","Leo Yu Zhang","Junyu Shi","Minghui Li","Xiaogeng Liu","Wei Wan","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2307.07873v5.pdf","comment":"IEEE Symposium on Security and Privacy (Oakland) 2024; Extended\n version of camera-ready"},{"id":"http://arxiv.org/abs/2308.16425v1","updated":"2023-08-31T03:28:43Z","published":"2023-08-31T03:28:43Z","title":"On the Equivalence between Implicit and Explicit Neural Networks: A\n High-dimensional Viewpoint","summary":" Implicit neural networks have demonstrated remarkable success in various\ntasks. However, there is a lack of theoretical analysis of the connections and\ndifferences between implicit and explicit networks. In this paper, we study\nhigh-dimensional implicit neural networks and provide the high dimensional\nequivalents for the corresponding conjugate kernels and neural tangent kernels.\nBuilt upon this, we establish the equivalence between implicit and explicit\nnetworks in high dimensions.\n","authors":["Zenan Ling","Zhenyu Liao","Robert C. Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.16425v1.pdf","comment":"Accepted by Workshop on High-dimensional Learning Dynamics, ICML\n 2023, Honolulu, Hawaii"},{"id":"http://arxiv.org/abs/2308.16422v1","updated":"2023-08-31T03:16:38Z","published":"2023-08-31T03:16:38Z","title":"DECODE: DilatEd COnvolutional neural network for Detecting\n Extreme-mass-ratio inspirals","summary":" The detection of Extreme Mass Ratio Inspirals (EMRIs) is intricate due to\ntheir complex waveforms, extended duration, and low signal-to-noise ratio\n(SNR), making them more challenging to be identified compared to compact binary\ncoalescences. While matched filtering-based techniques are known for their\ncomputational demands, existing deep learning-based methods primarily handle\ntime-domain data and are often constrained by data duration and SNR. In\naddition, most existing work ignores time-delay interferometry (TDI) and\napplies the long-wavelength approximation in detector response calculations,\nthus limiting their ability to handle laser frequency noise. In this study, we\nintroduce DECODE, an end-to-end model focusing on EMRI signal detection by\nsequence modeling in the frequency domain. Centered around a dilated causal\nconvolutional neural network, trained on synthetic data considering TDI-1.5\ndetector response, DECODE can efficiently process a year's worth of\nmultichannel TDI data with an SNR of around 50. 
We evaluate our model on 1-year\ndata with accumulated SNR ranging from 50 to 120 and achieve a true positive\nrate of 96.3% at a false positive rate of 1%, keeping an inference time of less\nthan 0.01 seconds. With the visualization of three showcased EMRI signals for\ninterpretability and generalization, DECODE exhibits strong potential for\nfuture space-based gravitational wave data analyses.\n","authors":["Tianyu Zhao","Yue Zhou","Ruijun Shi","Zhoujian Cao","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2308.16422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.01129v2","updated":"2023-08-31T03:01:32Z","published":"2021-04-02T16:10:24Z","title":"Simulation-Based Optimization of User Interfaces for Quality-Assuring\n Machine Learning Model Predictions","summary":" Quality-sensitive applications of machine learning (ML) require quality\nassurance (QA) by humans before the predictions of an ML model can be deployed.\nQA for ML (QA4ML) interfaces require users to view a large amount of data and\nperform many interactions to correct errors made by the ML model. An optimized\nuser interface (UI) can significantly reduce interaction costs. While UI\noptimization can be informed by user studies evaluating design options, this\napproach is not scalable because there are typically numerous small variations\nthat can affect the efficiency of a QA4ML interface. Hence, we propose using\nsimulation to evaluate and aid the optimization of QA4ML interfaces. In\nparticular, we focus on simulating the combined effects of human intelligence\nin initiating appropriate interaction commands and machine intelligence in\nproviding algorithmic assistance for accelerating QA4ML processes. As QA4ML is\nusually labor-intensive, we use the simulated task completion time as the\nmetric for UI optimization under different interface and algorithm setups. We\ndemonstrate the usage of this UI design method in several QA4ML applications.\n","authors":["Yu Zhang","Martijn Tennekes","Tim de Jong","Lyana Curier","Bob Coecke","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2104.01129v2.pdf","comment":"Published in ACM Transactions on Interactive Intelligent Systems"},{"id":"http://arxiv.org/abs/2206.09429v4","updated":"2023-08-31T02:46:33Z","published":"2022-06-19T15:09:23Z","title":"Extending regionalization algorithms to explore spatial process\n heterogeneity","summary":" In spatial regression models, spatial heterogeneity may be considered with\neither continuous or discrete specifications. The latter is related to\ndelineation of spatially connected regions with homogeneous relationships\nbetween variables (spatial regimes). Although various regionalization\nalgorithms have been proposed and studied in the field of spatial analytics,\nmethods to optimize spatial regimes have been largely unexplored. In this\npaper, we propose two new algorithms for spatial regime delineation, two-stage\nK-Models and Regional-K-Models. We also extend the classic Automatic Zoning\nProcedure to spatial regression context. The proposed algorithms are applied to\na series of synthetic datasets and two real-world datasets. Results indicate\nthat all three algorithms achieve superior or comparable performance to\nexisting approaches, while the two-stage K-Models algorithm largely outperforms\nexisting approaches on model fitting, region reconstruction, and coefficient\nestimation. 
Our work enriches the spatial analytics toolbox to explore spatial\nheterogeneous processes.\n","authors":["Hao Guo","Andre Python","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2206.09429v4.pdf","comment":"25 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.03312v5","updated":"2023-08-31T02:29:36Z","published":"2023-08-07T05:40:58Z","title":"Symmetry-Preserving Program Representations for Learning Code Semantics","summary":" Large Language Models (LLMs) have shown promise in automated program\nreasoning, a crucial aspect of many security tasks. However, existing LLM\narchitectures for code are often borrowed from other domains like natural\nlanguage processing, raising concerns about their generalization and robustness\nto unseen code. A key generalization challenge is to incorporate the knowledge\nof code semantics, including control and data flow, into the LLM architectures.\n Drawing inspiration from examples of convolution layers exploiting\ntranslation symmetry, we explore how code symmetries can enhance LLM\narchitectures for program analysis and modeling. We present a rigorous\ngroup-theoretic framework that formally defines code symmetries as\nsemantics-preserving transformations and provides techniques for precisely\nreasoning about symmetry preservation within LLM architectures. Using this\nframework, we introduce a novel variant of self-attention that preserves\nprogram symmetries, demonstrating its effectiveness in generalization and\nrobustness through detailed experimental evaluations across different binary\nand source code analysis tasks. Overall, our code symmetry framework offers\nrigorous and powerful reasoning techniques that can guide the future\ndevelopment of specialized LLMs for code and advance LLM-guided program\nreasoning tasks.\n","authors":["Kexin Pei","Weichen Li","Qirui Jin","Shuyang Liu","Scott Geng","Lorenzo Cavallaro","Junfeng Yang","Suman Jana"],"pdf_url":"https://arxiv.org/pdf/2308.03312v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07446v2","updated":"2023-08-31T02:28:41Z","published":"2023-02-15T03:32:33Z","title":"On-Demand Communication for Asynchronous Multi-Agent Bandits","summary":" This paper studies a cooperative multi-agent multi-armed stochastic bandit\nproblem where agents operate asynchronously -- agent pull times and rates are\nunknown, irregular, and heterogeneous -- and face the same instance of a\nK-armed bandit problem. Agents can share reward information to speed up the\nlearning process at additional communication costs. We propose ODC, an\non-demand communication protocol that tailors the communication of each pair of\nagents based on their empirical pull times. ODC is efficient when the pull\ntimes of agents are highly heterogeneous, and its communication complexity\ndepends on the empirical pull times of agents. ODC is a generic protocol that\ncan be integrated into most cooperative bandit algorithms without degrading\ntheir performance. We then incorporate ODC into the natural extensions of UCB\nand AAE algorithms and propose two communication-efficient cooperative\nalgorithms. Our analysis shows that both algorithms are near-optimal in regret.\n","authors":["Yu-Zhen Janice Chen","Lin Yang","Xuchuang Wang","Xutong Liu","Mohammad Hajiesmaili","John C. S. 
Lui","Don Towsley"],"pdf_url":"https://arxiv.org/pdf/2302.07446v2.pdf","comment":"Accepted by AISTATS 2023"},{"id":"http://arxiv.org/abs/2208.00780v5","updated":"2023-08-31T02:27:48Z","published":"2022-07-26T10:59:42Z","title":"Visual correspondence-based explanations improve AI robustness and\n human-AI team accuracy","summary":" Explaining artificial intelligence (AI) predictions is increasingly important\nand even imperative in many high-stakes applications where humans are the\nultimate decision-makers. In this work, we propose two novel architectures of\nself-interpretable image classifiers that first explain, and then predict (as\nopposed to post-hoc explanations) by harnessing the visual correspondences\nbetween a query image and exemplars. Our models consistently improve (by 1 to 4\npoints) on out-of-distribution (OOD) datasets while performing marginally worse\n(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest\nneighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB,\nour correspondence-based explanations are found to be more useful to users than\nkNN explanations. Our explanations help users more accurately reject AI's wrong\ndecisions than all other tested methods. Interestingly, for the first time, we\nshow that it is possible to achieve complementary human-AI team accuracy (i.e.,\nthat is higher than either AI-alone or human-alone), in ImageNet and CUB image\nclassification tasks.\n","authors":["Giang Nguyen","Mohammad Reza Taesiri","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2208.00780v5.pdf","comment":"NeurIPS 2022 conference paper"},{"id":"http://arxiv.org/abs/2308.15690v2","updated":"2023-08-31T02:21:20Z","published":"2023-08-30T01:14:32Z","title":"CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts","summary":" We present 'CongNaMul', a comprehensive dataset designed for various tasks in\nsoybean sprouts image analysis. The CongNaMul dataset is curated to facilitate\ntasks such as image classification, semantic segmentation, decomposition, and\nmeasurement of length and weight. The classification task provides four classes\nto determine the quality of soybean sprouts: normal, broken, spotted, and\nbroken and spotted, for the development of AI-aided automatic quality\ninspection technology. For semantic segmentation, images with varying\ncomplexity, from single sprout images to images with multiple sprouts, along\nwith human-labelled mask images, are included. The label has 4 different\nclasses: background, head, body, tail. The dataset also provides images and\nmasks for the image decomposition task, including two separate sprout images\nand their combined form. Lastly, 5 physical features of sprouts (head length,\nbody length, body thickness, tail length, weight) are provided for image-based\nmeasurement tasks. This dataset is expected to be a valuable resource for a\nwide range of research and applications in the advanced analysis of images of\nsoybean sprouts. Also, we hope that this dataset can assist researchers\nstudying classification, semantic segmentation, decomposition, and physical\nfeature measurement in other industrial fields, in evaluating their models. The\ndataset is available at the authors' repository. 
(https://bhban.kr/data)\n","authors":["Byunghyun Ban","Donghun Ryu","Su-won Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.15690v2.pdf","comment":"Accepted to International Conference on ICT Convergence 2023"},{"id":"http://arxiv.org/abs/2308.16406v1","updated":"2023-08-31T02:20:25Z","published":"2023-08-31T02:20:25Z","title":"CktGNN: Circuit Graph Neural Network for Electronic Design Automation","summary":" The electronic design automation of analog circuits has been a longstanding\nchallenge in the integrated circuit field due to the huge design space and\ncomplex design trade-offs among circuit specifications. In the past decades,\nintensive research efforts have mostly been paid to automate the transistor\nsizing with a given circuit topology. By recognizing the graph nature of\ncircuits, this paper presents a Circuit Graph Neural Network (CktGNN) that\nsimultaneously automates the circuit topology generation and device sizing\nbased on the encoder-dependent optimization subroutines. Particularly, CktGNN\nencodes circuit graphs using a two-level GNN framework (of nested GNN) where\ncircuits are represented as combinations of subgraphs in a known subgraph\nbasis. In this way, it significantly improves design efficiency by reducing the\nnumber of subgraphs to perform message passing. Nonetheless, another critical\nroadblock to advancing learning-assisted circuit design automation is a lack of\npublic benchmarks to perform canonical assessment and reproducible research. To\ntackle the challenge, we introduce Open Circuit Benchmark (OCB), an\nopen-sourced dataset that contains $10$K distinct operational amplifiers with\ncarefully-extracted circuit specifications. OCB is also equipped with\ncommunicative circuit generation and evaluation capabilities such that it can\nhelp to generalize CktGNN to design various analog circuits by producing\ncorresponding datasets. Experiments on OCB show the extraordinary advantages of\nCktGNN through representation-based optimization frameworks over other recent\npowerful GNN baselines and human experts' manual designs. Our work paves the\nway toward a learning-based open-sourced design automation for analog circuits.\nOur source code is available at \\url{https://github.com/zehao-dong/CktGNN}.\n","authors":["Zehao Dong","Weidong Cao","Muhan Zhang","Dacheng Tao","Yixin Chen","Xuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16406v1.pdf","comment":"Accepted by ICLR (International Conference on Learning\n Representations) 2023"},{"id":"http://arxiv.org/abs/2308.16403v1","updated":"2023-08-31T02:12:46Z","published":"2023-08-31T02:12:46Z","title":"Balancing between the Local and Global Structures (LGS) in Graph\n Embedding","summary":" We present a method for balancing between the Local and Global Structures\n(LGS) in graph embedding, via a tunable parameter. Some embedding methods aim\nto capture global structures, while others attempt to preserve local\nneighborhoods. Few methods attempt to do both, and it is not always possible to\ncapture well both local and global information in two dimensions, which is\nwhere most graph drawing live. The choice of using a local or a global\nembedding for visualization depends not only on the task but also on the\nstructure of the underlying data, which may not be known in advance. For a\ngiven graph, LGS aims to find a good balance between the local and global\nstructure to preserve. 
We evaluate the performance of LGS with synthetic and\nreal-world datasets and our results indicate that it is competitive with the\nstate-of-the-art methods, using established quality metrics such as stress and\nneighborhood preservation. We introduce a novel quality metric, cluster\ndistance preservation, to assess intermediate structure capture. All\nsource code, datasets, experiments and analysis are available online.\n","authors":["Jacob Miller","Vahan Huroyan","Stephen Kobourov"],"pdf_url":"https://arxiv.org/pdf/2308.16403v1.pdf","comment":"Appears in the Proceedings of the 31st International Symposium on\n Graph Drawing and Network Visualization (GD 2023)"},{"id":"http://arxiv.org/abs/2305.11304v2","updated":"2023-08-31T02:10:40Z","published":"2023-05-16T07:00:57Z","title":"pTSE: A Multi-model Ensemble Method for Probabilistic Time Series\n Forecasting","summary":" Various probabilistic time series forecasting models have sprung up and shown\nremarkably good performance. However, the choice of model highly relies on the\ncharacteristics of the input time series and the fixed distribution that the\nmodel is based on. Due to the fact that the probability distributions cannot be\naveraged over different models straightforwardly, the current time series model\nensemble methods cannot be directly applied to improve the robustness and\naccuracy of forecasting. To address this issue, we propose pTSE, a multi-model\ndistribution ensemble method for probabilistic forecasting based on Hidden\nMarkov Model (HMM). pTSE only takes off-the-shelf outputs from member models\nwithout requiring further information about each model. Besides, we provide a\ncomplete theoretical analysis of pTSE to prove that the empirical distribution\nof time series subject to an HMM will converge to the stationary distribution\nalmost surely. Experiments on benchmarks show the superiority of pTSE over all\nmember models and competitive ensemble methods.\n","authors":["Yunyi Zhou","Zhixuan Chu","Yijia Ruan","Ge Jin","Yuchen Huang","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2305.11304v2.pdf","comment":"The 32nd International Joint Conference on Artificial Intelligence\n (IJCAI 2023)"},{"id":"http://arxiv.org/abs/2308.13570v2","updated":"2023-08-31T02:01:13Z","published":"2023-08-25T05:52:41Z","title":"Stochastic Configuration Machines for Industrial Artificial Intelligence","summary":" Real-time predictive modelling with desired accuracy is highly expected in\nindustrial artificial intelligence (IAI), where neural networks play a key\nrole. Neural networks in IAI require powerful, high-performance computing\ndevices to operate on large volumes of floating-point data. Based on stochastic\nconfiguration networks (SCNs), this paper proposes a new randomized learner\nmodel, termed stochastic configuration machines (SCMs), to stress effective\nmodelling and data size saving that are useful and valuable for industrial\napplications. Compared to SCNs and random vector functional-link (RVFL) nets\nwith binarized implementation, the model storage of SCMs can be significantly\ncompressed while retaining favourable prediction performance. Besides the\narchitecture of the SCM learner model and its learning algorithm, as an\nimportant part of this contribution, we also provide a theoretical basis on the\nlearning capacity of SCMs by analysing the model's complexity. Experimental\nstudies are carried out over some benchmark datasets and three industrial\napplications. 
The results demonstrate that SCM has great potential for dealing\nwith industrial data analytics.\n","authors":["Dianhui Wang","Matthew J. Felicetti"],"pdf_url":"https://arxiv.org/pdf/2308.13570v2.pdf","comment":"23 pages, 7 figures, 12 tables"},{"id":"http://arxiv.org/abs/2308.16391v1","updated":"2023-08-31T01:54:31Z","published":"2023-08-31T01:54:31Z","title":"Improving Robustness and Accuracy of Ponzi Scheme Detection on Ethereum\n Using Time-Dependent Features","summary":" The rapid development of blockchain has led to more and more funding pouring\ninto the cryptocurrency market, which also attracted cybercriminals' interest\nin recent years. The Ponzi scheme, an old-fashioned fraud, is now popular on\nthe blockchain, causing considerable financial losses to many crypto-investors.\nA few Ponzi detection methods have been proposed in the literature, most of\nwhich detect a Ponzi scheme based on its smart contract source code or opcode.\nThe contract-code-based approach, while achieving very high accuracy, is not\nrobust: first, the source codes of a majority of contracts on Ethereum are not\navailable, and second, a Ponzi developer can fool a contract-code-based\ndetection model by obfuscating the opcode or inventing a new profit\ndistribution logic that cannot be detected (since these models were trained on\nexisting Ponzi logics only). A transaction-based approach could improve the\nrobustness of detection because transactions, unlike smart contracts, are\nharder to be manipulated. However, the current transaction-based detection\nmodels achieve fairly low accuracy. We address this gap in the literature by\ndeveloping new detection models that rely only on the transactions, hence\nguaranteeing the robustness, and moreover, achieve considerably higher\nAccuracy, Precision, Recall, and F1-score than existing transaction-based\nmodels. This is made possible thanks to the introduction of novel\ntime-dependent features that capture Ponzi behaviours characteristics derived\nfrom our comprehensive data analyses on Ponzi and non-Ponzi data from the\nXBlock-ETH repository\n","authors":["Phuong Duy Huynh","Son Hoang Dau","Xiaodong Li","Phuc Luong","Emanuele Viterbo"],"pdf_url":"https://arxiv.org/pdf/2308.16391v1.pdf","comment":"17 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2201.12994v4","updated":"2023-08-31T01:38:14Z","published":"2022-01-31T04:15:42Z","title":"MGNN: Graph Neural Networks Inspired by Distance Geometry Problem","summary":" Graph Neural Networks (GNNs) have emerged as a prominent research topic in\nthe field of machine learning. Existing GNN models are commonly categorized\ninto two types: spectral GNNs, which are designed based on polynomial graph\nfilters, and spatial GNNs, which utilize a message-passing scheme as the\nfoundation of the model. For the expressive power and universality of spectral\nGNNs, a natural approach is to improve the design of basis functions for better\napproximation ability. As for spatial GNNs, models like Graph Isomorphism\nNetworks (GIN) analyze their expressive power based on Graph Isomorphism Tests.\nRecently, there have been attempts to establish connections between spatial\nGNNs and geometric concepts like curvature and cellular sheaves, as well as\nphysical phenomena like oscillators. However, despite the recent progress,\nthere is still a lack of comprehensive analysis regarding the universality of\nspatial GNNs from the perspectives of geometry and physics. 
In this paper, we\npropose MetricGNN (MGNN), a spatial GNN model inspired by the\ncongruent-insensitivity property of classifiers in the classification phase of\nGNNs. We demonstrate that a GNN model is universal in the spatial domain if it\ncan generate embedding matrices that are congruent to any given embedding\nmatrix. This property is closely related to the Distance Geometry Problem\n(DGP). Since DGP is an NP-Hard combinatorial optimization problem, we propose\noptimizing an energy function derived from spring networks and the\nMulti-Dimensional Scaling (MDS) problem. This approach also allows our model to\nhandle both homophilic and heterophilic graphs. Finally, we propose employing\nthe iteration method to optimize our energy function. We extensively evaluate\nthe effectiveness of our model through experiments conducted on both synthetic\nand real-world datasets. Our code is available at:\nhttps://github.com/GuanyuCui/MGNN.\n","authors":["Guanyu Cui","Zhewei Wei"],"pdf_url":"https://arxiv.org/pdf/2201.12994v4.pdf","comment":"Accepted by KDD 2023"},{"id":"http://arxiv.org/abs/2302.12977v3","updated":"2023-08-31T01:28:35Z","published":"2023-02-25T04:12:30Z","title":"Fair Attribute Completion on Graph with Missing Attributes","summary":" Tackling unfairness in graph learning models is a challenging task, as the\nunfairness issues on graphs involve both attributes and topological structures.\nExisting work on fair graph learning simply assumes that attributes of all\nnodes are available for model training and then makes fair predictions. In\npractice, however, the attributes of some nodes might not be accessible due to\nmissing data or privacy concerns, which makes fair graph learning even more\nchallenging. In this paper, we propose FairAC, a fair attribute completion\nmethod, to complement missing information and learn fair node embeddings for\ngraphs with missing attributes. FairAC adopts an attention mechanism to deal\nwith the attribute missing problem and meanwhile, it mitigates two types of\nunfairness, i.e., feature unfairness from attributes and topological unfairness\ndue to attribute completion. FairAC can work on various types of homogeneous\ngraphs and generate fair embeddings for them and thus can be applied to most\ndownstream tasks to improve their fairness performance. To our best knowledge,\nFairAC is the first method that jointly addresses the graph attribution\ncompletion and graph unfairness problems. Experimental results on benchmark\ndatasets show that our method achieves better fairness performance with less\nsacrifice in accuracy, compared with the state-of-the-art methods of fair graph\nlearning. Code is available at: https://github.com/donglgcn/FairAC.\n","authors":["Dongliang Guo","Zhixuan Chu","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2302.12977v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15449v2","updated":"2023-08-31T01:11:10Z","published":"2023-02-23T23:51:44Z","title":"Backpropagation through Back Substitution with a Backslash","summary":" We present a linear algebra formulation of backpropagation which allows the\ncalculation of gradients by using a generically written ``backslash'' or\nGaussian elimination on triangular systems of equations. Generally, the matrix\nelements are operators. 
This paper has three contributions: (i) it is of\nintellectual value to replace traditional treatments of automatic\ndifferentiation with a (left acting) operator theoretic, graph-based approach;\n(ii) operators can be readily placed in matrices in software in programming\nlanguages such as Julia as an implementation option; (iii) we introduce a novel\nnotation, ``transpose dot'' operator ``$\\{\\}^{T_\\bullet}$'' that allows for the\nreversal of operators.\n We further demonstrate the elegance of the operators approach in a suitable\nprogramming language consisting of generic linear algebra operators such as\nJulia \\cite{bezanson2017julia}, and that it is possible to realize this\nabstraction in code. Our implementation shows how generic linear algebra can\nallow operators as elements of matrices. In contrast to ``operator\noverloading,'' where backslash would normally have to be rewritten to take\nadvantage of operators, with ``generic programming'' there is no such need.\n","authors":["Alan Edelman","Ekin Akyurek","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2303.15449v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2308.16385v1","updated":"2023-08-31T01:03:27Z","published":"2023-08-31T01:03:27Z","title":"BenchTemp: A General Benchmark for Evaluating Temporal Graph Neural\n Networks","summary":" To handle graphs in which features or connectivities are evolving over time,\na series of temporal graph neural networks (TGNNs) have been proposed. Despite\nthe success of these TGNNs, the previous TGNN evaluations reveal several\nlimitations regarding four critical issues: 1) inconsistent datasets, 2)\ninconsistent evaluation pipelines, 3) lacking workload diversity, and 4)\nlacking efficient comparison. Overall, there lacks an empirical study that puts\nTGNN models onto the same ground and compares them comprehensively. To this\nend, we propose BenchTemp, a general benchmark for evaluating TGNN models on\nvarious workloads. BenchTemp provides a set of benchmark datasets so that\ndifferent TGNN models can be fairly compared. Further, BenchTemp engineers a\nstandard pipeline that unifies the TGNN evaluation. With BenchTemp, we\nextensively compare the representative TGNN models on different tasks (e.g.,\nlink prediction and node classification) and settings (transductive and\ninductive), w.r.t. both effectiveness and efficiency metrics. We have made\nBenchTemp publicly available at https://github.com/qianghuangwhu/benchtemp.\n","authors":["Qiang Huang","Jiawei Jiang","Xi Susie Rao","Ce Zhang","Zhichao Han","Zitao Zhang","Xin Wang","Yongjun He","Quanqing Xu","Yang Zhao","Chuang Hu","Shuo Shang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2308.16385v1.pdf","comment":"28 pages, 23 figures, 27 tables. Submitted to the Conference on\n Neural Information Processing Systems 2023 Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2308.16379v1","updated":"2023-08-31T00:47:58Z","published":"2023-08-31T00:47:58Z","title":"Multi-Objective Decision Transformers for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) is structured to derive policies from\nstatic trajectory data without requiring real-time environment interactions.\nRecent studies have shown the feasibility of framing offline RL as a sequence\nmodeling task, where the sole aim is to predict actions based on prior context\nusing the transformer architecture. 
However, the limitation of this single task\nlearning approach is its potential to undermine the transformer model's\nattention mechanism, which should ideally allocate varying attention weights\nacross different tokens in the input context for optimal prediction. To address\nthis, we reformulate offline RL as a multi-objective optimization problem,\nwhere the prediction is extended to states and returns. We also highlight a\npotential flaw in the trajectory representation used for sequence modeling,\nwhich could generate inaccuracies when modeling the state and return\ndistributions. This is due to the non-smoothness of the action distribution\nwithin the trajectory dictated by the behavioral policy. To mitigate this\nissue, we introduce action space regions to the trajectory representation. Our\nexperiments on D4RL benchmark locomotion tasks reveal that our propositions\nallow for more effective utilization of the attention mechanism in the\ntransformer model, resulting in performance that either matches or outperforms\ncurrent state-of-the art methods.\n","authors":["Abdelghani Ghanem","Philippe Ciblat","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2308.16379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16375v1","updated":"2023-08-31T00:31:08Z","published":"2023-08-31T00:31:08Z","title":"A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and\n Applications","summary":" Graph Neural Networks (GNNs) have gained significant attention owing to their\nability to handle graph-structured data and the improvement in practical\napplications. However, many of these models prioritize high utility\nperformance, such as accuracy, with a lack of privacy consideration, which is a\nmajor concern in modern society where privacy attacks are rampant. To address\nthis issue, researchers have started to develop privacy-preserving GNNs.\nDespite this progress, there is a lack of a comprehensive overview of the\nattacks and the techniques for preserving privacy in the graph domain. In this\nsurvey, we aim to address this gap by summarizing the attacks on graph data\naccording to the targeted information, categorizing the privacy preservation\ntechniques in GNNs, and reviewing the datasets and applications that could be\nused for analyzing/solving privacy issues in GNNs. We also outline potential\ndirections for future research in order to build better privacy-preserving\nGNNs.\n","authors":["Yi Zhang","Yuying Zhao","Zhaoqing Li","Xueqi Cheng","Yu Wang","Olivera Kotevska","Philip S. Yu","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.16375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.04348v3","updated":"2023-08-31T00:23:22Z","published":"2022-04-09T01:48:41Z","title":"Neuronal diversity can improve machine learning for physics and beyond","summary":" Diversity conveys advantages in nature, yet homogeneous neurons typically\ncomprise the layers of artificial neural networks. Here we construct neural\nnetworks from neurons that learn their own activation functions, quickly\ndiversify, and subsequently outperform their homogeneous counterparts on image\nclassification and nonlinear regression tasks. Sub-networks instantiate the\nneurons, which meta-learn especially efficient sets of nonlinear responses.\nExamples include conventional neural networks classifying digits and\nforecasting a van der Pol oscillator and physics-informed Hamiltonian neural\nnetworks learning H\\'enon-Heiles stellar orbits and the swing of a video\nrecorded pendulum clock. 
Such \\textit{learned diversity} provides examples of\ndynamical systems selecting diversity over uniformity and elucidates the role\nof diversity in natural and artificial systems.\n","authors":["Anshul Choudhary","Anil Radhakrishnan","John F. Lindner","Sudeshna Sinha","William L. Ditto"],"pdf_url":"https://arxiv.org/pdf/2204.04348v3.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2102.04307v3","updated":"2023-08-31T00:20:06Z","published":"2021-02-08T16:10:50Z","title":"Learning Optimal Strategies for Temporal Tasks in Stochastic Games","summary":" Synthesis from linear temporal logic (LTL) specifications provides assured\ncontrollers for systems operating in stochastic and potentially adversarial\nenvironments. Automatic synthesis tools, however, require a model of the\nenvironment to construct controllers. In this work, we introduce a model-free\nreinforcement learning (RL) approach to derive controllers from given LTL\nspecifications even when the environment is completely unknown. We model the\nproblem as a stochastic game (SG) between the controller and the adversarial\nenvironment; we then learn optimal control strategies that maximize the\nprobability of satisfying the LTL specifications against the worst-case\nenvironment behavior. We first construct a product game using the deterministic\nparity automaton (DPA) translated from the given LTL specification. By deriving\ndistinct rewards and discount factors from the acceptance condition of the DPA,\nwe reduce the maximization of the worst-case probability of satisfying the LTL\nspecification into the maximization of a discounted reward objective in the\nproduct game; this enables the use of model-free RL algorithms to learn an\noptimal controller strategy. To deal with the common scalability problems when\nthe number of sets defining the acceptance condition of the DPA (usually\nreferred as colors), is large, we propose a lazy color generation method where\ndistinct rewards and discount factors are utilized only when needed, and an\napproximate method where the controller eventually focuses on only one color.\nIn several case studies, we show that our approach is scalable to a wide range\nof LTL formulas, significantly outperforming existing methods for learning\ncontrollers from LTL specifications in SGs.\n","authors":["Alper Kamil Bozkurt","Yu Wang","Michael M. Zavlanos","Miroslav Pajic"],"pdf_url":"https://arxiv.org/pdf/2102.04307v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16369v1","updated":"2023-08-31T00:03:02Z","published":"2023-08-31T00:03:02Z","title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked\n Prefills","summary":" Large Language Model (LLM) inference consists of two distinct phases -\nprefill phase which processes the input prompt and decode phase which generates\noutput tokens autoregressively. While the prefill phase effectively saturates\nGPU compute at small batch sizes, the decode phase results in low compute\nutilization as it generates one token at a time per request. The varying\nprefill and decode times also lead to imbalance across micro-batches when using\npipeline parallelism, resulting in further inefficiency due to bubbles.\n We present SARATHI to address these challenges. SARATHI employs\nchunked-prefills, which splits a prefill request into equal sized chunks, and\ndecode-maximal batching, which constructs a batch using a single prefill chunk\nand populates the remaining slots with decodes. 
During inference, the prefill\nchunk saturates GPU compute, while the decode requests 'piggyback' and cost up\nto an order of magnitude less compared to a decode-only batch. Chunked-prefills\nallows constructing multiple decode-maximal batches from a single prefill\nrequest, maximizing coverage of decodes that can piggyback. Furthermore, the\nuniform compute design of these batches ameliorates the imbalance between\nmicro-batches, significantly reducing pipeline bubbles.\n Our techniques yield significant improvements in inference performance across\nmodels and hardware. For the LLaMA-13B model on A6000 GPU, SARATHI improves\ndecode throughput by up to 10x, and accelerates end-to-end throughput by up to\n1.33x. For LLaMa-33B on A100 GPU, we achieve 1.25x higher end-to-end-throughput\nand up to 4.25x higher decode throughput. When used with pipeline parallelism\non GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end\nthroughput improvement of 1.91x.\n","authors":["Amey Agrawal","Ashish Panwar","Jayashree Mohan","Nipun Kwatra","Bhargav S. Gulavani","Ramachandran Ramjee"],"pdf_url":"https://arxiv.org/pdf/2308.16369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05831v3","updated":"2023-08-31T23:57:18Z","published":"2022-05-12T01:54:22Z","title":"Feature Extractor Stacking for Cross-domain Few-shot Meta-learning","summary":" Cross-domain few-shot meta-learning (CDFSML) addresses learning problems\nwhere knowledge needs to be transferred from several source domains into an\ninstance-scarce target domain with an explicitly different distribution.\nRecently published CDFSML methods generally construct a universal model that\ncombines knowledge of multiple source domains into one backbone feature\nextractor. This enables efficient inference but necessitates re-computation of\nthe backbone whenever a new source domain is added. Some of these methods are\nalso incompatible with heterogeneous source domain backbone architectures. We\npropose feature extractor stacking (FES), a new CDFSML method for combining\ninformation from a collection of backbones, which can utilise heterogeneous\npretrained backbones out of the box, and does not maintain a universal model\nthat needs to be re-computed when its backbone collection is updated. We\npresent the basic FES algorithm, which is inspired by the classic stacking\napproach to meta-learning, and also introduce two variants: convolutional FES\n(ConFES) and regularised FES (ReFES). Given a target-domain task, these\nalgorithms fine-tune each backbone independently, use cross-validation to\nextract meta training data from the support set, and learn a simple linear\nmeta-classifier from this data. We evaluate our FES methods on the well-known\nMeta-Dataset benchmark, targeting image classification with convolutional\nneural networks, and show that they can achieve state-of-the-art performance.\n","authors":["Hongyu Wang","Eibe Frank","Bernhard Pfahringer","Michael Mayo","Geoffrey Holmes"],"pdf_url":"https://arxiv.org/pdf/2205.05831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00169v1","updated":"2023-08-31T23:26:10Z","published":"2023-08-31T23:26:10Z","title":"RepCodec: A Speech Representation Codec for Speech Tokenization","summary":" With recent rapid growth of large language models (LLMs), discrete speech\ntokenization has played an important role for injecting speech into LLMs.\nHowever, this discretization gives rise to a loss of information, consequently\nimpairing overall performance. 
To improve the performance of these discrete\nspeech tokens, we present RepCodec, a novel speech representation codec for\nsemantic speech tokenization. In contrast to audio codecs which reconstruct the\nraw audio, RepCodec learns a vector quantization codebook through\nreconstructing speech representations from speech encoders like HuBERT or\ndata2vec. Together, the speech encoder, the codec encoder and the vector\nquantization codebook form a pipeline for converting speech waveforms into\nsemantic tokens. The extensive experiments illustrate that RepCodec, by virtue\nof its enhanced information retention capacity, significantly outperforms the\nwidely used k-means clustering approach in both speech understanding and\ngeneration. Furthermore, this superiority extends across various speech\nencoders and languages, affirming the robustness of RepCodec. We believe our\nmethod can facilitate large language modeling research on speech processing.\n","authors":["Zhichao Huang","Chutong Meng","Tom Ko"],"pdf_url":"https://arxiv.org/pdf/2309.00169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11277v2","updated":"2023-08-31T22:56:43Z","published":"2023-03-20T17:12:42Z","title":"Model Stitching: Looking For Functional Similarity Between\n Representations","summary":" Model stitching (Lenc & Vedaldi 2015) is a compelling methodology to compare\ndifferent neural network representations, because it allows us to measure to\nwhat degree they may be interchanged. We expand on a previous work from Bansal,\nNakkiran & Barak which used model stitching to compare representations of the\nsame shapes learned by differently seeded and/or trained neural networks of the\nsame architecture. Our contribution enables us to compare the representations\nlearned by layers with different shapes from neural networks with different\narchitectures. We subsequently reveal unexpected behavior of model stitching.\nNamely, we find that stitching, based on convolutions, for small ResNets, can\nreach high accuracy if those layers come later in the first (sender) network\nthan in the second (receiver), even if those layers are far apart.\n","authors":["Adriano Hernandez","Rumen Dangovski","Peter Y. Lu","Marin Soljacic"],"pdf_url":"https://arxiv.org/pdf/2303.11277v2.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2303.00848v6","updated":"2023-08-31T22:50:17Z","published":"2023-03-01T22:36:05Z","title":"VDM++: Variational Diffusion Models for High-Quality Synthesis","summary":" To achieve the highest perceptual quality, state-of-the-art diffusion models\nare optimized with objectives that typically look very different from the\nmaximum likelihood and the Evidence Lower Bound (ELBO) objectives. In this\nwork, we reveal that diffusion model objectives are actually closely related to\nthe ELBO.\n Specifically, we show that all commonly used diffusion model objectives\nequate to a weighted integral of ELBOs over different noise levels, where the\nweighting depends on the specific objective used. Under the condition of\nmonotonic weighting, the connection is even closer: the diffusion objective\nthen equals the ELBO, combined with simple data augmentation, namely Gaussian\nnoise perturbation. We show that this condition holds for a number of\nstate-of-the-art diffusion models.\n In experiments, we explore new monotonic weightings and demonstrate their\neffectiveness, achieving state-of-the-art FID scores on the high-resolution\nImageNet benchmark.\n","authors":["Diederik P. 
Kingma","Ruiqi Gao"],"pdf_url":"https://arxiv.org/pdf/2303.00848v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12740v2","updated":"2023-08-31T22:21:19Z","published":"2023-08-24T12:42:00Z","title":"Human Comprehensible Active Learning of Genome-Scale Metabolic Networks","summary":" An important application of Synthetic Biology is the engineering of the host\ncell system to yield useful products. However, an increase in the scale of the\nhost system leads to huge design space and requires a large number of\nvalidation trials with high experimental costs. A comprehensible machine\nlearning approach that efficiently explores the hypothesis space and guides\nexperimental design is urgently needed for the Design-Build-Test-Learn (DBTL)\ncycle of the host cell system. We introduce a novel machine learning framework\nILP-iML1515 based on Inductive Logic Programming (ILP) that performs abductive\nlogical reasoning and actively learns from training examples. In contrast to\nnumerical models, ILP-iML1515 is built on comprehensible logical\nrepresentations of a genome-scale metabolic model and can update the model by\nlearning new logical structures from auxotrophic mutant trials. The ILP-iML1515\nframework 1) allows high-throughput simulations and 2) actively selects\nexperiments that reduce the experimental cost of learning gene functions in\ncomparison to randomly selected experiments.\n","authors":["Lun Ai","Shi-Shun Liang","Wang-Zhou Dai","Liam Hallett","Stephen H. Muggleton","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2308.12740v2.pdf","comment":"Invited presentation for AAAI Spring Symposium Series 2023 on\n Computational Scientific Discovery"},{"id":"http://arxiv.org/abs/2308.14919v2","updated":"2023-08-31T22:16:43Z","published":"2023-08-28T22:29:16Z","title":"On Reward Structures of Markov Decision Processes","summary":" A Markov decision process can be parameterized by a transition kernel and a\nreward function. Both play essential roles in the study of reinforcement\nlearning as evidenced by their presence in the Bellman equations. In our\ninquiry of various kinds of \"costs\" associated with reinforcement learning\ninspired by the demands in robotic applications, rewards are central to\nunderstanding the structure of a Markov decision process and reward-centric\nnotions can elucidate important concepts in reinforcement learning.\n Specifically, we study the sample complexity of policy evaluation and develop\na novel estimator with an instance-specific error bound of\n$\\tilde{O}(\\sqrt{\\frac{\\tau_s}{n}})$ for estimating a single state value. Under\nthe online regret minimization setting, we refine the transition-based MDP\nconstant, diameter, into a reward-based constant, maximum expected hitting\ncost, and with it, provide a theoretical explanation for how a well-known\ntechnique, potential-based reward shaping, could accelerate learning with\nexpert knowledge. In an attempt to study safe reinforcement learning, we model\nhazardous environments with irrecoverability and proposed a quantitative notion\nof safe learning via reset efficiency. In this setting, we modify a classic\nalgorithm to account for resets achieving promising preliminary numerical\nresults. Lastly, for MDPs with multiple reward functions, we develop a planning\nalgorithm that computationally efficiently finds Pareto-optimal stochastic\npolicies.\n","authors":["Falcon Z. 
Dai"],"pdf_url":"https://arxiv.org/pdf/2308.14919v2.pdf","comment":"This PhD thesis draws heavily from arXiv:1907.02114 and\n arXiv:2002.06299; minor edits"},{"id":"http://arxiv.org/abs/2204.04797v2","updated":"2023-08-31T22:16:42Z","published":"2022-04-10T23:30:07Z","title":"Multi-Label Clinical Time-Series Generation via Conditional GAN","summary":" In recent years, deep learning has been successfully adopted in a wide range\nof applications related to electronic health records (EHRs) such as\nrepresentation learning and clinical event prediction. However, due to privacy\nconstraints, limited access to EHR becomes a bottleneck for deep learning\nresearch. To mitigate these concerns, generative adversarial networks (GANs)\nhave been successfully used for generating EHR data. However, there are still\nchallenges in high-quality EHR generation, including generating time-series EHR\ndata and imbalanced uncommon diseases. In this work, we propose a Multi-label\nTime-series GAN (MTGAN) to generate EHR and simultaneously improve the quality\nof uncommon disease generation. The generator of MTGAN uses a gated recurrent\nunit (GRU) with a smooth conditional matrix to generate sequences and uncommon\ndiseases. The critic gives scores using Wasserstein distance to recognize real\nsamples from synthetic samples by considering both data and temporal features.\nWe also propose a training strategy to calculate temporal features for real\ndata and stabilize GAN training. Furthermore, we design multiple statistical\nmetrics and prediction tasks to evaluate the generated data. Experimental\nresults demonstrate the quality of the synthetic data and the effectiveness of\nMTGAN in generating realistic sequential EHR data, especially for uncommon\ndiseases.\n","authors":["Chang Lu","Chandan K. Reddy","Ping Wang","Dong Nie","Yue Ning"],"pdf_url":"https://arxiv.org/pdf/2204.04797v2.pdf","comment":"\\c{opyright}2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2309.00157v1","updated":"2023-08-31T22:08:01Z","published":"2023-08-31T22:08:01Z","title":"Information Fusion for Assistance Systems in Production Assessment","summary":" We propose a novel methodology to define assistance systems that rely on\ninformation fusion to combine different sources of information while providing\nan assessment. The main contribution of this paper is providing a general\nframework for the fusion of n number of information sources using the evidence\ntheory. The fusion provides a more robust prediction and an associated\nuncertainty that can be used to assess the prediction likeliness. Moreover, we\nprovide a methodology for the information fusion of two primary sources: an\nensemble classifier based on machine data and an expert-centered model. We\ndemonstrate the information fusion approach using data from an industrial\nsetup, which rounds up the application part of this research. Furthermore, we\naddress the problem of data drift by proposing a methodology to update the\ndata-based models using an evidence theory approach. 
We validate the approach\nusing the Benchmark Tennessee Eastman while doing an ablation study of the\nmodel update parameters.\n","authors":["Fernando Arévalo","Christian Alison M. Piolo","M. Tahasanul Ibrahim","Andreas Schwung"],"pdf_url":"https://arxiv.org/pdf/2309.00157v1.pdf","comment":"21 Pages, 10 Figures"},{"id":"http://arxiv.org/abs/2309.00149v1","updated":"2023-08-31T21:50:23Z","published":"2023-08-31T21:50:23Z","title":"TurboGP: A flexible and advanced python based GP library","summary":" We introduce TurboGP, a Genetic Programming (GP) library fully written in\nPython and specifically designed for machine learning tasks. TurboGP implements\nmodern features not available in other GP implementations, such as island and\ncellular population schemes, different types of genetic operations (migration,\nprotected crossovers), online learning, among other features. TurboGP's most\ndistinctive characteristic is its native support for different types of GP\nnodes to allow different abstraction levels, this makes TurboGP particularly\nuseful for processing a wide variety of data sources.\n","authors":["Lino Rodriguez-Coayahuitl","Alicia Morales-Reyes","Hugo Jair Escalante"],"pdf_url":"https://arxiv.org/pdf/2309.00149v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00144v1","updated":"2023-08-31T21:30:25Z","published":"2023-08-31T21:30:25Z","title":"Multi Agent DeepRL based Joint Power and Subchannel Allocation in IAB\n networks","summary":" Integrated Access and Backhauling (IAB) is a viable approach for meeting the\nunprecedented need for higher data rates of future generations, acting as a\ncost-effective alternative to dense fiber-wired links. The design of such\nnetworks with constraints usually results in an optimization problem of\nnon-convex and combinatorial nature. Under those situations, it is challenging\nto obtain an optimal strategy for the joint Subchannel Allocation and Power\nAllocation (SAPA) problem. In this paper, we develop a multi-agent Deep\nReinforcement Learning (DeepRL) based framework for joint optimization of power\nand subchannel allocation in an IAB network to maximize the downlink data rate.\nSAPA using DDQN (Double Deep Q-Learning Network) can handle computationally\nexpensive problems with huge action spaces associated with multiple users and\nnodes. Unlike the conventional methods such as game theory, fractional\nprogramming, and convex optimization, which in practice demand more and more\naccurate network information, the multi-agent DeepRL approach requires less\nenvironment network information. Simulation results show the proposed scheme's\npromising performance when compared with baseline (Deep Q-Learning Network and\nRandom) schemes.\n","authors":["Lakshya Jagadish","Banashree Sarma","R. Manivasakan"],"pdf_url":"https://arxiv.org/pdf/2309.00144v1.pdf","comment":"7 pages, 6 figures, Accepted at the European Conference on\n Communication Systems (ECCS) 2023"},{"id":"http://arxiv.org/abs/2309.00140v1","updated":"2023-08-31T21:25:57Z","published":"2023-08-31T21:25:57Z","title":"Improving vision-inspired keyword spotting using dynamic module skipping\n in streaming conformer encoder","summary":" Using a vision-inspired keyword spotting framework, we propose an\narchitecture with input-dependent dynamic depth capable of processing streaming\naudio. 
Specifically, we extend a conformer encoder with trainable binary gates\nthat allow us to dynamically skip network modules according to the input audio.\nOur approach improves detection and localization accuracy on continuous speech\nusing Librispeech top-1000 most frequent words while maintaining a small memory\nfootprint. The inclusion of gates also reduces the average amount of processing\nwithout affecting the overall performance. These benefits are shown to be even\nmore pronounced using the Google speech commands dataset placed over background\nnoise where up to 97% of the processing is skipped on non-speech inputs,\ntherefore making our method particularly interesting for an always-on keyword\nspotter.\n","authors":["Alexandre Bittar","Paul Dixon","Mohammad Samragh","Kumari Nishu","Devang Naik"],"pdf_url":"https://arxiv.org/pdf/2309.00140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00136v1","updated":"2023-08-31T21:20:58Z","published":"2023-08-31T21:20:58Z","title":"Predicting Financial Market Trends using Time Series Analysis and\n Natural Language Processing","summary":" Forecasting financial market trends through time series analysis and natural\nlanguage processing poses a complex and demanding undertaking, owing to the\nnumerous variables that can influence stock prices. These variables encompass a\nspectrum of economic and political occurrences, as well as prevailing public\nattitudes. Recent research has indicated that the expression of public\nsentiments on social media platforms such as Twitter may have a noteworthy\nimpact on the determination of stock prices. The objective of this study was to\nassess the viability of Twitter sentiments as a tool for predicting stock\nprices of major corporations such as Tesla, Apple. Our study has revealed a\nrobust association between the emotions conveyed in tweets and fluctuations in\nstock prices. Our findings indicate that positivity, negativity, and\nsubjectivity are the primary determinants of fluctuations in stock prices. The\ndata was analyzed utilizing the Long-Short Term Memory neural network (LSTM)\nmodel, which is currently recognized as the leading methodology for predicting\nstock prices by incorporating Twitter sentiments and historical stock prices\ndata. The models utilized in our study demonstrated a high degree of\nreliability and yielded precise outcomes for the designated corporations. In\nsummary, this research emphasizes the significance of incorporating public\nopinions into the prediction of stock prices. The application of Time Series\nAnalysis and Natural Language Processing methodologies can yield significant\nscientific findings regarding financial market patterns, thereby facilitating\ninformed decision-making among investors. The results of our study indicate\nthat the utilization of Twitter sentiments can serve as a potent instrument for\nforecasting stock prices, and ought to be factored in when formulating\ninvestment strategies.\n","authors":["Ali Asgarov"],"pdf_url":"https://arxiv.org/pdf/2309.00136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11055v2","updated":"2023-08-31T21:09:03Z","published":"2023-02-21T23:16:23Z","title":"SGD learning on neural networks: leap complexity and saddle-to-saddle\n dynamics","summary":" We investigate the time complexity of SGD learning on fully-connected neural\nnetworks with isotropic data. We put forward a complexity measure -- the leap\n-- which measures how \"hierarchical\" target functions are. 
For $d$-dimensional\nuniform Boolean or isotropic Gaussian data, our main conjecture states that the\ntime complexity to learn a function $f$ with low-dimensional support is\n$\\tilde\\Theta (d^{\\max(\\mathrm{Leap}(f),2)})$. We prove a version of this\nconjecture for a class of functions on Gaussian isotropic data and 2-layer\nneural networks, under additional technical assumptions on how SGD is run. We\nshow that the training sequentially learns the function support with a\nsaddle-to-saddle dynamic. Our result departs from [Abbe et al. 2022] by going\nbeyond leap 1 (merged-staircase functions), and by going beyond the mean-field\nand gradient flow approximations that prohibit the full complexity control\nobtained here. Finally, we note that this gives an SGD complexity for the full\ntraining trajectory that matches that of Correlational Statistical Query (CSQ)\nlower-bounds.\n","authors":["Emmanuel Abbe","Enric Boix-Adsera","Theodor Misiakiewicz"],"pdf_url":"https://arxiv.org/pdf/2302.11055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00127v1","updated":"2023-08-31T20:25:54Z","published":"2023-08-31T20:25:54Z","title":"FTA: Stealthy and Robust Backdoor Attack with Flexible Trigger on\n Federated Learning","summary":" Current backdoor attacks against federated learning (FL) strongly rely on\nuniversal triggers or semantic patterns, which can be easily detected and\nfiltered by certain defense mechanisms such as norm clipping, comparing\nparameter divergences among local updates. In this work, we propose a new\nstealthy and robust backdoor attack with flexible triggers against FL defenses.\nTo achieve this, we build a generative trigger function that can learn to\nmanipulate the benign samples with an imperceptible flexible trigger pattern\nand simultaneously make the trigger pattern include the most significant hidden\nfeatures of the attacker-chosen label. Moreover, our trigger generator can keep\nlearning and adapt across different rounds, allowing it to adjust to changes in\nthe global model. By filling the distinguishable difference (the mapping\nbetween the trigger pattern and target label), we make our attack naturally\nstealthy. Extensive experiments on real-world datasets verify the effectiveness\nand stealthiness of our attack compared to prior attacks on decentralized\nlearning framework with eight well-studied defenses.\n","authors":["Yanqi Qiao","Congwen Chen","Rui Wang","Kaitai Liang"],"pdf_url":"https://arxiv.org/pdf/2309.00127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00125v1","updated":"2023-08-31T20:24:51Z","published":"2023-08-31T20:24:51Z","title":"Differentially Private Functional Summaries via the Independent\n Component Laplace Process","summary":" In this work, we propose a new mechanism for releasing differentially private\nfunctional summaries called the Independent Component Laplace Process, or ICLP,\nmechanism. By treating the functional summaries of interest as truly\ninfinite-dimensional objects and perturbing them with the ICLP noise, this new\nmechanism relaxes assumptions on data trajectories and preserves higher utility\ncompared to classical finite-dimensional subspace embedding approaches in the\nliterature. We establish the feasibility of the proposed mechanism in multiple\nfunction spaces. Several statistical estimation problems are considered, and we\ndemonstrate by slightly over-smoothing the summary, the privacy cost will not\ndominate the statistical error and is asymptotically negligible. 
Numerical\nexperiments on synthetic and real datasets demonstrate the efficacy of the\nproposed mechanism.\n","authors":["Haotian Lin","Matthew Reimherr"],"pdf_url":"https://arxiv.org/pdf/2309.00125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14613v3","updated":"2023-08-31T19:57:50Z","published":"2022-09-29T08:15:29Z","title":"Fair admission risk prediction with proportional multicalibration","summary":" Fair calibration is a widely desirable fairness criterion in risk prediction\ncontexts. One way to measure and achieve fair calibration is with\nmulticalibration. Multicalibration constrains calibration error among\nflexibly-defined subpopulations while maintaining overall calibration. However,\nmulticalibrated models can exhibit a higher percent calibration error among\ngroups with lower base rates than groups with higher base rates. As a result,\nit is possible for a decision-maker to learn to trust or distrust model\npredictions for specific groups. To alleviate this, we propose\n\\emph{proportional multicalibration}, a criterion that constrains the percent\ncalibration error among groups and within prediction bins. We prove that\nsatisfying proportional multicalibration bounds a model's multicalibration as\nwell as its \\emph{differential calibration}, a fairness criterion that directly\nmeasures how closely a model approximates sufficiency. Therefore,\nproportionally calibrated models limit the ability of decision makers to\ndistinguish between model performance on different patient groups, which may\nmake the models more trustworthy in practice. We provide an efficient algorithm\nfor post-processing risk prediction models for proportional multicalibration\nand evaluate it empirically. We conduct simulation studies and investigate a\nreal-world application of PMC-postprocessing to prediction of emergency\ndepartment patient admissions. We observe that proportional multicalibration is\na promising criterion for controlling simultaneous measures of calibration\nfairness of a model over intersectional groups with virtually no cost in terms\nof classification performance.\n","authors":["William La Cava","Elle Lett","Guangya Wan"],"pdf_url":"https://arxiv.org/pdf/2209.14613v3.pdf","comment":"Published in the 2023 Conference on Health, Inference, and Learning\n (CHIL). Best paper award"},{"id":"http://arxiv.org/abs/2308.12114v2","updated":"2023-08-31T19:43:17Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Model sparsification in deep learning promotes simpler, more interpretable\nmodels with fewer parameters. This not only reduces the model's memory\nfootprint and computational needs but also shortens inference time. This work\nfocuses on creating sparse models optimized for multiple tasks with fewer\nparameters. These parsimonious models also possess the potential to match or\noutperform dense models in terms of performance. In this work, we introduce\nchannel-wise l1/l2 group sparsity in the shared convolutional layers parameters\n(or weights) of the multi-task learning model. This approach facilitates the\nremoval of extraneous groups i.e., channels (due to l1 regularization) and also\nimposes a penalty on the weights, further enhancing the learning efficiency for\nall tasks (due to l2 regularization). We analyzed the results of group sparsity\nin both single-task and multi-task settings on two widely-used Multi-Task\nLearning (MTL) datasets: NYU-v2 and CelebAMask-HQ. 
On both datasets, which\nconsist of three different computer vision tasks each, multi-task models with\napproximately 70% sparsity outperform their dense equivalents. We also\ninvestigate how changing the degree of sparsification influences the model's\nperformance, the overall sparsity percentage, the patterns of sparsity, and the\ninference time.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2110.06282v4","updated":"2023-08-31T19:41:31Z","published":"2021-10-12T19:05:06Z","title":"The Rich Get Richer: Disparate Impact of Semi-Supervised Learning","summary":" Semi-supervised learning (SSL) has demonstrated its potential to improve the\nmodel accuracy for a variety of learning tasks when the high-quality supervised\ndata is severely limited. Although it is often established that the average\naccuracy for the entire population of data is improved, it is unclear how SSL\nfares with different sub-populations. Understanding the above question has\nsubstantial fairness implications when different sub-populations are defined by\nthe demographic groups that we aim to treat fairly. In this paper, we reveal\nthe disparate impacts of deploying SSL: the sub-population who has a higher\nbaseline accuracy without using SSL (the \"rich\" one) tends to benefit more from\nSSL; while the sub-population who suffers from a low baseline accuracy (the\n\"poor\" one) might even observe a performance drop after adding the SSL module.\nWe theoretically and empirically establish the above observation for a broad\nfamily of SSL algorithms, which either explicitly or implicitly use an\nauxiliary \"pseudo-label\". Experiments on a set of image and text classification\ntasks confirm our claims. We introduce a new metric, Benefit Ratio, and promote\nthe evaluation of the fairness of SSL (Equalized Benefit Ratio). We further\ndiscuss how the disparate impact can be mitigated. We hope our paper will alarm\nthe potential pitfall of using SSL and encourage a multifaceted evaluation of\nfuture SSL algorithms.\n","authors":["Zhaowei Zhu","Tianyi Luo","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2110.06282v4.pdf","comment":"Published as a conference paper at ICLR 2022. Revised constants\n Theorems 1,2, and Lemma 3 (consider the union bound). Add acknowledgments to\n Nautilus"},{"id":"http://arxiv.org/abs/2309.00088v1","updated":"2023-08-31T19:07:50Z","published":"2023-08-31T19:07:50Z","title":"Deep Semi-Supervised Anomaly Detection for Finding Fraud in the Futures\n Market","summary":" Modern financial electronic exchanges are an exciting and fast-paced\nmarketplace where billions of dollars change hands every day. They are also\nrife with manipulation and fraud. Detecting such activity is a major\nundertaking, which has historically been a job reserved exclusively for humans.\nRecently, more research and resources have been focused on automating these\nprocesses via machine learning and artificial intelligence. Fraud detection is\noverwhelmingly associated with the greater field of anomaly detection, which is\nusually performed via unsupervised learning techniques because of the lack of\nlabeled data needed for supervised learning. However, a small quantity of\nlabeled data does often exist. This research article aims to evaluate the\nefficacy of a deep semi-supervised anomaly detection technique, called Deep\nSAD, for detecting fraud in high-frequency financial data. 
We use exclusive\nproprietary limit order book data from the TMX exchange in Montr\\'eal, with a\nsmall set of true labeled instances of fraud, to evaluate Deep SAD against its\nunsupervised predecessor. We show that incorporating a small amount of labeled\ndata into an unsupervised anomaly detection framework can greatly improve its\naccuracy.\n","authors":["Timothy DeLise"],"pdf_url":"https://arxiv.org/pdf/2309.00088v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.00082v1","updated":"2023-08-31T18:43:04Z","published":"2023-08-31T18:43:04Z","title":"RePo: Resilient Model-Based Reinforcement Learning by Regularizing\n Posterior Predictability","summary":" Visual model-based RL methods typically encode image observations into\nlow-dimensional representations in a manner that does not eliminate redundant\ninformation. This leaves them susceptible to spurious variations -- changes in\ntask-irrelevant components such as background distractors or lighting\nconditions. In this paper, we propose a visual model-based RL method that\nlearns a latent representation resilient to such spurious variations. Our\ntraining objective encourages the representation to be maximally predictive of\ndynamics and reward, while constraining the information flow from the\nobservation to the latent representation. We demonstrate that this objective\nsignificantly bolsters the resilience of visual model-based RL methods to\nvisual distractors, allowing them to operate in dynamic environments. We then\nshow that while the learned encoder is resilient to spurious variations, it is\nnot invariant under significant distribution shift. To address this, we propose\na simple reward-free alignment procedure that enables test time adaptation of\nthe encoder. This allows for quick adaptation to widely differing environments\nwithout having to relearn the dynamics and policy. Our effort is a step towards\nmaking model-based RL a practical and useful tool for dynamic, diverse domains.\nWe show its effectiveness in simulation benchmarks with significant spurious\nvariations as well as a real-world egocentric navigation task with noisy TVs in\nthe background. Videos and code at https://zchuning.github.io/repo-website/.\n","authors":["Chuning Zhu","Max Simchowitz","Siri Gadipudi","Abhishek Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.00082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00079v1","updated":"2023-08-31T18:33:05Z","published":"2023-08-31T18:33:05Z","title":"On the Implicit Bias of Adam","summary":" In previous literature, backward error analysis was used to find ordinary\ndifferential equations (ODEs) approximating the gradient descent trajectory. It\nwas found that finite step sizes implicitly regularize solutions because terms\nappearing in the ODEs penalize the two-norm of the loss gradients. We prove\nthat the existence of similar implicit regularization in RMSProp and Adam\ndepends on their hyperparameters and the training stage, but with a different\n\"norm\" involved: the corresponding ODE terms either penalize the (perturbed)\none-norm of the loss gradients or, on the contrary, hinder its decrease (the\nlatter case being typical). We also conduct numerical experiments and discuss\nhow the proven facts can influence generalization.\n","authors":["Matias D. Cattaneo","Jason M. 
Klusowski","Boris Shigida"],"pdf_url":"https://arxiv.org/pdf/2309.00079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.08806v2","updated":"2023-08-31T18:26:56Z","published":"2021-04-18T10:33:38Z","title":"Best Practices for Noise-Based Augmentation to Improve the Performance\n of Deployable Speech-Based Emotion Recognition Systems","summary":" Speech emotion recognition is an important component of any human centered\nsystem. But speech characteristics produced and perceived by a person can be\ninfluenced by a multitude of reasons, both desirable such as emotion, and\nundesirable such as noise. To train robust emotion recognition models, we need\na large, yet realistic data distribution, but emotion datasets are often small\nand hence are augmented with noise. Often noise augmentation makes one\nimportant assumption, that the prediction label should remain the same in\npresence or absence of noise, which is true for automatic speech recognition\nbut not necessarily true for perception based tasks. In this paper we make\nthree novel contributions. We validate through crowdsourcing that the presence\nof noise does change the annotation label and hence may alter the original\nground truth label. We then show how disregarding this knowledge and assuming\nconsistency in ground truth labels propagates to downstream evaluation of ML\nmodels, both for performance evaluation and robustness testing. We end the\npaper with a set of recommendations for noise augmentations in speech emotion\nrecognition datasets.\n","authors":["Mimansa Jaiswal","Emily Mower Provost"],"pdf_url":"https://arxiv.org/pdf/2104.08806v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00071v1","updated":"2023-08-31T18:18:07Z","published":"2023-08-31T18:18:07Z","title":"YaRN: Efficient Context Window Extension of Large Language Models","summary":" Rotary Position Embeddings (RoPE) have been shown to effectively encode\npositional information in transformer-based language models. However, these\nmodels fail to generalize past the sequence length they were trained on. We\npresent YaRN (Yet another RoPE extensioN method), a compute-efficient method to\nextend the context window of such models, requiring 10x fewer tokens and 2.5x\nfewer training steps than previous methods. Using YaRN, we show that LLaMA\nmodels can effectively utilize and extrapolate to context lengths much longer\nthan their original pre-training would allow, while also surpassing the previous\nstate-of-the-art at context window extension. In addition, we demonstrate\nthat YaRN exhibits the capability to extrapolate beyond the limited context of\na fine-tuning dataset. We publish the checkpoints of Llama 2 7B/13B fine-tuned\nusing YaRN with 64k and 128k context windows at\nhttps://github.com/jquesnelle/yarn\n","authors":["Bowen Peng","Jeffrey Quesnelle","Honglu Fan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2309.00071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11696v2","updated":"2023-08-31T18:18:03Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking (of Language Models)","summary":" The increasing versatility of language models (LMs) has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs reaching\nthousands of GPU hours per model. However, the efficiency aspect of these\nevaluation efforts has raised little discussion in the literature. 
In this work,\nwe present the problem of Efficient Benchmarking, namely, intelligently reducing\nthe computation costs of LM evaluation without compromising reliability. Using\nthe HELM benchmark as a test case, we investigate how different benchmark design\nchoices affect the computation-reliability tradeoff. We propose to evaluate the\nreliability of such decisions by using a new measure, Decision Impact on\nReliability (DIoR for short). We find, for example, that the current leader on HELM\nmay change by merely removing a low-ranked model from the benchmark, and observe\nthat a handful of examples suffice to obtain the correct benchmark ranking.\nConversely, a slightly different choice of HELM scenarios varies ranking widely.\nBased on our findings, we outline a set of concrete recommendations for more\nefficient benchmark design and utilization practices, leading to dramatic cost\nsavings with minimal loss of benchmark reliability, often reducing computation\nby x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2307.09312v2","updated":"2023-08-31T15:32:01Z","published":"2023-07-18T14:57:12Z","title":"Multi-Modal Discussion Transformer: Integrating Text, Images and Graph\n Transformers to Detect Hate Speech on Social Media","summary":" We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal\ngraph-based transformer model for detecting hate speech in online social\nnetworks, such as Reddit discussions. In contrast to traditional comment-only\nmethods, our approach to labelling a comment as hate speech involves a holistic\nanalysis of text and images grounded in the discussion context. This is done by\nleveraging graph transformers to capture the contextual relationships in the\nentire discussion surrounding a comment and grounding the interwoven fusion\nlayers that combine individual comments' text and image embeddings instead of\nprocessing modalities separately. We compare the performance of our model to\nbaselines that only process individual comments and conduct extensive ablation\nstudies. To evaluate our work, we present a new dataset, HatefulDiscussions,\ncomprising complete multi-modal discussions from multiple online communities on\nReddit. We conclude with future work for multimodal solutions to deliver social\nvalue in online contexts, arguing that capturing a holistic view of a\nconversation significantly advances the effort to detect anti-social behaviour.\n","authors":["Liam Hebert","Gaurav Sahu","Yuxuan Guo","Nanda Kishore Sreenivas","Lukasz Golab","Robin Cohen"],"pdf_url":"https://arxiv.org/pdf/2307.09312v2.pdf","comment":"Under Submission"},{"id":"http://arxiv.org/abs/2308.16725v1","updated":"2023-08-31T13:41:34Z","published":"2023-08-31T13:41:34Z","title":"Terrain Diffusion Network: Climatic-Aware Terrain Generation with\n Geological Sketch Guidance","summary":" Sketch-based terrain generation seeks to create realistic landscapes for\nvirtual environments in various applications such as computer games, animation\nand virtual reality. 
Recently, deep learning based terrain generation has\nemerged, notably those based on generative adversarial networks (GAN).\nHowever, these methods often struggle to fulfill the requirements of flexible\nuser control and maintain generative diversity for realistic terrain.\nTherefore, we propose a novel diffusion-based method, namely terrain diffusion\nnetwork (TDN), which actively incorporates user guidance for enhanced\ncontrollability, taking into account terrain features like rivers, ridges,\nbasins, and peaks. Instead of adhering to a conventional monolithic denoising\nprocess, which often compromises the fidelity of terrain details or the\nalignment with user control, a multi-level denoising scheme is proposed to\ngenerate more realistic terrains by taking into account fine-grained details,\nparticularly those related to climatic patterns influenced by erosion and\ntectonic activities. Specifically, three terrain synthesisers are designed for\nstructural, intermediate, and fine-grained level denoising purposes, which\nallows each synthesiser to concentrate on a distinct terrain aspect. Moreover, to\nmaximise the efficiency of our TDN, we further introduce terrain and sketch\nlatent spaces for the synthesizers with pre-trained terrain autoencoders.\nComprehensive experiments on a new dataset constructed from NASA Topology\nImages clearly demonstrate the effectiveness of our proposed method, achieving\nstate-of-the-art performance. Our code and dataset will be publicly\navailable.\n","authors":["Zexin Hu","Kun Hu","Clinton Mo","Lei Pan","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16418v1","updated":"2023-08-31T03:09:25Z","published":"2023-08-31T03:09:25Z","title":"End-Edge Coordinated Joint Encoding and Neural Enhancement for Low-Light\n Video Analytics","summary":" In this paper, we investigate video analytics in low-light environments, and\npropose an end-edge coordinated system with joint video encoding and\nenhancement. It adaptively transmits low-light videos from cameras and performs\nenhancement and inference tasks at the edge. Firstly, according to our\nobservations, both encoding and enhancement for low-light videos have a\nsignificant impact on inference accuracy, which directly influences bandwidth\nand computation overhead. Secondly, due to the limitation of built-in\ncomputation resources, cameras perform encoding and transmit frames to the\nedge. The edge executes neural enhancement to process low contrast, detail\nloss, and color distortion on low-light videos before inference. Finally, an\nadaptive controller is designed at the edge to select quantization parameters\nand scales of neural enhancement networks, aiming to improve the inference\naccuracy and meet the latency requirements. 
Extensive real-world experiments\ndemonstrate that the proposed system can achieve a better trade-off between\ncommunication and computation resources and optimize the inference accuracy.\n","authors":["Yuanyi He","Peng Yang","Tian Qin","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16417v1","updated":"2023-08-31T03:03:29Z","published":"2023-08-31T03:03:29Z","title":"Edge-Assisted Lightweight Region-of-Interest Extraction and Transmission\n for Vehicle Perception","summary":" To enhance on-road environmental perception for autonomous driving, accurate\nand real-time analytics on high-resolution video frames generated from on-board\ncameras becomes crucial. In this paper, we design a lightweight object\nlocation method based on class activation mapping (CAM) to rapidly capture the\nregion of interest (RoI) boxes that contain driving safety related objects from\non-board cameras, which can not only improve the inference accuracy of vision\ntasks, but also reduce the amount of transmitted data. Considering the limited\non-board computation resources, the RoI boxes extracted from the raw image are\noffloaded to the edge for further processing. Considering both the dynamics of\nvehicle-to-edge communications and the limited edge resources, we propose an\nadaptive RoI box offloading algorithm to ensure prompt and accurate inference\nby adjusting the down-sampling rate of each box. Extensive experimental results\non four high-resolution video streams demonstrate that our approach can\neffectively improve the overall accuracy by up to 16% and reduce the\ntransmission demand by up to 49%, compared with other benchmarks.\n","authors":["Yan Cheng","Peng Yang","Ning Zhang","Jiawei Hou"],"pdf_url":"https://arxiv.org/pdf/2308.16417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16413v1","updated":"2023-08-31T02:54:30Z","published":"2023-08-31T02:54:30Z","title":"Edge-Assisted On-Device Model Update for Video Analytics in Adverse\n Environments","summary":" While large deep neural networks excel at general video analytics tasks, the\nsignificant demand on computing capacity makes them infeasible for real-time\ninference on resource-constrained end cameras. In this paper, we propose an\nedge-assisted framework that continuously updates the lightweight model\ndeployed on the end cameras to achieve accurate predictions in adverse\nenvironments. This framework consists of three modules, namely, a key frame\nextractor, a trigger controller, and a retraining manager. The low-cost key\nframe extractor obtains frames that can best represent the current environment.\nThose frames are then transmitted and buffered as the retraining data for model\nupdate at the edge server. Once the trigger controller detects a significant\naccuracy drop in the selected frames, the retraining manager outputs the\noptimal retraining configuration balancing the accuracy and time cost. We\nprototype our system on two end devices of different computing capacities with\none edge server. 
The results demonstrate that our approach significantly\nimproves accuracy across all tested adverse environment scenarios (up to 24%)\nand reduces more than 50% of the retraining time compared to existing\nbenchmarks.\n","authors":["Yuxin Kong","Peng Yang","Yan Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.16413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16383v1","updated":"2023-08-31T01:00:59Z","published":"2023-08-31T01:00:59Z","title":"Separate and Locate: Rethink the Text in Text-based Visual Question\n Answering","summary":" Text-based Visual Question Answering (TextVQA) aims at answering questions\nabout the text in images. Most works in this field focus on designing network\nstructures or pre-training tasks. All these methods list the OCR texts in\nreading order (from left to right and top to bottom) to form a sequence, which\nis treated as a natural language ``sentence''. However, they ignore the fact\nthat most OCR words in the TextVQA task do not have a semantical contextual\nrelationship. In addition, these approaches use 1-D position embedding to\nconstruct the spatial relation between OCR tokens sequentially, which is not\nreasonable. The 1-D position embedding can only represent the left-right\nsequence relationship between words in a sentence, but not the complex spatial\nposition relationship. To tackle these problems, we propose a novel method\nnamed Separate and Locate (SaL) that explores text contextual cues and designs\nspatial position embedding to construct spatial relations between OCR texts.\nSpecifically, we propose a Text Semantic Separate (TSS) module that helps the\nmodel recognize whether words have semantic contextual relations. Then, we\nintroduce a Spatial Circle Position (SCP) module that helps the model better\nconstruct and reason the spatial position relationships between OCR texts. Our\nSaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA\nand ST-VQA datasets. Compared with the pre-training state-of-the-art method\npre-trained on 64 million pre-training samples, our method, without any\npre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on\nTextVQA and ST-VQA. Our code and models will be released at\nhttps://github.com/fangbufang/SaL.\n","authors":["Chengyang Fang","Jiangnan Li","Liang Li","Can Ma","Dayong Hu"],"pdf_url":"https://arxiv.org/pdf/2308.16383v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.11300v2","updated":"2023-08-31T22:33:54Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Foundation Models utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a\nnew framework that includes the Domain Foundation Model (DFM), bridging the gap\nbetween the General Foundation Model (GFM) and domain-specific downstream\ntasks. 
Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\ntried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the\nDFM. Experimental results show that our proposed dataset are highly effective\nfor various tasks, improving upon the baseline by $8 \\% \\sim 16 \\%$ in\nzero-shot classification tasks, and obtaining good results in both\nVision-Language Retrieval and Semantic Localization tasks.\n\\url{https://github.com/om-ai-lab/RS5M}\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v2.pdf","comment":"RS5M dataset v4"}]},"2023-09-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.00615v1","updated":"2023-09-01T17:59:47Z","published":"2023-09-01T17:59:47Z","title":"Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D\n Understanding, Generation, and Instruction Following","summary":" We introduce Point-Bind, a 3D multi-modality model aligning point clouds with\n2D image, language, audio, and video. Guided by ImageBind, we construct a joint\nembedding space between 3D and multi-modalities, enabling many promising\napplications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D\nopen-world understanding. On top of this, we further present Point-LLM, the\nfirst 3D large language model (LLM) following 3D multi-modal instructions. By\nparameter-efficient fine-tuning techniques, Point-LLM injects the semantics of\nPoint-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction\ndata, but exhibits superior 3D and multi-modal question-answering capacity. We\nhope our work may cast a light on the community for extending 3D point clouds\nto multi-modality applications. Code is available at\nhttps://github.com/ZiyuGuo99/Point-Bind_Point-LLM.\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Yiwen Tang","Xianzheng Ma","Jiaming Han","Kexin Chen","Peng Gao","Xianzhi Li","Hongsheng Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2309.00615v1.pdf","comment":"Work in progress. Code is available at\n https://github.com/ZiyuGuo99/Point-Bind_Point-LLM"},{"id":"http://arxiv.org/abs/2309.00614v1","updated":"2023-09-01T17:59:44Z","published":"2023-09-01T17:59:44Z","title":"Baseline Defenses for Adversarial Attacks Against Aligned Language\n Models","summary":" As Large Language Models quickly become ubiquitous, their security\nvulnerabilities are critical to understand. Recent work shows that text\noptimizers can produce jailbreaking prompts that bypass moderation and\nalignment. Drawing from the rich body of work on adversarial machine learning,\nwe approach these attacks with three questions: What threat models are\npractically useful in this domain? How do baseline defense techniques perform\nin this new domain? How does LLM security differ from computer vision?\n We evaluate several baseline defense strategies against leading adversarial\nattacks on LLMs, discussing the various settings in which each is feasible and\neffective. Particularly, we look at three types of defenses: detection\n(perplexity based), input preprocessing (paraphrase and retokenization), and\nadversarial training. 
We discuss white-box and gray-box settings and discuss\nthe robustness-performance trade-off for each of the defenses considered.\nSurprisingly, we find much more success with filtering and preprocessing than\nwe would expect from other domains, such as vision, providing a first\nindication that the relative strengths of these defenses may be weighed\ndifferently in these domains.\n","authors":["Neel Jain","Avi Schwarzschild","Yuxin Wen","Gowthami Somepalli","John Kirchenbauer","Ping-yeh Chiang","Micah Goldblum","Aniruddha Saha","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2309.00614v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2302.08468v3","updated":"2023-09-01T17:37:42Z","published":"2023-02-16T18:23:22Z","title":"LEVER: Learning to Verify Language-to-Code Generation with Execution","summary":" The advent of large language models trained on code (code LLMs) has led to\nsignificant progress in language-to-code generation. State-of-the-art\napproaches in this area combine LLM decoding with sample pruning and reranking\nusing test cases or heuristics based on the execution results. However, it is\nchallenging to obtain test cases for many real-world language-to-code\napplications, and heuristics cannot well capture the semantic features of the\nexecution results, such as data type and value range, which often indicates the\ncorrectness of the program. In this work, we propose LEVER, a simple approach\nto improve language-to-code generation by learning to verify the generated\nprograms with their execution results. Specifically, we train verifiers to\ndetermine whether a program sampled from the LLMs is correct or not based on\nthe natural language input, the program itself and its execution results. The\nsampled programs are reranked by combining the verification score with the LLM\ngeneration probability, and marginalizing over programs with the same execution\nresults. On four datasets across the domains of table QA, math QA and basic\nPython programming, LEVER consistently improves over the base code LLMs(4.6% to\n10.9% with code-davinci-002) and achieves new state-of-the-art results on all\nof them.\n","authors":["Ansong Ni","Srini Iyer","Dragomir Radev","Ves Stoyanov","Wen-tau Yih","Sida I. Wang","Xi Victoria Lin"],"pdf_url":"https://arxiv.org/pdf/2302.08468v3.pdf","comment":"ICML'23; code available at https://github.com/niansong1996/lever"},{"id":"http://arxiv.org/abs/2308.10248v2","updated":"2023-09-01T17:07:29Z","published":"2023-08-20T12:21:05Z","title":"Activation Addition: Steering Language Models Without Optimization","summary":" Reliably controlling the behavior of large language models is a pressing open\nproblem. Existing methods include supervised finetuning, reinforcement learning\nfrom human feedback, prompt engineering, and guided decoding. We instead\ninvestigate activation engineering: modifying activations at inference time to\npredictably alter model behavior. In particular, we bias the forward pass with\nan added 'steering vector' implicitly specified through natural language.\n Unlike past work which learned these steering vectors, our Activation\nAddition (ActAdd) method computes them by taking the activation differences\nthat result from pairs of prompts. We demonstrate ActAdd on GPT-2 on\nOpenWebText and ConceptNet. Our inference-time approach yields control over\nhigh-level properties of output and preserves off-target model performance. 
It\ninvolves far less compute and implementation effort than finetuning, allows\nusers to provide natural language specifications, and its overhead scales\nnaturally with model size.\n","authors":["Alexander Matt Turner","Lisa Thiergart","David Udell","Gavin Leech","Ulisse Mini","Monte MacDiarmid"],"pdf_url":"https://arxiv.org/pdf/2308.10248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15245v3","updated":"2023-09-01T16:11:40Z","published":"2023-06-27T06:58:03Z","title":"C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue\n Evaluation","summary":" Existing reference-free turn-level evaluation metrics for chatbots\ninadequately capture the interaction between the user and the system.\nConsequently, they often correlate poorly with human evaluations. To address\nthis issue, we propose a novel model-agnostic approach that leverages\nConditional Pointwise Mutual Information (C-PMI) to measure the turn-level\ninteraction between the system and the user based on a given evaluation\ndimension. Experimental results on the widely used FED dialogue evaluation\ndataset demonstrate that our approach significantly improves the correlation\nwith human judgment compared with existing evaluation systems. By replacing the\nnegative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve\na relative 62.6% higher Spearman correlation on average for the FED evaluation\nmetric. Our code is publicly available at https://github.com/renll/C-PMI.\n","authors":["Liliang Ren","Mankeerat Sidhu","Qi Zeng","Revanth Gangi Reddy","Heng Ji","ChengXiang Zhai"],"pdf_url":"https://arxiv.org/pdf/2306.15245v3.pdf","comment":"Published at ACL2023 DialDoc Workshop; Updated Results"},{"id":"http://arxiv.org/abs/2306.11702v2","updated":"2023-09-01T15:40:40Z","published":"2023-06-20T17:30:02Z","title":"Lingua Manga: A Generic Large Language Model Centric System for Data\n Curation","summary":" Data curation is a wide-ranging area which contains many critical but\ntime-consuming data processing tasks. However, the diversity of such tasks\nmakes it challenging to develop a general-purpose data curation system. To\naddress this issue, we present Lingua Manga, a user-friendly and versatile\nsystem that utilizes pre-trained large language models. Lingua Manga offers\nautomatic optimization for achieving high performance and label efficiency\nwhile facilitating flexible and rapid development. Through three example\napplications with distinct objectives and users of varying levels of technical\nproficiency, we demonstrate that Lingua Manga can effectively assist both\nskilled programmers and low-code or even no-code users in addressing data\ncuration challenges.\n","authors":["Zui Chen","Lei Cao","Sam Madden"],"pdf_url":"https://arxiv.org/pdf/2306.11702v2.pdf","comment":"4 pages, 6 figures, VLDB 2023 Demo paper"},{"id":"http://arxiv.org/abs/2307.01540v2","updated":"2023-09-01T15:31:21Z","published":"2023-07-04T07:51:37Z","title":"Learning to Prompt in the Classroom to Understand AI Limits: A pilot\n study","summary":" Artificial intelligence's (AI) progress holds great promise in tackling\npressing societal concerns such as health and climate. Large Language Models\n(LLM) and the derived chatbots, like ChatGPT, have highly improved the natural\nlanguage processing capabilities of AI systems allowing them to process an\nunprecedented amount of unstructured data. However, the ensuing excitement has\nled to negative sentiments, even as AI methods demonstrate remarkable\ncontributions (e.g. 
in health and genetics). A key factor contributing to this\nsentiment is the misleading perception that LLMs can effortlessly provide\nsolutions across domains, ignoring their limitations such as hallucinations and\nreasoning constraints. Acknowledging AI fallibility is crucial to address the\nimpact of dogmatic overconfidence in possibly erroneous suggestions generated\nby LLMs. At the same time, it can reduce fear and other negative attitudes\ntoward AI. This necessitates comprehensive AI literacy interventions that\neducate the public about LLM constraints and effective usage techniques, i.e\nprompting strategies. With this aim, a pilot educational intervention was\nperformed in a high school with 21 students. It involved presenting high-level\nconcepts about intelligence, AI, and LLMs, followed by practical exercises\ninvolving ChatGPT in creating natural educational conversations and applying\nestablished prompting strategies. Encouraging preliminary results emerged,\nincluding high appreciation of the activity, improved interaction quality with\nthe LLM, reduced negative AI sentiments, and a better grasp of limitations,\nspecifically unreliability, limited understanding of commands leading to\nunsatisfactory responses, and limited presentation flexibility. Our aim is to\nexplore AI acceptance factors and refine this approach for more controlled\nfuture studies.\n","authors":["Emily Theophilou","Cansu Koyuturk","Mona Yavari","Sathya Bursic","Gregor Donabauer","Alessia Telari","Alessia Testa","Raffaele Boiano","Davinia Hernandez-Leo","Martin Ruskov","Davide Taibi","Alessandro Gabbiadini","Dimitri Ognibene"],"pdf_url":"https://arxiv.org/pdf/2307.01540v2.pdf","comment":"Accepted for AIXIA 2023 22nd International Conference of the Italian\n Association for Artificial Intelligence 6 - 9 Nov, 2023, Rome, Italy"},{"id":"http://arxiv.org/abs/2306.14704v3","updated":"2023-09-01T15:26:45Z","published":"2023-06-26T13:54:47Z","title":"Ontology Enrichment from Texts: A Biomedical Dataset for Concept\n Discovery and Placement","summary":" Mentions of new concepts appear regularly in texts and require automated\napproaches to harvest and place them into Knowledge Bases (KB), e.g.,\nontologies and taxonomies. Existing datasets suffer from three issues, (i)\nmostly assuming that a new concept is pre-discovered and cannot support\nout-of-KB mention discovery; (ii) only using the concept label as the input\nalong with the KB and thus lacking the contexts of a concept label; and (iii)\nmostly focusing on concept placement w.r.t a taxonomy of atomic concepts,\ninstead of complex concepts, i.e., with logical operators. To address these\nissues, we propose a new benchmark, adapting MedMentions dataset (PubMed\nabstracts) with SNOMED CT versions in 2014 and 2017 under the Diseases\nsub-category and the broader categories of Clinical finding, Procedure, and\nPharmaceutical / biologic product. We provide usage on the evaluation with the\ndataset for out-of-KB mention discovery and concept placement, adapting recent\nLarge Language Model based methods.\n","authors":["Hang Dong","Jiaoyan Chen","Yuan He","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2306.14704v3.pdf","comment":"5 pages, 1 figure, accepted for CIKM 2023. 
The dataset, data\n construction scripts, and baseline implementation are available at\n https://zenodo.org/record/8228005 (Zenodo) and\n https://github.com/KRR-Oxford/OET (GitHub)"},{"id":"http://arxiv.org/abs/2305.07795v2","updated":"2023-09-01T13:44:14Z","published":"2023-05-12T23:09:06Z","title":"Constructing Holistic Measures for Social Biases in Masked Language\n Models","summary":" Masked Language Models (MLMs) have been successful in many natural language\nprocessing tasks. However, real-world stereotype biases are likely to be\nreflected in MLMs due to their learning from large text corpora. Most of the\nevaluation metrics proposed in the past adopt different masking strategies,\ndesigned with the log-likelihood of MLMs. They lack holistic considerations\nsuch as variance for stereotype bias and anti-stereotype bias samples. In this\npaper, the log-likelihoods of stereotype bias and anti-stereotype bias samples\noutput by MLMs are considered Gaussian distributions. Two evaluation metrics,\nKullback Leibler Divergence Score (KLDivS) and Jensen Shannon Divergence Score\n(JSDivS) are proposed to evaluate social biases in MLMs The experimental\nresults on the public datasets StereoSet and CrowS-Pairs demonstrate that\nKLDivS and JSDivS are more stable and interpretable compared to the metrics\nproposed in the past.\n","authors":["Yang Liu","Yuexian Hou"],"pdf_url":"https://arxiv.org/pdf/2305.07795v2.pdf","comment":"We need to change the methodology in the paper appropriately cause us\n to change the title of the paper, so we need to withdraw it and subsequently\n resubmit the new version"},{"id":"http://arxiv.org/abs/2306.16805v2","updated":"2023-09-01T12:53:44Z","published":"2023-06-29T09:35:53Z","title":"CLIPAG: Towards Generator-Free Text-to-Image Generation","summary":" Perceptually Aligned Gradients (PAG) refer to an intriguing property observed\nin robust image classification models, wherein their input gradients align with\nhuman perception and pose semantic meanings. While this phenomenon has gained\nsignificant research attention, it was solely studied in the context of\nunimodal vision-only architectures. In this work, we extend the study of PAG to\nVision-Language architectures, which form the foundations for diverse\nimage-text tasks and applications. Through an adversarial robustification\nfinetuning of CLIP, we demonstrate that robust Vision-Language models exhibit\nPAG in contrast to their vanilla counterparts. This work reveals the merits of\nCLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we\nshow that seamlessly integrating CLIPAG in a \"plug-n-play\" manner leads to\nsubstantial improvements in vision-language generative applications.\nFurthermore, leveraging its PAG property, CLIPAG enables text-to-image\ngeneration without any generative model, which typically requires huge\ngenerators.\n","authors":["Roy Ganz","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2306.16805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00424v1","updated":"2023-09-01T12:35:43Z","published":"2023-09-01T12:35:43Z","title":"CPSP: Learning Speech Concepts From Phoneme Supervision","summary":" For fine-grained generation and recognition tasks such as\nminimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic\nspeech recognition (ASR), the intermediate representation extracted from speech\nshould contain information that is between text coding and acoustic coding. 
The\nlinguistic content is salient, while the paralinguistic information such as\nspeaker identity and acoustic details should be removed. However, existing\nmethods for extracting fine-grained intermediate representations from speech\nsuffer from issues of excessive redundancy and dimension explosion.\nAdditionally, existing contrastive learning methods in the audio field focus on\nextracting global descriptive information for downstream audio classification\ntasks, making them unsuitable for TTS, VC, and ASR tasks. To address these\nissues, we propose a method named Contrastive Phoneme-Speech Pretraining\n(CPSP), which uses three encoders, one decoder, and contrastive learning to\nbring phoneme and speech into a joint multimodal space, learning how to connect\nphoneme and speech at the frame level. The CPSP model is trained on 210k speech\nand phoneme text pairs, achieving minimally-supervised TTS, VC, and ASR. The\nproposed CPSP method offers a promising solution for fine-grained generation\nand recognition downstream tasks in speech processing. We provide a website\nwith audio samples.\n","authors":["Chunyu Qiang","Hao Li","Yixin Tian","Ruibo Fu","Tao Wang","Longbiao Wang","Jianwu Dang"],"pdf_url":"https://arxiv.org/pdf/2309.00424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15484v2","updated":"2023-09-01T12:16:20Z","published":"2023-07-28T11:20:23Z","title":"Minimally-Supervised Speech Synthesis with Conditional Diffusion Model\n and Language Model: A Comparative Study of Semantic Coding","summary":" Recently, there has been a growing interest in text-to-speech (TTS) methods\nthat can be trained with minimal supervision by combining two types of discrete\nspeech representations and using two sequence-to-sequence tasks to decouple\nTTS. However, existing methods suffer from three problems: the high\ndimensionality and waveform distortion of discrete speech representations, the\nprosodic averaging problem caused by the duration prediction model in\nnon-autoregressive frameworks, and the information redundancy and dimension\nexplosion problems of existing semantic encoding methods. To address these\nproblems, three progressive methods are proposed. First, we propose\nDiff-LM-Speech, an autoregressive structure consisting of a language model and\ndiffusion models, which models the semantic embedding into the mel-spectrogram\nbased on a diffusion model to achieve higher audio quality. We also introduce a\nprompt encoder structure based on a variational autoencoder and a prosody\nbottleneck to improve prompt representation ability. Second, we propose\nTetra-Diff-Speech, a non-autoregressive structure consisting of four diffusion\nmodel-based modules that design a duration diffusion model to achieve diverse\nprosodic expressions. Finally, we propose Tri-Diff-Speech, a non-autoregressive\nstructure consisting of three diffusion model-based modules that verify the\nnon-necessity of existing semantic encoding models and achieve the best\nresults. Experimental results show that our proposed methods outperform\nbaseline methods. 
We provide a website with audio samples.\n","authors":["Chunyu Qiang","Hao Li","Hao Ni","He Qu","Ruibo Fu","Tao Wang","Longbiao Wang","Jianwu Dang"],"pdf_url":"https://arxiv.org/pdf/2307.15484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00386v1","updated":"2023-09-01T10:49:19Z","published":"2023-09-01T10:49:19Z","title":"Satisfiability Checking of Multi-Variable TPTL with Unilateral Intervals\n Is PSPACE-Complete","summary":" We investigate the decidability of the ${0,\\infty}$ fragment of Timed\nPropositional Temporal Logic (TPTL). We show that the satisfiability checking\nof TPTL$^{0,\\infty}$ is PSPACE-complete. Moreover, even its 1-variable fragment\n(1-TPTL$^{0,\\infty}$) is strictly more expressive than Metric Interval Temporal\nLogic (MITL) for which satisfiability checking is EXPSPACE complete. Hence, we\nhave a strictly more expressive logic with computationally easier\nsatisfiability checking. To the best of our knowledge, TPTL$^{0,\\infty}$ is the\nfirst multi-variable fragment of TPTL for which satisfiability checking is\ndecidable without imposing any bounds/restrictions on the timed words (e.g.\nbounded variability, bounded time, etc.). The membership in PSPACE is obtained\nby a reduction to the emptiness checking problem for a new \"non-punctual\"\nsubclass of Alternating Timed Automata with multiple clocks called Unilateral\nVery Weak Alternating Timed Automata (VWATA$^{0,\\infty}$) which we prove to be\nin PSPACE. We show this by constructing a simulation equivalent\nnon-deterministic timed automata whose number of clocks is polynomial in the\nsize of the given VWATA$^{0,\\infty}$.\n","authors":["Shankara Narayanan Krishna","Khushraj Nanik Madnani","Rupak Majumdar","Paritosh K. Pandya"],"pdf_url":"https://arxiv.org/pdf/2309.00386v1.pdf","comment":"Accepted in Concur 2023"},{"id":"http://arxiv.org/abs/2309.00384v1","updated":"2023-09-01T10:44:36Z","published":"2023-09-01T10:44:36Z","title":"BatchPrompt: Accomplish more with less","summary":" Many LLMs are trained to perform zero-shot or few-shot inference using\ninstruction-based prompts. Crafting prompts for these LLMs typically requires\nthe user to provide a detailed task description, examples of context and\ncompletion, and single example of context for inference. This regular prompt\nbaseline is referred to as SinglePrompt in this paper. However, for NLP tasks\nwhere each data point for inference is not necessarily lengthy, the token count\nfor instructions and few-shot examples in the prompt may be considerably larger\nthan that of the data point, resulting in lower token-resource utilization\ncompared with encoder-based models like fine-tuned BERT. This cost-efficiency\nissue, affecting inference speed and compute budget, counteracts the many\nbenefits LLMs have to offer. This paper aims to alleviate the preceding problem\nby batching multiple data points into a single prompt, a prompting strategy we\nrefer to as BatchPrompt. This strategy increases the density of data points,\nwhich in turn leads to improved token utilization. Applying BatchPrompt\nnaively, however, is very challenging due to significant performance\ndegradation, as observed in our experiments. We also noticed varying inference\noutcomes for the same data point appearing in different positions within a\nprompt. 
To address the quality issue while remain high token-resource\nutilization, we introduce Batch Permutation and Ensembling for BatchPrompt, a\nsimple way that recovers labeling quality through majority votes from data\npoints placed in varying positions in a batch at the price of more token usage.\nTo counterbalance the additional token usage caused by the voting process, we\nfurther propose Self-reflection-guided EArly Stopping, which can terminate the\nvoting process early for data points the LLM confidently handles.\n","authors":["Jianzhe Lin","Maurice Diesendruck","Liang Du","Robin Abraham"],"pdf_url":"https://arxiv.org/pdf/2309.00384v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.00378v1","updated":"2023-09-01T10:27:04Z","published":"2023-09-01T10:27:04Z","title":"Long-Term Memorability On Advertisements","summary":" Marketers spend billions of dollars on advertisements but to what end? At the\npurchase time, if customers cannot recognize a brand for which they saw an ad,\nthe money spent on the ad is essentially wasted. Despite its importance in\nmarketing, until now, there has been no study on the memorability of ads in the\nML literature. Most studies have been conducted on short-term recall (<5 mins)\non specific content types like object and action videos. On the other hand, the\nadvertising industry only cares about long-term memorability (a few hours or\nlonger), and advertisements are almost always highly multimodal, depicting a\nstory through its different modalities (text, images, and videos). With this\nmotivation, we conduct the first large scale memorability study consisting of\n1203 participants and 2205 ads covering 276 brands. Running statistical tests\nover different participant subpopulations and ad-types, we find many\ninteresting insights into what makes an ad memorable - both content and human\nfactors. For example, we find that brands which use commercials with fast\nmoving scenes are more memorable than those with slower scenes (p=8e-10) and\nthat people who use ad-blockers remember lower number of ads than those who\ndon't (p=5e-3). Further, with the motivation of simulating the memorability of\nmarketing materials for a particular audience, ultimately helping create one,\nwe present a novel model, Sharingan, trained to leverage real-world knowledge\nof LLMs and visual knowledge of visual encoders to predict the memorability of\na content. We test our model on all the prominent memorability datasets in\nliterature (both images and videos) and achieve state of the art across all of\nthem. We conduct extensive ablation studies across memory types, modality,\nbrand, and architectural choices to find insights into what drives memory.\n","authors":["Harini S I","Somesh Singh","Yaman K Singla","Aanisha Bhattacharyya","Veeky Baths","Changyou Chen","Rajiv Ratn Shah","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00368v1","updated":"2023-09-01T09:54:28Z","published":"2023-09-01T09:54:28Z","title":"When Do Discourse Markers Affect Computational Sentence Understanding?","summary":" The capabilities and use cases of automatic natural language processing (NLP)\nhave grown significantly over the last few years. While much work has been\ndevoted to understanding how humans deal with discourse connectives, this\nphenomenon is understudied in computational systems. 
Therefore, it is important\nto put NLP models under the microscope and examine whether they can adequately\ncomprehend, process, and reason within the complexity of natural language. In\nthis chapter, we introduce the main mechanisms behind automatic sentence\nprocessing systems step by step and then focus on evaluating discourse\nconnective processing. We assess nine popular systems in their ability to\nunderstand English discourse connectives and analyze how context and language\nunderstanding tasks affect their connective comprehension. The results show\nthat NLP systems do not process all discourse connectives equally well and that\nthe computational processing complexity of different connective kinds is not\nalways consistently in line with the presumed complexity order found in human\nprocessing. In addition, while humans are more inclined to be influenced during\nthe reading procedure but not necessarily in the final comprehension\nperformance, discourse connectives have a significant impact on the final\naccuracy of NLP systems. The richer knowledge of connectives a system learns,\nthe more negative effect inappropriate connectives have on it. This suggests\nthat the correct explicitation of discourse connectives is important for\ncomputational natural language processing.\n","authors":["Ruiqi Li","Liesbeth Allein","Damien Sileo","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2309.00368v1.pdf","comment":"Chapter 7 of Discourse Markers in Interaction, published in Trends in\n Linguistics. Studies and Monographs"},{"id":"http://arxiv.org/abs/2309.00359v1","updated":"2023-09-01T09:34:49Z","published":"2023-09-01T09:34:49Z","title":"Large Content And Behavior Models To Understand, Simulate, And Optimize\n Content And Behavior","summary":" Shannon, in his seminal paper introducing information theory, divided the\ncommunication into three levels: technical, semantic, and effectivenss. While\nthe technical level is concerned with accurate reconstruction of transmitted\nsymbols, the semantic and effectiveness levels deal with the inferred meaning\nand its effect on the receiver. Thanks to telecommunications, the first level\nproblem has produced great advances like the internet. Large Language Models\n(LLMs) make some progress towards the second goal, but the third level still\nremains largely untouched. The third problem deals with predicting and\noptimizing communication for desired receiver behavior. LLMs, while showing\nwide generalization capabilities across a wide range of tasks, are unable to\nsolve for this. One reason for the underperformance could be a lack of\n\"behavior tokens\" in LLMs' training corpora. Behavior tokens define receiver\nbehavior over a communication, such as shares, likes, clicks, purchases,\nretweets, etc. While preprocessing data for LLM training, behavior tokens are\noften removed from the corpora as noise. Therefore, in this paper, we make some\ninitial progress towards reintroducing behavior tokens in LLM training. The\ntrained models, other than showing similar performance to LLMs on content\nunderstanding tasks, show generalization capabilities on behavior simulation,\ncontent simulation, behavior understanding, and behavior domain adaptation.\nUsing a wide range of tasks on two corpora, we show results on all these\ncapabilities. 
We call these models Large Content and Behavior Models (LCBMs).\nFurther, to spur more research on LCBMs, we release our new Content Behavior\nCorpus (CBC), a repository containing communicator, message, and corresponding\nreceiver behavior.\n","authors":["Ashmit Khandelwal","Aditya Agrawal","Aanisha Bhattacharyya","Yaman K Singla","Somesh Singh","Uttaran Bhattacharya","Ishita Dasgupta","Stefano Petrangeli","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07863v3","updated":"2023-09-01T07:58:52Z","published":"2023-02-15T18:55:29Z","title":"Speculative Decoding with Big Little Decoder","summary":" The recent emergence of Large Language Models based on the Transformer\narchitecture has enabled dramatic advancements in the field of Natural Language\nProcessing. However, these models have long inference latency, which limits\ntheir deployment, and which makes them prohibitively expensive for various\nreal-time applications. The inference latency is further exacerbated by\nautoregressive generative tasks, as models need to run iteratively to generate\ntokens sequentially without leveraging token-level parallelization. To address\nthis, we propose Big Little Decoder (BiLD), a framework that can improve\ninference efficiency and latency for a wide range of text generation\napplications. The BiLD framework contains two models with different sizes that\ncollaboratively generate text. The small model runs autoregressively to\ngenerate text with a low inference cost, and the large model is only invoked\noccasionally to refine the small model's inaccurate predictions in a\nnon-autoregressive manner. To coordinate the small and large models, BiLD\nintroduces two simple yet effective policies: (1) the fallback policy that\ndetermines when to hand control over to the large model; and (2) the rollback\npolicy that determines when the large model needs to correct the small model's\ninaccurate predictions. To evaluate our framework across different tasks and\nmodels, we apply BiLD to various text generation scenarios encompassing machine\ntranslation on IWSLT 2017 De-En and WMT 2014 De-En, and summarization on XSUM\nand CNN/DailyMail. On an NVIDIA T4 GPU, our framework achieves a speedup of up\nto 2.12x speedup with minimal generation quality degradation. Furthermore, our\nframework is fully plug-and-play and can be applied without any modifications\nin the training process or model architecture. Our code is open-sourced\n","authors":["Sehoon Kim","Karttikeya Mangalam","Suhong Moon","John Canny","Jitendra Malik","Michael W. Mahoney","Amir Gholami","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2302.07863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00312v1","updated":"2023-09-01T07:53:28Z","published":"2023-09-01T07:53:28Z","title":"Comparative Topic Modeling for Determinants of Divergent Report Results\n Applied to Macular Degeneration Studies","summary":" Topic modeling and text mining are subsets of Natural Language Processing\nwith relevance for conducting meta-analysis (MA) and systematic review (SR).\nFor evidence synthesis, the above NLP methods are conventionally used for\ntopic-specific literature searches or extracting values from reports to\nautomate essential phases of SR and MA. Instead, this work proposes a\ncomparative topic modeling approach to analyze reports of contradictory results\non the same general research question. 
Specifically, the objective is to find\ntopics exhibiting distinct associations with significant results for an outcome\nof interest by ranking them according to their proportional occurrence and\nconsistency of distribution across reports of significant results. The proposed\nmethod was tested on broad-scope studies addressing whether supplemental\nnutritional compounds significantly benefit macular degeneration (MD). Eight\ncompounds were identified as having a particular association with reports of\nsignificant results for benefitting MD. Six of these were further supported in\nterms of effectiveness upon conducting a follow-up literature search for\nvalidation (omega-3 fatty acids, copper, zeaxanthin, lutein, zinc, and\nnitrates). The two not supported by the follow-up literature search (niacin and\nmolybdenum) also had the lowest scores under the proposed methods ranking\nsystem, suggesting that the proposed method's score for a given topic is a\nviable proxy for its degree of association with the outcome of interest. These\nresults underpin the proposed methods potential to add specificity in\nunderstanding effects from broad-scope reports, elucidate topics of interest\nfor future research, and guide evidence synthesis in a systematic and scalable\nway.\n","authors":["Lucas Cassiel Jacaruso"],"pdf_url":"https://arxiv.org/pdf/2309.00312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11259v4","updated":"2023-09-01T07:50:44Z","published":"2023-01-26T17:52:56Z","title":"Domain-Agnostic Molecular Generation with Self-feedback","summary":" The generation of molecules with desired properties has gained tremendous\npopularity, revolutionizing the way scientists design molecular structures and\nproviding valuable support for chemical and drug design. However, despite the\npotential of language models in molecule generation, they face numerous\nchallenges such as the generation of syntactically or chemically flawed\nmolecules, narrow domain focus, and limitations in creating diverse and\ndirectionally feasible molecules due to a dearth of annotated data or external\nmolecular databases. To this end, we introduce MolGen, a pre-trained molecular\nlanguage model tailored specifically for molecule generation. MolGen acquires\nintrinsic structural and grammatical insights by reconstructing over 100\nmillion molecular SELFIES, while facilitating knowledge transfer between\ndifferent domains through domain-agnostic molecular prefix tuning. Moreover, we\npresent a self-feedback paradigm that inspires the pre-trained model to align\nwith the ultimate goal of producing molecules with desirable properties.\nExtensive experiments on well-known benchmarks confirm MolGen's optimization\ncapabilities, encompassing penalized logP, QED, and molecular docking\nproperties. Further analysis shows that MolGen can accurately capture molecule\ndistributions, implicitly learn their structural characteristics, and\nefficiently explore chemical space. The pre-trained model, codes, and datasets\nare publicly available for future research at https://github.com/zjunlp/MolGen.\n","authors":["Yin Fang","Ningyu Zhang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.11259v4.pdf","comment":"Work in progress. 
Add results of binding affinity"},{"id":"http://arxiv.org/abs/2309.00284v1","updated":"2023-09-01T06:40:41Z","published":"2023-09-01T06:40:41Z","title":"Enhancing the vocal range of single-speaker singing voice synthesis with\n melody-unsupervised pre-training","summary":" The single-speaker singing voice synthesis (SVS) usually underperforms at\npitch values that are out of the singer's vocal range or associated with\nlimited training samples. Based on our previous work, this work proposes a\nmelody-unsupervised multi-speaker pre-training method conducted on a\nmulti-singer dataset to enhance the vocal range of the single-speaker, while\nnot degrading the timbre similarity. This pre-training method can be deployed\nto a large-scale multi-singer dataset, which only contains audio-and-lyrics\npairs without phonemic timing information and pitch annotation. Specifically,\nin the pre-training step, we design a phoneme predictor to produce the\nframe-level phoneme probability vectors as the phonemic timing information and\na speaker encoder to model the timbre variations of different singers, and\ndirectly estimate the frame-level f0 values from the audio to provide the pitch\ninformation. These pre-trained model parameters are delivered into the\nfine-tuning step as prior knowledge to enhance the single speaker's vocal\nrange. Moreover, this work also contributes to improving the sound quality and\nrhythm naturalness of the synthesized singing voices. It is the first to\nintroduce a differentiable duration regulator to improve the rhythm naturalness\nof the synthesized voice, and a bi-directional flow model to improve the sound\nquality. Experimental results verify that the proposed SVS system outperforms\nthe baseline on both sound quality and naturalness.\n","authors":["Shaohuan Zhou","Xu Li","Zhiyong Wu","Ying Shan","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2309.00284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00267v1","updated":"2023-09-01T05:53:33Z","published":"2023-09-01T05:53:33Z","title":"RLAIF: Scaling Reinforcement Learning from Human Feedback with AI\n Feedback","summary":" Reinforcement learning from human feedback (RLHF) is effective at aligning\nlarge language models (LLMs) to human preferences, but gathering high quality\nhuman preference labels is a key bottleneck. We conduct a head-to-head\ncomparison of RLHF vs. RL from AI Feedback (RLAIF) - a technique where\npreferences are labeled by an off-the-shelf LLM in lieu of humans, and we find\nthat they result in similar improvements. On the task of summarization, human\nevaluators prefer generations from both RLAIF and RLHF over a baseline\nsupervised fine-tuned model in ~70% of cases. Furthermore, when asked to rate\nRLAIF vs. RLHF summaries, humans prefer both at equal rates. These results\nsuggest that RLAIF can yield human-level performance, offering a potential\nsolution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Kellie Lu","Thomas Mesnard","Colton Bishop","Victor Carbune","Abhinav Rastogi"],"pdf_url":"https://arxiv.org/pdf/2309.00267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00254v1","updated":"2023-09-01T05:09:49Z","published":"2023-09-01T05:09:49Z","title":"Why do universal adversarial attacks work on large language models?:\n Geometry might be the answer","summary":" Transformer based large language models with emergent capabilities are\nbecoming increasingly ubiquitous in society. 
However, the task of understanding\nand interpreting their internal workings, in the context of adversarial\nattacks, remains largely unsolved. Gradient-based universal adversarial attacks\nhave been shown to be highly effective on large language models and potentially\ndangerous due to their input-agnostic nature. This work presents a novel\ngeometric perspective explaining universal adversarial attacks on large\nlanguage models. By attacking the 117M parameter GPT-2 model, we find evidence\nindicating that universal adversarial triggers could be embedding vectors which\nmerely approximate the semantic information in their adversarial training\nregion. This hypothesis is supported by white-box model analysis comprising\ndimensionality reduction and similarity measurement of hidden representations.\nWe believe this new geometric perspective on the underlying mechanism driving\nuniversal attacks could help us gain deeper insight into the internal workings\nand failure modes of LLMs, thus enabling their mitigation.\n","authors":["Varshini Subhash","Anna Bialas","Weiwei Pan","Finale Doshi-Velez"],"pdf_url":"https://arxiv.org/pdf/2309.00254v1.pdf","comment":"2nd AdvML Frontiers Workshop at 40th International Conference on\n Machine Learning, Honolulu, Hawaii, USA, 2023"},{"id":"http://arxiv.org/abs/2211.13854v2","updated":"2023-09-01T05:07:18Z","published":"2022-11-25T01:37:48Z","title":"ComCLIP: Training-Free Compositional Image and Text Matching","summary":" Contrastive Language-Image Pretraining (CLIP) has demonstrated great\nzero-shot performance for matching images and text. However, it is still\nchallenging to adapt vision-lanaguage pretrained models like CLIP to\ncompositional image and text matching -- a more challenging image and text\nmatching task requiring the model understanding of compositional word concepts\nand visual components. Towards better compositional generalization in zero-shot\nimage and text matching, in this paper, we study the problem from a causal\nperspective: the erroneous semantics of individual entities are essentially\nconfounders that cause the matching failure. Therefore, we propose a novel\n\\textbf{\\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP\ndisentangles input images into subjects, objects, and action sub-images and\ncomposes CLIP's vision encoder and text encoder to perform evolving matching\nover compositional text embedding and sub-image embeddings. In this way,\nComCLIP can mitigate spurious correlations introduced by the pretrained CLIP\nmodels and dynamically evaluate the importance of each component. 
Experiments\non four compositional image-text matching datasets: SVO, ComVG, Winoground, and\nVL-checklist, and two general image-text retrieval datasets: Flick30K, and\nMSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts\nthe \\textbf{\\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even\nwithout further training or fine-tuning.\n","authors":["Kenan Jiang","Xuehai He","Ruize Xu","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.13854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11991v2","updated":"2023-09-01T04:52:38Z","published":"2023-07-22T06:21:41Z","title":"Psy-LLM: Scaling up Global Mental Health Psychological Services with\n AI-based Large Language Models","summary":" The demand for psychological counselling has grown significantly in recent\nyears, particularly with the global outbreak of COVID-19, which has heightened\nthe need for timely and professional mental health support. Online\npsychological counselling has emerged as the predominant mode of providing\nservices in response to this demand. In this study, we propose the Psy-LLM\nframework, an AI-based assistive tool leveraging Large Language Models (LLMs)\nfor question-answering in psychological consultation settings to ease the\ndemand for mental health professions. Our framework combines pre-trained LLMs\nwith real-world professional Q\\&A from psychologists and extensively crawled\npsychological articles. The Psy-LLM framework serves as a front-end tool for\nhealthcare professionals, allowing them to provide immediate responses and\nmindfulness activities to alleviate patient stress. Additionally, it functions\nas a screening tool to identify urgent cases requiring further assistance. We\nevaluated the framework using intrinsic metrics, such as perplexity, and\nextrinsic evaluation metrics, with human participant assessments of response\nhelpfulness, fluency, relevance, and logic. The results demonstrate the\neffectiveness of the Psy-LLM framework in generating coherent and relevant\nanswers to psychological questions. This article discusses the potential and\nlimitations of using large language models to enhance mental health support\nthrough AI technologies.\n","authors":["Tin Lai","Yukun Shi","Zicong Du","Jiajie Wu","Ken Fu","Yichao Dou","Ziqi Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00246v1","updated":"2023-09-01T04:30:59Z","published":"2023-09-01T04:30:59Z","title":"Detecting Suicidality in Arabic Tweets Using Machine Learning and Deep\n Learning Techniques","summary":" Social media platforms have revolutionized traditional communication\ntechniques by enabling people globally to connect instantaneously, openly, and\nfrequently. People use social media to share personal stories and express their\nopinion. Negative emotions such as thoughts of death, self-harm, and hardship\nare commonly expressed on social media, particularly among younger generations.\nAs a result, using social media to detect suicidal thoughts will help provide\nproper intervention that will ultimately deter others from self-harm and\ncommitting suicide and stop the spread of suicidal ideation on social media. 
To\ninvestigate the ability to detect suicidal thoughts in Arabic tweets\nautomatically, we developed a novel Arabic suicidal tweets dataset, examined\nseveral machine learning models, including Na\\\"ive Bayes, Support Vector\nMachine, K-Nearest Neighbor, Random Forest, and XGBoost, trained on word\nfrequency and word embedding features, and investigated the ability of\npre-trained deep learning models, AraBert, AraELECTRA, and AraGPT2, to identify\nsuicidal thoughts in Arabic tweets. The results indicate that SVM and RF models\ntrained on character n-gram features provided the best performance in the\nmachine learning models, with 86% accuracy and an F1 score of 79%. The results\nof the deep learning models show that AraBert model outperforms other machine\nand deep learning models, achieving an accuracy of 91\\% and an F1-score of 88%,\nwhich significantly improves the detection of suicidal ideation in the Arabic\ntweets dataset. To the best of our knowledge, this is the first study to\ndevelop an Arabic suicidality detection dataset from Twitter and to use\ndeep-learning approaches in detecting suicidality in Arabic posts.\n","authors":["Asma Abdulsalam","Areej Alhothali","Saleh Al-Ghamdi"],"pdf_url":"https://arxiv.org/pdf/2309.00246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00244v1","updated":"2023-09-01T04:26:55Z","published":"2023-09-01T04:26:55Z","title":"NeuroSurgeon: A Toolkit for Subnetwork Analysis","summary":" Despite recent advances in the field of explainability, much remains unknown\nabout the algorithms that neural networks learn to represent. Recent work has\nattempted to understand trained models by decomposing them into functional\ncircuits (Csord\\'as et al., 2020; Lepori et al., 2023). To advance this\nresearch, we developed NeuroSurgeon, a python library that can be used to\ndiscover and manipulate subnetworks within models in the Huggingface\nTransformers library (Wolf et al., 2019). NeuroSurgeon is freely available at\nhttps://github.com/mlepori1/NeuroSurgeon.\n","authors":["Michael A. Lepori","Ellie Pavlick","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2309.00244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00240v1","updated":"2023-09-01T04:14:39Z","published":"2023-09-01T04:14:39Z","title":"FactLLaMA: Optimizing Instruction-Following Language Models with\n External Knowledge for Automated Fact-Checking","summary":" Automatic fact-checking plays a crucial role in combating the spread of\nmisinformation. Large Language Models (LLMs) and Instruction-Following\nvariants, such as InstructGPT and Alpaca, have shown remarkable performance in\nvarious natural language processing tasks. However, their knowledge may not\nalways be up-to-date or sufficient, potentially leading to inaccuracies in\nfact-checking. To address this limitation, we propose combining the power of\ninstruction-following language models with external evidence retrieval to\nenhance fact-checking performance. Our approach involves leveraging search\nengines to retrieve relevant evidence for a given input claim. This external\nevidence serves as valuable supplementary information to augment the knowledge\nof the pretrained language model. Then, we instruct-tune an open-sourced\nlanguage model, called LLaMA, using this evidence, enabling it to predict the\nveracity of the input claim more accurately. To evaluate our method, we\nconducted experiments on two widely used fact-checking datasets: RAWFC and\nLIAR. 
The results demonstrate that our approach achieves state-of-the-art\nperformance in fact-checking tasks. By integrating external evidence, we bridge\nthe gap between the model's knowledge and the most up-to-date and sufficient\ncontext available, leading to improved fact-checking outcomes. Our findings\nhave implications for combating misinformation and promoting the dissemination\nof accurate information on online platforms. Our released materials are\naccessible at: https://thcheung.github.io/factllama.\n","authors":["Tsun-Hin Cheung","Kin-Man Lam"],"pdf_url":"https://arxiv.org/pdf/2309.00240v1.pdf","comment":"Accepted in APSIPA ASC 2023"},{"id":"http://arxiv.org/abs/2309.00238v1","updated":"2023-09-01T04:08:45Z","published":"2023-09-01T04:08:45Z","title":"ALJP: An Arabic Legal Judgment Prediction in Personal Status Cases Using\n Machine Learning Models","summary":" Legal Judgment Prediction (LJP) aims to predict judgment outcomes based on\ncase description. Several researchers have developed techniques to assist\npotential clients by predicting the outcome in the legal profession. However,\nnone of the proposed techniques were implemented in Arabic, and only a few\nattempts were implemented in English, Chinese, and Hindi. In this paper, we\ndevelop a system that utilizes deep learning (DL) and natural language\nprocessing (NLP) techniques to predict the judgment outcome from Arabic case\nscripts, especially in cases of custody and annulment of marriage. This system\nwill assist judges and attorneys in improving their work and time efficiency\nwhile reducing sentencing disparity. In addition, it will help litigants,\nlawyers, and law students analyze the probable outcomes of any given case\nbefore trial. We use a different machine and deep learning models such as\nSupport Vector Machine (SVM), Logistic regression (LR), Long Short Term Memory\n(LSTM), and Bidirectional Long Short-Term Memory (BiLSTM) using representation\ntechniques such as TF-IDF and word2vec on the developed dataset. Experimental\nresults demonstrate that compared with the five baseline methods, the SVM model\nwith word2vec and LR with TF-IDF achieve the highest accuracy of 88% and 78% in\npredicting the judgment on custody cases and annulment of marriage,\nrespectively. Furthermore, the LR and SVM with word2vec and BiLSTM model with\nTF-IDF achieved the highest accuracy of 88% and 69% in predicting the\nprobability of outcomes on custody cases and annulment of marriage,\nrespectively.\n","authors":["Salwa Abbara","Mona Hafez","Aya Kazzaz","Areej Alhothali","Alhanouf Alsolami"],"pdf_url":"https://arxiv.org/pdf/2309.00238v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00237v1","updated":"2023-09-01T04:01:20Z","published":"2023-09-01T04:01:20Z","title":"Publicly Shareable Clinical Large Language Model Built on Synthetic\n Clinical Notes","summary":" The development of large language models tailored for handling patients'\nclinical notes is often hindered by the limited accessibility and usability of\nthese notes due to strict privacy regulations. To address these challenges, we\nfirst create synthetic large-scale clinical notes using publicly available case\nreports extracted from biomedical literature. We then use these synthetic notes\nto train our specialized clinical large language model, Asclepius. While\nAsclepius is trained on synthetic data, we assess its potential performance in\nreal-world applications by evaluating it using real clinical notes. 
We\nbenchmark Asclepius against several other large language models, including\nGPT-3.5-turbo and other open-source alternatives. To further validate our\napproach using synthetic notes, we also compare Asclepius with its variants\ntrained on real clinical notes. Our findings convincingly demonstrate that\nsynthetic clinical notes can serve as viable substitutes for real ones when\nconstructing high-performing clinical language models. This conclusion is\nsupported by detailed evaluations conducted by both GPT-4 and medical\nprofessionals. All resources including weights, codes, and data used in the\ndevelopment of Asclepius are made publicly accessible for future research.\n","authors":["Sunjun Kweon","Junu Kim","Jiyoun Kim","Sujeong Im","Eunbyeol Cho","Seongsu Bae","Jungwoo Oh","Gyubok Lee","Jong Hak Moon","Seng Chan You","Seungjin Baek","Chang Hoon Han","Yoon Bin Jung","Yohan Jo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2309.00237v1.pdf","comment":"https://github.com/starmpcc/Asclepius"},{"id":"http://arxiv.org/abs/2309.00236v1","updated":"2023-09-01T03:53:40Z","published":"2023-09-01T03:53:40Z","title":"Image Hijacking: Adversarial Images can Control Generative Models at\n Runtime","summary":" Are foundation models secure from malicious actors? In this work, we focus on\nthe image input to a vision-language model (VLM). We discover image hijacks,\nadversarial images that control generative models at runtime. We introduce\nBehavior Matching, a general method for creating image hijacks, and we use it\nto explore three types of attacks. Specific string attacks generate arbitrary\noutput of the adversary's choosing. Leak context attacks leak information from\nthe context window into the output. Jailbreak attacks circumvent a model's\nsafety training. We study these attacks against LLaVA-2, a state-of-the-art VLM\nbased on CLIP and LLaMA-2, and find that all our attack types have above a 90\\%\nsuccess rate. Moreover, our attacks are automated and require only small image\nperturbations. These findings raise serious concerns about the security of\nfoundation models. If image hijacks are as difficult to defend against as\nadversarial examples in CIFAR-10, then it might be many years before a solution\nis found -- if it even exists.\n","authors":["Luke Bailey","Euan Ong","Stuart Russell","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2309.00236v1.pdf","comment":"Code is available at https://github.com/euanong/image-hijacks"},{"id":"http://arxiv.org/abs/2309.00230v1","updated":"2023-09-01T03:19:53Z","published":"2023-09-01T03:19:53Z","title":"JoTR: A Joint Transformer and Reinforcement Learning Framework for\n Dialog Policy Learning","summary":" Dialogue policy learning (DPL) is a crucial component of dialogue modelling.\nIts primary role is to determine the appropriate abstract response, commonly\nreferred to as the \"dialogue action\". Traditional DPL methodologies have\ntreated this as a sequential decision problem, using pre-defined action\ncandidates extracted from a corpus. However, these incomplete candidates can\nsignificantly limit the diversity of responses and pose challenges when dealing\nwith edge cases, which are scenarios that occur only at extreme operating\nparameters. To address these limitations, we introduce a novel framework, JoTR.\nThis framework is unique as it leverages a text-to-text Transformer-based model\nto generate flexible dialogue actions. 
Unlike traditional methods, JoTR\nformulates a word-level policy that allows for a more dynamic and adaptable\ndialogue action generation, without the need for any action templates. This\nsetting enhances the diversity of responses and improves the system's ability\nto handle edge cases effectively. In addition, JoTR employs reinforcement\nlearning with a reward-shaping mechanism to efficiently finetune the word-level\ndialogue policy, which allows the model to learn from its interactions,\nimproving its performance over time. We conducted an extensive evaluation of\nJoTR to assess its effectiveness. Our extensive evaluation shows that JoTR\nachieves state-of-the-art performance on two benchmark dialogue modelling\ntasks, as assessed by both user simulators and human evaluators.\n","authors":["Wai-Chung Kwan","Huimin Wang","Hongru Wang","Zezhong Wang","Xian Wu","Yefeng Zheng","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2309.00230v1.pdf","comment":"Our code, models and other related resources are publicly available\n at https://github.com/KwanWaiChung/JoTR"},{"id":"http://arxiv.org/abs/2309.00223v1","updated":"2023-09-01T02:56:20Z","published":"2023-09-01T02:56:20Z","title":"The FruitShell French synthesis system at the Blizzard 2023 Challenge","summary":" This paper presents a French text-to-speech synthesis system for the Blizzard\nChallenge 2023. The challenge consists of two tasks: generating high-quality\nspeech from female speakers and generating speech that closely resembles\nspecific individuals. Regarding the competition data, we conducted a screening\nprocess to remove missing or erroneous text data. We organized all symbols\nexcept for phonemes and eliminated symbols that had no pronunciation or zero\nduration. Additionally, we added word boundary and start/end symbols to the\ntext, which we have found to improve speech quality based on our previous\nexperience. For the Spoke task, we performed data augmentation according to the\ncompetition rules. We used an open-source G2P model to transcribe the French\ntexts into phonemes. As the G2P model uses the International Phonetic Alphabet\n(IPA), we applied the same transcription process to the provided competition\ndata for standardization. However, due to compiler limitations in recognizing\nspecial symbols from the IPA chart, we followed the rules to convert all\nphonemes into the phonetic scheme used in the competition data. Finally, we\nresampled all competition audio to a uniform sampling rate of 16 kHz. We\nemployed a VITS-based acoustic model with the hifigan vocoder. For the Spoke\ntask, we trained a multi-speaker model and incorporated speaker information\ninto the duration predictor, vocoder, and flow layers of the model. 
The\nevaluation results of our system showed a quality MOS score of 3.6 for the Hub\ntask and 3.4 for the Spoke task, placing our system at an average level among\nall participating teams.\n","authors":["Xin Qi","Xiaopeng Wang","Zhiyong Wang","Wang Liu","Mingming Ding","Shuchen Shi"],"pdf_url":"https://arxiv.org/pdf/2309.00223v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00215v1","updated":"2023-09-01T02:19:41Z","published":"2023-09-01T02:19:41Z","title":"Towards Addressing the Misalignment of Object Proposal Evaluation for\n Vision-Language Tasks via Semantic Grounding","summary":" Object proposal generation serves as a standard pre-processing step in\nVision-Language (VL) tasks (image captioning, visual question answering, etc.).\nThe performance of object proposals generated for VL tasks is currently\nevaluated across all available annotations, a protocol that we show is\nmisaligned - higher scores do not necessarily correspond to improved\nperformance on downstream VL tasks. Our work serves as a study of this\nphenomenon and explores the effectiveness of semantic grounding to mitigate its\neffects. To this end, we propose evaluating object proposals against only a\nsubset of available annotations, selected by thresholding an annotation\nimportance score. Importance of object annotations to VL tasks is quantified by\nextracting relevant semantic information from text describing the image. We\nshow that our method is consistent and demonstrates greatly improved alignment\nwith annotations selected by image captioning metrics and human annotation when\ncompared against existing techniques. Lastly, we compare current detectors used\nin the Scene Graph Generation (SGG) benchmark as a use case, which serves as an\nexample of when traditional object proposal evaluation techniques are\nmisaligned.\n","authors":["Joshua Feinglass","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00215v1.pdf","comment":"Accepted to WACV 2024 (Round 1)"},{"id":"http://arxiv.org/abs/2309.00208v1","updated":"2023-09-01T01:51:28Z","published":"2023-09-01T01:51:28Z","title":"Large Language Models for Semantic Monitoring of Corporate Disclosures:\n A Case Study on Korea's Top 50 KOSPI Companies","summary":" In the rapidly advancing domain of artificial intelligence, state-of-the-art\nlanguage models such as OpenAI's GPT-3.5-turbo and GPT-4 offer unprecedented\nopportunities for automating complex tasks. This research paper delves into the\ncapabilities of these models for semantically analyzing corporate disclosures\nin the Korean context, specifically for timely disclosure. The study focuses on\nthe top 50 publicly traded companies listed on the Korean KOSPI, based on\nmarket capitalization, and scrutinizes their monthly disclosure summaries over\na period of 17 months. Each summary was assigned a sentiment rating on a scale\nranging from 1(very negative) to 5(very positive). To gauge the effectiveness\nof the language models, their sentiment ratings were compared with those\ngenerated by human experts. Our findings reveal a notable performance disparity\nbetween GPT-3.5-turbo and GPT-4, with the latter demonstrating significant\naccuracy in human evaluation tests. 
The Spearman correlation coefficient was\nregistered at 0.61, while the simple concordance rate was recorded at 0.82.\nThis research contributes valuable insights into the evaluative characteristics\nof GPT models, thereby laying the groundwork for future innovations in the\nfield of automated semantic monitoring.\n","authors":["Junwon Sung","Woojin Heo","Yunkyung Byun","Youngsam Kim"],"pdf_url":"https://arxiv.org/pdf/2309.00208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09862v2","updated":"2023-09-01T00:37:43Z","published":"2023-08-19T00:39:21Z","title":"Breaking Language Barriers: A Question Answering Dataset for Hindi and\n Marathi","summary":" The recent advances in deep-learning have led to the development of highly\nsophisticated systems with an unquenchable appetite for data. On the other\nhand, building good deep-learning models for low-resource languages remains a\nchallenging task. This paper focuses on developing a Question Answering dataset\nfor two such languages- Hindi and Marathi. Despite Hindi being the 3rd most\nspoken language worldwide, with 345 million speakers, and Marathi being the\n11th most spoken language globally, with 83.2 million speakers, both languages\nface limited resources for building efficient Question Answering systems. To\ntackle the challenge of data scarcity, we have developed a novel approach for\ntranslating the SQuAD 2.0 dataset into Hindi and Marathi. We release the\nlargest Question-Answering dataset available for these languages, with each\ndataset containing 28,000 samples. We evaluate the dataset on various\narchitectures and release the best-performing models for both Hindi and\nMarathi, which will facilitate further research in these languages. Leveraging\nsimilarity tools, our method holds the potential to create datasets in diverse\nlanguages, thereby enhancing the understanding of natural language across\nvaried linguistic contexts. Our fine-tuned models, code, and dataset will be\nmade publicly available.\n","authors":["Maithili Sabane","Onkar Litake","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2308.09862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00180v1","updated":"2023-09-01T00:14:51Z","published":"2023-09-01T00:14:51Z","title":"Exploring the law of text geographic information","summary":" Textual geographic information is indispensable and heavily relied upon in\npractical applications. The absence of clear distribution poses challenges in\neffectively harnessing geographic information, thereby driving our quest for\nexploration. We contend that geographic information is influenced by human\nbehavior, cognition, expression, and thought processes, and given our intuitive\nunderstanding of natural systems, we hypothesize its conformity to the Gamma\ndistribution. Through rigorous experiments on a diverse range of 24 datasets\nencompassing different languages and types, we have substantiated this\nhypothesis, unearthing the underlying regularities governing the dimensions of\nquantity, length, and distance in geographic information. Furthermore,\ntheoretical analyses and comparisons with Gaussian distributions and Zipf's law\nhave refuted the contingency of these laws. Significantly, we have estimated\nthe upper bounds of human utilization of geographic information, pointing\ntowards the existence of uncharted territories. Also, we provide guidance in\ngeographic information extraction. 
Hope we peer its true countenance uncovering\nthe veil of geographic information.\n","authors":["Zhenhua Wang","Daiyu Zhang","Ming Ren","Guang Xu"],"pdf_url":"https://arxiv.org/pdf/2309.00180v1.pdf","comment":"IPM"},{"id":"http://arxiv.org/abs/2309.00178v1","updated":"2023-09-01T00:11:56Z","published":"2023-09-01T00:11:56Z","title":"Will Sentiment Analysis Need Subculture? A New Data Augmentation\n Approach","summary":" The renowned proverb that \"The pen is mightier than the sword\" underscores\nthe formidable influence wielded by text expressions in shaping sentiments.\nIndeed, well-crafted written can deeply resonate within cultures, conveying\nprofound sentiments. Nowadays, the omnipresence of the Internet has fostered a\nsubculture that congregates around the contemporary milieu. The subculture\nartfully articulates the intricacies of human feelings by ardently pursuing the\nallure of novelty, a fact that cannot be disregarded in the sentiment analysis.\nThis paper strives to enrich data through the lens of subculture, to address\nthe insufficient training data faced by sentiment analysis. To this end, a new\napproach of subculture-based data augmentation (SCDA) is proposed, which\nengenders six enhanced texts for each training text by leveraging the creation\nof six diverse subculture expression generators. The extensive experiments\nattest to the effectiveness and potential of SCDA. The results also shed light\non the phenomenon that disparate subculture expressions elicit varying degrees\nof sentiment stimulation. Moreover, an intriguing conjecture arises, suggesting\nthe linear reversibility of certain subculture expressions. It is our fervent\naspiration that this study serves as a catalyst in fostering heightened\nperceptiveness towards the tapestry of information, sentiment and culture,\nthereby enriching our collective understanding.\n","authors":["Zhenhua Wang","Simin He","Guang Xu","Ming Ren"],"pdf_url":"https://arxiv.org/pdf/2309.00178v1.pdf","comment":"JASIST"},{"id":"http://arxiv.org/abs/2205.12636v3","updated":"2023-09-01T00:09:09Z","published":"2022-05-25T10:22:14Z","title":"A Zipf's Law-based Text Generation Approach for Addressing Imbalance in\n Entity Extraction","summary":" Entity extraction is critical in the intelligent advancement across diverse\ndomains. Nevertheless, a challenge to its effectiveness arises from the data\nimbalance. This paper proposes a novel approach by viewing the issue through\nthe quantitative information, recognizing that entities exhibit certain levels\nof commonality while others are scarce, which can be reflected in the\nquantifiable distribution of words. The Zipf's Law emerges as a well-suited\nadoption, and to transition from words to entities, words within the documents\nare classified as common and rare ones. Subsequently, sentences are classified\ninto common and rare ones, and are further processed by text generation models\naccordingly. Rare entities within the generated sentences are then labeled\nusing human-designed rules, serving as a supplement to the raw dataset, thereby\nmitigating the imbalance problem. The study presents a case of extracting\nentities from technical documents, and experimental results from two datasets\nprove the effectiveness of the proposed method. Furthermore, the significance\nof Zipf's law in driving the progress of AI is discussed, broadening the reach\nand coverage of Informetrics. 
This paper presents a successful demonstration of\nextending Informetrics to interface with AI through Zipf's Law.\n","authors":["Zhenhua Wang","Ming Ren","Dong Gao","Zhuang Li"],"pdf_url":"https://arxiv.org/pdf/2205.12636v3.pdf","comment":"Journal of Informetrics"},{"id":"http://arxiv.org/abs/2111.13861v2","updated":"2023-09-01T00:05:04Z","published":"2021-11-27T10:22:29Z","title":"A New Multifractal-based Deep Learning Model for Text Mining","summary":" In this world full of uncertainty, where the fabric of existence weaves\npatterns of complexity, multifractals emerge as beacons of insight,\nilluminating them. As we delve into the realm of text mining that underpins\nvarious natural language processing applications and powers a range of\nintelligent services, we recognize that behind the veil of text lies a\nmanifestation of human thought and cognition, intricately intertwined with the\ncomplexities. Building upon the foundation of perceiving text as a complex\nsystem, this study embarks on a journey to unravel the hidden treasures within,\narmed with the proposed multifractal method that deciphers the multifractal\nattributes embedded within the text landscape. This endeavor culminates in the\nbirth of our novel model, which also harnesses the power of the proposed\nactivation function to facilitate nonlinear information transmission within its\nneural network architecture. The success of experiments anchored in real-world\ntechnical reports, covering the extraction of technical terms and the\nclassification of hazard events, stands as a testament to our endeavors. This research venture\nnot only expands our understanding of text mining but also opens new horizons\nfor knowledge discovery across various domains.\n","authors":["Zhenhua Wang","Ming Ren","Dong Gao"],"pdf_url":"https://arxiv.org/pdf/2111.13861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00754v1","updated":"2023-09-01T22:57:20Z","published":"2023-09-01T22:57:20Z","title":"Efficient RLHF: Reducing the Memory Usage of PPO","summary":" Reinforcement Learning with Human Feedback (RLHF) has revolutionized language\nmodeling by aligning models with human preferences. However, the RL stage,\nProximal Policy Optimization (PPO), requires over 3x the memory of Supervised\nFine-Tuning (SFT), making it infeasible to use for most practitioners. To\naddress this issue, we present a comprehensive analysis of the memory usage,\nperformance, and training time of memory-saving techniques for PPO. We\nintroduce Hydra-RLHF by first integrating the SFT and Reward models and then\ndynamically turning LoRA \"off\" during training. Our experiments show: 1. Using\nLoRA during PPO reduces its memory usage to be smaller than SFT while improving\nalignment across four public benchmarks, and 2. Hydra-PPO reduces the latency\nper sample of LoRA-PPO by up to 65% while maintaining its performance. 
Our\nresults demonstrate that Hydra-PPO is a simple and promising solution for\nenabling more widespread usage of RLHF.\n","authors":["Michael Santacroce","Yadong Lu","Han Yu","Yuanzhi Li","Yelong Shen"],"pdf_url":"https://arxiv.org/pdf/2309.00754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00751v1","updated":"2023-09-01T22:26:06Z","published":"2023-09-01T22:26:06Z","title":"Let the Models Respond: Interpreting Language Model Detoxification\n Through the Lens of Prompt Dependence","summary":" Due to language models' propensity to generate toxic or hateful responses,\nseveral techniques were developed to align model generations with users'\npreferences. Despite the effectiveness of such methods in improving the safety\nof model interactions, their impact on models' internal processes is still\npoorly understood. In this work, we apply popular detoxification approaches to\nseveral language models and quantify their impact on the resulting models'\nprompt dependence using feature attribution methods. We evaluate the\neffectiveness of counter-narrative fine-tuning and compare it with\nreinforcement learning-driven detoxification, observing differences in prompt\nreliance between the two methods despite their similar detoxification\nperformances.\n","authors":["Daniel Scalena","Gabriele Sarti","Malvina Nissim","Elisabetta Fersini"],"pdf_url":"https://arxiv.org/pdf/2309.00751v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2307.08153v2","updated":"2023-09-01T22:23:09Z","published":"2023-07-16T21:22:40Z","title":"Analyzing Dataset Annotation Quality Management in the Wild","summary":" Data quality is crucial for training accurate, unbiased, and trustworthy\nmachine learning models and their correct evaluation. Recent works, however,\nhave shown that even popular datasets used to train and evaluate\nstate-of-the-art models contain a non-negligible amount of erroneous\nannotations, bias or annotation artifacts. There exist best practices and\nguidelines regarding annotation projects. But to the best of our knowledge, no\nlarge-scale analysis has been performed as of yet on how quality management is\nactually conducted when creating natural language datasets and whether these\nrecommendations are followed. Therefore, we first survey and summarize\nrecommended quality management practices for dataset creation as described in\nthe literature and provide suggestions on how to apply them. Then, we compile a\ncorpus of 591 scientific publications introducing text datasets and annotate it\nfor quality-related aspects, such as annotator management, agreement,\nadjudication or data validation. Using these annotations, we then analyze how\nquality management is conducted in practice. We find that a majority of the\nannotated publications apply good or very good quality management. However, we\ndeem the effort of 30% of the works as only subpar. Our analysis also shows\ncommon errors, especially with using inter-annotator agreement and computing\nannotation error rates.\n","authors":["Jan-Christoph Klie","Richard Eckart de Castilho","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2307.08153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00743v1","updated":"2023-09-01T21:40:34Z","published":"2023-09-01T21:40:34Z","title":"Language-Conditioned Change-point Detection to Identify Sub-Tasks in\n Robotics Domains","summary":" In this work, we present an approach to identify sub-tasks within a\ndemonstrated robot trajectory using language instructions. 
We identify these\nsub-tasks using language provided during demonstrations as guidance to identify\nsub-segments of a longer robot trajectory. Given a sequence of natural language\ninstructions and a long trajectory consisting of image frames and discrete\nactions, we want to map an instruction to a smaller fragment of the trajectory.\nUnlike previous instruction following works which directly learn the mapping\nfrom language to a policy, we propose a language-conditioned change-point\ndetection method to identify sub-tasks in a problem. Our approach learns the\nrelationship between constituent segments of a long language command and\ncorresponding constituent segments of a trajectory. These constituent\ntrajectory segments can be used to learn subtasks or sub-goals for planning or\noptions as demonstrated by previous related work. Our insight in this work is\nthat the language-conditioned robot change-point detection problem is similar\nto the existing video moment retrieval works used to identify sub-segments\nwithin online videos. Through extensive experimentation, we demonstrate a\n$1.78_{\\pm 0.82}\\%$ improvement over a baseline approach in accurately\nidentifying sub-tasks within a trajectory using our proposed method. Moreover,\nwe present a comprehensive study investigating sample complexity requirements\non learning this mapping, between language and trajectory sub-segments, to\nunderstand if the video retrieval-based methods are realistic in real robot\nscenarios.\n","authors":["Divyanshu Raj","Chitta Baral","Nakul Gopalan"],"pdf_url":"https://arxiv.org/pdf/2309.00743v1.pdf","comment":"9 Pages, 13 figures, Accepted paper at the RSS 2023 Workshop on\n Articulate Robots: Utilizing Language for Robot Learning"},{"id":"http://arxiv.org/abs/2302.07189v4","updated":"2023-09-01T20:32:05Z","published":"2023-02-14T17:00:06Z","title":"Reveal the Unknown: Out-of-Knowledge-Base Mention Discovery with Entity\n Linking","summary":" Discovering entity mentions that are out of a Knowledge Base (KB) from texts\nplays a critical role in KB maintenance, but has not yet been fully explored.\nThe current methods are mostly limited to the simple threshold-based approach\nand feature-based classification, and the datasets for evaluation are\nrelatively rare. We propose BLINKout, a new BERT-based Entity Linking (EL)\nmethod which can identify mentions that do not have corresponding KB entities\nby matching them to a special NIL entity. To better utilize BERT, we propose\nnew techniques including NIL entity representation and classification, with\nsynonym enhancement. 
We also apply KB Pruning and Versioning strategies to\nautomatically construct out-of-KB datasets from common in-KB EL datasets.\nResults on five datasets of clinical notes, biomedical publications, and\nWikipedia articles in various domains show the advantages of BLINKout over\nexisting methods to identify out-of-KB mentions for the medical ontologies,\nUMLS, SNOMED CT, and the general KB, WikiData.\n","authors":["Hang Dong","Jiaoyan Chen","Yuan He","Yinan Liu","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2302.07189v4.pdf","comment":"11 pages, 3 figures, accepted for CIKM 2023"},{"id":"http://arxiv.org/abs/2309.00723v1","updated":"2023-09-01T20:15:48Z","published":"2023-09-01T20:15:48Z","title":"Contextual Biasing of Named-Entities with Large Language Models","summary":" This paper studies contextual biasing with Large Language Models (LLMs),\nwhere during second-pass rescoring additional contextual information is\nprovided to an LLM to boost Automatic Speech Recognition (ASR) performance. We\npropose to leverage prompts for an LLM, without fine-tuning, during rescoring;\nthese prompts incorporate a biasing list and few-shot examples to serve as additional\ninformation when calculating the score for the hypothesis. In addition to\nfew-shot prompt learning, we propose multi-task training of the LLM to predict\nboth the entity class and the next token. To improve the efficiency for\ncontextual biasing and to avoid exceeding LLMs' maximum sequence lengths, we\npropose dynamic prompting, where we select the most likely class using the\nclass tag prediction, and only use entities in this class as contexts for next\ntoken prediction. Word Error Rate (WER) evaluation is performed on i) an\ninternal calling, messaging, and dictation dataset, and ii) the SLUE-Voxpopuli\ndataset. Results indicate that biasing lists and few-shot examples can achieve\n17.8% and 9.6% relative improvement compared to first pass ASR, and that\nmulti-task training and dynamic prompting can achieve 20.0% and 11.3% relative\nWER improvement, respectively.\n","authors":["Chuanneng Sun","Zeeshan Ahmed","Yingyi Ma","Zhe Liu","Yutong Pang","Ozlem Kalinli"],"pdf_url":"https://arxiv.org/pdf/2309.00723v1.pdf","comment":"5 pages, 4 figures. Conference: ICASSP 2024"},{"id":"http://arxiv.org/abs/2306.09237v2","updated":"2023-09-01T18:00:57Z","published":"2023-06-15T16:19:15Z","title":"SCALE: Scaling up the Complexity for Advanced Language Model Evaluation","summary":" Recent strides in Large Language Models (LLMs) have saturated many NLP\nbenchmarks (even professional domain-specific ones), emphasizing the need for\nnovel, more challenging ones to properly assess LLM capabilities. In this\npaper, we introduce a novel NLP benchmark that poses challenges to current LLMs\nacross four key dimensions: processing long documents (up to 50K tokens),\nutilizing domain-specific knowledge (embodied in legal texts), multilingual\nunderstanding (covering five languages), and multitasking (comprising legal\ndocument to document Information Retrieval, Court View Generation, Leading\nDecision Summarization, Citation Extraction, and eight challenging Text\nClassification tasks). Our benchmark comprises diverse legal NLP datasets from\nthe Swiss legal system, allowing for a comprehensive study of the underlying\nNon-English, inherently multilingual, federal legal system. Despite recent\nadvances, efficiently processing long documents for intense review/analysis\ntasks remains an open challenge for language models. 
Also, comprehensive,\ndomain-specific benchmarks requiring high expertise to develop are rare, as are\nmultilingual benchmarks. This scarcity underscores our contribution's value,\nconsidering most public models are trained predominantly on English corpora,\nwhile other languages remain understudied, particularly for practical\ndomain-specific NLP tasks. Our benchmark allows for testing and advancing the\nstate-of-the-art LLMs. As part of our study, we evaluate several pre-trained\nmultilingual language models on our benchmark to establish strong baselines as\na point of reference. Despite the large size of our datasets (tens to hundreds\nof thousands of examples), existing publicly available models struggle with\nmost tasks, even after in-domain pretraining. We publish all resources\n(benchmark suite, pre-trained models, code) under a fully permissive open CC\nBY-SA license.\n","authors":["Vishvaksenan Rasiah","Ronja Stern","Veton Matoshi","Matthias Stürmer","Ilias Chalkidis","Daniel E. Ho","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2306.09237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00667v1","updated":"2023-09-01T17:27:37Z","published":"2023-09-01T17:27:37Z","title":"Taken out of context: On measuring situational awareness in LLMs","summary":" We aim to better understand the emergence of `situational awareness' in large\nlanguage models (LLMs). A model is situationally aware if it's aware that it's\na model and can recognize whether it's currently in testing or deployment.\nToday's LLMs are tested for safety and alignment before they are deployed. An\nLLM could exploit situational awareness to achieve a high score on safety\ntests, while taking harmful actions after deployment. Situational awareness may\nemerge unexpectedly as a byproduct of model scaling. One way to better foresee\nthis emergence is to run scaling experiments on abilities necessary for\nsituational awareness. As such an ability, we propose `out-of-context\nreasoning' (in contrast to in-context learning). We study out-of-context\nreasoning experimentally. First, we finetune an LLM on a description of a test\nwhile providing no examples or demonstrations. At test time, we assess whether\nthe model can pass the test. To our surprise, we find that LLMs succeed on this\nout-of-context reasoning task. Their success is sensitive to the training setup\nand only works when we apply data augmentation. For both GPT-3 and LLaMA-1,\nperformance improves with model size. These findings offer a foundation for\nfurther empirical study, towards predicting and potentially controlling the\nemergence of situational awareness in LLMs. Code is available at:\nhttps://github.com/AsaCooperStickland/situational-awareness-evals.\n","authors":["Lukas Berglund","Asa Cooper Stickland","Mikita Balesni","Max Kaufmann","Meg Tong","Tomasz Korbak","Daniel Kokotajlo","Owain Evans"],"pdf_url":"https://arxiv.org/pdf/2309.00667v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.00616v1","updated":"2023-09-01T17:59:56Z","published":"2023-09-01T17:59:56Z","title":"OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation","summary":" Current 3D open-vocabulary scene understanding methods mostly utilize\nwell-aligned 2D images as the bridge to learn 3D features with language.\nHowever, applying these approaches becomes challenging in scenarios where 2D\nimages are absent. 
In this work, we introduce a completely new pipeline,\nnamely, OpenIns3D, which requires no 2D image inputs, for 3D open-vocabulary\nscene understanding at the instance level. The OpenIns3D framework employs a\n\"Mask-Snap-Lookup\" scheme. The \"Mask\" module learns class-agnostic mask\nproposals in 3D point clouds. The \"Snap\" module generates synthetic scene-level\nimages at multiple scales and leverages 2D vision language models to extract\ninteresting objects. The \"Lookup\" module searches through the outcomes of\n\"Snap\" with the help of Mask2Pixel maps, which contain the precise\ncorrespondence between 3D masks and synthetic images, to assign category names\nto the proposed masks. This 2D input-free, easy-to-train, and flexible approach\nachieved state-of-the-art results on a wide range of indoor and outdoor\ndatasets by a large margin. Furthermore, OpenIns3D allows for effortless\nswitching of 2D detectors without re-training. When integrated with\nstate-of-the-art 2D open-world models such as ODISE and GroundingDINO, superb\nresults are observed on open-vocabulary instance segmentation. When integrated\nwith LLM-powered 2D models like LISA, it demonstrates a remarkable capacity to\nprocess highly complex text queries, including those that require intricate\nreasoning and world knowledge. The code and model will be made publicly\navailable.\n","authors":["Zhening Huang","Xiaoyang Wu","Xi Chen","Hengshuang Zhao","Lei Zhu","Joan Lasenby"],"pdf_url":"https://arxiv.org/pdf/2309.00616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00615v1","updated":"2023-09-01T17:59:47Z","published":"2023-09-01T17:59:47Z","title":"Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D\n Understanding, Generation, and Instruction Following","summary":" We introduce Point-Bind, a 3D multi-modality model aligning point clouds with\n2D image, language, audio, and video. Guided by ImageBind, we construct a joint\nembedding space between 3D and multi-modalities, enabling many promising\napplications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D\nopen-world understanding. On top of this, we further present Point-LLM, the\nfirst 3D large language model (LLM) following 3D multi-modal instructions. By\nparameter-efficient fine-tuning techniques, Point-LLM injects the semantics of\nPoint-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction\ndata, but exhibits superior 3D and multi-modal question-answering capacity. We\nhope our work may shed light for the community on extending 3D point clouds\nto multi-modality applications. Code is available at\nhttps://github.com/ZiyuGuo99/Point-Bind_Point-LLM.\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Yiwen Tang","Xianzheng Ma","Jiaming Han","Kexin Chen","Peng Gao","Xianzhi Li","Hongsheng Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2309.00615v1.pdf","comment":"Work in progress. Code is available at\n https://github.com/ZiyuGuo99/Point-Bind_Point-LLM"},{"id":"http://arxiv.org/abs/2309.00613v1","updated":"2023-09-01T17:59:29Z","published":"2023-09-01T17:59:29Z","title":"Iterative Multi-granular Image Editing using Diffusion Models","summary":" Recent advances in text-guided image synthesis have dramatically changed how\ncreative professionals generate artistic and aesthetically pleasing visual\nassets. 
To fully support such creative endeavors, the process should possess\nthe ability to: 1) iteratively edit the generations and 2) control the spatial\nreach of desired changes (global, local or anything in between). We formalize\nthis pragmatic problem setting as Iterative Multi-granular Editing. While there\nhas been substantial progress with diffusion-based models for image synthesis\nand editing, they are all one shot (i.e., no iterative editing capabilities)\nand do not naturally yield multi-granular control (i.e., covering the full\nspectrum of local-to-global edits). To overcome these drawbacks, we propose\nEMILIE: Iterative Multi-granular Image Editor. EMILIE introduces a novel latent\niteration strategy, which re-purposes a pre-trained diffusion model to\nfacilitate iterative editing. This is complemented by a gradient control\noperation for multi-granular control. We introduce a new benchmark dataset to\nevaluate our newly proposed setting. We conduct exhaustive quantitative and\nqualitative evaluation against recent state-of-the-art approaches adapted to\nour task, to bring out the mettle of EMILIE. We hope our work will attract\nattention to this newly identified, pragmatic problem setting.\n","authors":["K J Joseph","Prateksha Udhayanan","Tripti Shukla","Aishwarya Agarwal","Srikrishna Karanam","Koustava Goswami","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2309.00613v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2309.00610v1","updated":"2023-09-01T17:57:02Z","published":"2023-09-01T17:57:02Z","title":"CityDreamer: Compositional Generative Model of Unbounded 3D Cities","summary":" In recent years, extensive research has focused on 3D natural scene\ngeneration, but the domain of 3D city generation has not received as much\nexploration. This is due to the greater challenges posed by 3D city generation,\nmainly because humans are more sensitive to structural distortions in urban\nenvironments. Additionally, generating 3D cities is more complex than 3D\nnatural scenes since buildings, as objects of the same class, exhibit a wider\nrange of appearances compared to the relatively consistent appearance of\nobjects like trees in natural scenes. To address these challenges, we propose\nCityDreamer, a compositional generative model designed specifically for\nunbounded 3D cities, which separates the generation of building instances from\nother background objects, such as roads, green lands, and water areas, into\ndistinct modules. Furthermore, we construct two datasets, OSM and GoogleEarth,\ncontaining a vast amount of real-world city imagery to enhance the realism of\nthe generated 3D cities both in their layouts and appearances. Through\nextensive experiments, CityDreamer has proven its superiority over\nstate-of-the-art methods in generating a wide range of lifelike 3D cities.\n","authors":["Haozhe Xie","Zhaoxi Chen","Fangzhou Hong","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.00610v1.pdf","comment":"Project page: https://haozhexie.com/project/city-dreamer"},{"id":"http://arxiv.org/abs/2309.00594v1","updated":"2023-09-01T17:20:07Z","published":"2023-09-01T17:20:07Z","title":"Time Series Analysis of Urban Liveability","summary":" In this paper we explore deep learning models to monitor longitudinal\nliveability changes in Dutch cities at the neighbourhood level. Our liveability\nreference data is defined by a country-wise yearly survey based on a set of\nindicators combined into a liveability score, the Leefbaarometer. 
We pair this\nreference data with yearly-available high-resolution aerial images, which\ncreates yearly timesteps at which liveability can be monitored. We deploy a\nconvolutional neural network trained on an aerial image from 2016 and the\nLeefbaarometer score to predict liveability at new timesteps 2012 and 2020. The\nresults in a city used for training (Amsterdam) and one never seen during\ntraining (Eindhoven) show some trends which are difficult to interpret,\nespecially in light of the differences in image acquisitions at the different\ntime steps. This demonstrates the complexity of liveability monitoring across\ntime periods and the necessity for more sophisticated methods compensating for\nchanges unrelated to liveability dynamics.\n","authors":["Alex Levering","Diego Marcos","Devis Tuia"],"pdf_url":"https://arxiv.org/pdf/2309.00594v1.pdf","comment":"Accepted at JURSE 2023"},{"id":"http://arxiv.org/abs/2207.13240v3","updated":"2023-09-01T17:08:02Z","published":"2022-07-27T01:49:26Z","title":"Contrastive Image Synthesis and Self-supervised Feature Adaptation for\n Cross-Modality Biomedical Image Segmentation","summary":" This work presents a novel framework CISFA (Contrastive Image synthesis and\nSelf-supervised Feature Adaptation)that builds on image domain translation and\nunsupervised feature adaptation for cross-modality biomedical image\nsegmentation. Different from existing works, we use a one-sided generative\nmodel and add a weighted patch-wise contrastive loss between sampled patches of\nthe input image and the corresponding synthetic image, which serves as shape\nconstraints. Moreover, we notice that the generated images and input images\nshare similar structural information but are in different modalities. As such,\nwe enforce contrastive losses on the generated images and the input images to\ntrain the encoder of a segmentation model to minimize the discrepancy between\npaired images in the learned embedding space. Compared with existing works that\nrely on adversarial learning for feature adaptation, such a method enables the\nencoder to learn domain-independent features in a more explicit way. We\nextensively evaluate our methods on segmentation tasks containing CT and MRI\nimages for abdominal cavities and whole hearts. Experimental results show that\nthe proposed framework not only outputs synthetic images with less distortion\nof organ shapes, but also outperforms state-of-the-art domain adaptation\nmethods by a large margin.\n","authors":["Xinrong Hu","Corey Wang","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2207.13240v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00588v1","updated":"2023-09-01T17:04:48Z","published":"2023-09-01T17:04:48Z","title":"Discrete Morphological Neural Networks","summary":" A classical approach to designing binary image operators is Mathematical\nMorphology (MM). We propose the Discrete Morphological Neural Networks (DMNN)\nfor binary image analysis to represent W-operators and estimate them via\nmachine learning. A DMNN architecture, which is represented by a Morphological\nComputational Graph, is designed as in the classical heuristic design of\nmorphological operators, in which the designer should combine a set of MM\noperators and Boolean operations based on prior information and theoretical\nknowledge. 
Then, once the architecture is fixed, instead of adjusting its\nparameters (i.e., structural elements or maximal intervals) by hand, we propose\na lattice gradient descent algorithm (LGDA) to train these parameters based on\na sample of input and output images under the usual machine learning approach.\nWe also propose a stochastic version of the LGDA that is more efficient, is\nscalable and can obtain small error in practical problems. The class\nrepresented by a DMNN can be quite general or specialized according to expected\nproperties of the target operator, i.e., prior information, and the semantic\nexpressed by algebraic properties of classes of operators is a differential\nrelative to other methods. The main contribution of this paper is the merger of\nthe two main paradigms for designing morphological operators: classical\nheuristic design and automatic design via machine learning. Thus, conciliating\nclassical heuristic morphological operator design with machine learning. We\napply the DMNN to recognize the boundary of digits with noise, and we discuss\nmany topics for future research.\n","authors":["Diego Marcondes","Junior Barrera"],"pdf_url":"https://arxiv.org/pdf/2309.00588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00570v1","updated":"2023-09-01T16:30:02Z","published":"2023-09-01T16:30:02Z","title":"Mechanism of feature learning in convolutional neural networks","summary":" Understanding the mechanism of how convolutional neural networks learn\nfeatures from image data is a fundamental problem in machine learning and\ncomputer vision. In this work, we identify such a mechanism. We posit the\nConvolutional Neural Feature Ansatz, which states that covariances of filters\nin any convolutional layer are proportional to the average gradient outer\nproduct (AGOP) taken with respect to patches of the input to that layer. We\npresent extensive empirical evidence for our ansatz, including identifying high\ncorrelation between covariances of filters and patch-based AGOPs for\nconvolutional layers in standard neural architectures, such as AlexNet, VGG,\nand ResNets pre-trained on ImageNet. We also provide supporting theoretical\nevidence. We then demonstrate the generality of our result by using the\npatch-based AGOP to enable deep feature learning in convolutional kernel\nmachines. We refer to the resulting algorithm as (Deep) ConvRFM and show that\nour algorithm recovers similar features to deep convolutional networks\nincluding the notable emergence of edge detectors. Moreover, we find that Deep\nConvRFM overcomes previously identified limitations of convolutional kernels,\nsuch as their inability to adapt to local signals in images and, as a result,\nleads to sizable performance improvement over fixed convolutional kernels.\n","authors":["Daniel Beaglehole","Adityanarayanan Radhakrishnan","Parthe Pandit","Mikhail Belkin"],"pdf_url":"https://arxiv.org/pdf/2309.00570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00569v1","updated":"2023-09-01T16:26:42Z","published":"2023-09-01T16:26:42Z","title":"Amyloid-Beta Axial Plane PET Synthesis from Structural MRI: An Image\n Translation Approach for Screening Alzheimer's Disease","summary":" In this work, an image translation model is implemented to produce synthetic\namyloid-beta PET images from structural MRI that are quantitatively accurate.\nImage pairs of amyloid-beta PET and structural MRI were used to train the\nmodel. 
We found that the synthetic PET images could be produced with a high\ndegree of similarity to the ground truth in terms of shape and contrast, with overall high SSIM\nand PSNR. This work demonstrates that performing structural-to-quantitative\nimage translation is feasible, enabling access to amyloid-beta information\nfrom MRI alone.\n","authors":["Fernando Vega","Abdoljalil Addeh","M. Ethan MacDonald"],"pdf_url":"https://arxiv.org/pdf/2309.00569v1.pdf","comment":"Abstract submitted and presented to the International Society of\n Magnetic Resonance in Medicine (ISMRM 2023), Toronto, Canada"},{"id":"http://arxiv.org/abs/2305.07270v4","updated":"2023-09-01T16:17:54Z","published":"2023-05-12T06:17:57Z","title":"SSD-MonoDETR: Supervised Scale-aware Deformable Transformer for\n Monocular 3D Object Detection","summary":" Transformer-based methods have demonstrated superior performance for\nmonocular 3D object detection recently, which aims at predicting 3D attributes\nfrom a single 2D image. Most existing transformer-based methods leverage both\nvisual and depth representations to explore valuable query points on objects,\nand the quality of the learned query points has a great impact on detection\naccuracy. Unfortunately, existing unsupervised attention mechanisms in\ntransformers are prone to generate low-quality query features due to inaccurate\nreceptive fields, especially on hard objects. To tackle this problem, this\npaper proposes a novel \"Supervised Scale-aware Deformable Attention\" (SSDA) for\nmonocular 3D object detection. Specifically, SSDA presets several masks with\ndifferent scales and utilizes depth and visual features to adaptively learn a\nscale-aware filter for object query augmentation. Imposing the scale awareness,\nSSDA could well predict the accurate receptive field of an object query to\nsupport robust query feature generation. Aside from this, SSDA is assigned with\na Weighted Scale Matching (WSM) loss to supervise scale prediction, which\npresents more confident results as compared to the unsupervised attention\nmechanisms. Extensive experiments on the KITTI and Waymo Open datasets\ndemonstrate that SSDA significantly improves the detection accuracy, especially\non moderate and hard objects, yielding state-of-the-art performance as compared\nto the existing approaches. Our code will be made publicly available at\nhttps://github.com/mikasa3lili/SSD-MonoDETR.\n","authors":["Xuan He","Fan Yang","Kailun Yang","Jiacheng Lin","Haolong Fu","Meng Wang","Jin Yuan","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2305.07270v4.pdf","comment":"Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). Code\n will be made publicly available at\n https://github.com/mikasa3lili/SSD-MonoDETR"},{"id":"http://arxiv.org/abs/2309.00549v1","updated":"2023-09-01T15:57:24Z","published":"2023-09-01T15:57:24Z","title":"Impact of Image Context for Single Deep Learning Face Morphing Attack\n Detection","summary":" The increase in security concerns due to technological advancements has led\nto the popularity of biometric approaches that utilize physiological or\nbehavioral characteristics for enhanced recognition. Face recognition systems\n(FRSs) have become prevalent, but they are still vulnerable to image\nmanipulation techniques such as face morphing attacks. This study investigates\nthe impact of the alignment settings of input images on deep learning face\nmorphing detection performance. 
We analyze the interconnections between the\nface contour and image context and suggest optimal alignment conditions for\nface morphing detection.\n","authors":["Joana Pimenta","Iurii Medvedev","Nuno Gonçalves"],"pdf_url":"https://arxiv.org/pdf/2309.00549v1.pdf","comment":"6 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.08600v2","updated":"2023-09-01T15:54:52Z","published":"2023-06-14T16:03:22Z","title":"M^2UNet: MetaFormer Multi-scale Upsampling Network for Polyp\n Segmentation","summary":" Polyp segmentation has recently garnered significant attention, and multiple\nmethods have been formulated to achieve commendable outcomes. However, these\ntechniques often confront difficulty when working with the complex polyp\nforeground and their surrounding regions because of the nature of convolution\noperation. Besides, most existing methods forget to exploit the potential\ninformation from multiple decoder stages. To address this challenge, we suggest\ncombining MetaFormer, introduced as a baseline for integrating CNN and\nTransformer, with UNet framework and incorporating our Multi-scale Upsampling\nblock (MU). This simple module makes it possible to combine multi-level\ninformation by exploring multiple receptive field paths of the shallow decoder\nstage and then adding with the higher stage to aggregate better feature\nrepresentation, which is essential in medical image segmentation. Taken all\ntogether, we propose MetaFormer Multi-scale Upsampling Network (M$^2$UNet) for\nthe polyp segmentation task. Extensive experiments on five benchmark datasets\ndemonstrate that our method achieved competitive performance compared with\nseveral previous methods.\n","authors":["Quoc-Huy Trinh","Nhat-Tan Bui","Trong-Hieu Nguyen Mau","Minh-Van Nguyen","Hai-Minh Phan","Minh-Triet Tran","Hai-Dang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2306.08600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00528v1","updated":"2023-09-01T15:31:18Z","published":"2023-09-01T15:31:18Z","title":"Trust your Good Friends: Source-free Domain Adaptation by Reciprocal\n Neighborhood Clustering","summary":" Domain adaptation (DA) aims to alleviate the domain shift between source\ndomain and target domain. Most DA methods require access to the source data,\nbut often that is not possible (e.g. due to data privacy or intellectual\nproperty). In this paper, we address the challenging source-free domain\nadaptation (SFDA) problem, where the source pretrained model is adapted to the\ntarget domain in the absence of source data. Our method is based on the\nobservation that target data, which might not align with the source domain\nclassifier, still forms clear clusters. We capture this intrinsic structure by\ndefining local affinity of the target data, and encourage label consistency\namong data with high local affinity. We observe that higher affinity should be\nassigned to reciprocal neighbors. To aggregate information with more context,\nwe consider expanded neighborhoods with small affinity values. Furthermore, we\nconsider the density around each target sample, which can alleviate the\nnegative impact of potential outliers. In the experimental results we verify\nthat the inherent structure of the target features is an important source of\ninformation for domain adaptation. We demonstrate that this local structure can\nbe efficiently captured by considering the local neighbors, the reciprocal\nneighbors, and the expanded neighborhood. 
Finally, we achieve state-of-the-art\nperformance on several 2D image and 3D point cloud recognition datasets.\n","authors":["Shiqi Yang","Yaxing Wang","Joost van de Weijer","Luis Herranz","Shangling Jui","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00528v1.pdf","comment":"Accepted by IEEE TPAMI, extended version of conference paper\n arXiv:2110.04202"},{"id":"http://arxiv.org/abs/2309.00526v1","updated":"2023-09-01T15:27:45Z","published":"2023-09-01T15:27:45Z","title":"SQLdepth: Generalizable Self-Supervised Fine-Structured Monocular Depth\n Estimation","summary":" Recently, self-supervised monocular depth estimation has gained popularity\nwith numerous applications in autonomous driving and robotics. However,\nexisting solutions primarily seek to estimate depth from immediate visual\nfeatures, and struggle to recover fine-grained scene details with limited\ngeneralization. In this paper, we introduce SQLdepth, a novel approach that can\neffectively learn fine-grained scene structures from motion. In SQLdepth, we\npropose a novel Self Query Layer (SQL) to build a self-cost volume and infer\ndepth from it, rather than inferring depth from feature maps. The self-cost\nvolume implicitly captures the intrinsic geometry of the scene within a single\nframe. Each individual slice of the volume signifies the relative distances\nbetween points and objects within a latent space. Ultimately, this volume is\ncompressed to the depth map via a novel decoding approach. Experimental results\non KITTI and Cityscapes show that our method attains remarkable\nstate-of-the-art performance (AbsRel = $0.082$ on KITTI, $0.052$ on KITTI with\nimproved ground-truth and $0.106$ on Cityscapes), achieves $9.9\\%$, $5.5\\%$ and\n$4.5\\%$ error reduction from the previous best. In addition, our approach\nshowcases reduced training complexity, computational efficiency, improved\ngeneralization, and the ability to recover fine-grained scene details.\nMoreover, the self-supervised pre-trained and metric fine-tuned SQLdepth can\nsurpass existing supervised methods by significant margins (AbsRel = $0.043$,\n$14\\%$ error reduction). self-matching-oriented relative distance querying in\nSQL improves the robustness and zero-shot generalization capability of\nSQLdepth. Code and the pre-trained weights will be publicly available. Code is\navailable at\n\\href{https://github.com/hisfog/SQLdepth-Impl}{https://github.com/hisfog/SQLdepth-Impl}.\n","authors":["Youhong Wang","Yunji Liang","Hao Xu","Shaohui Jiao","Hongkai Yu"],"pdf_url":"https://arxiv.org/pdf/2309.00526v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2205.01931v3","updated":"2023-09-01T15:26:29Z","published":"2022-05-04T08:06:55Z","title":"Mapping the landscape of histomorphological cancer phenotypes using\n self-supervised learning on unlabeled, unannotated pathology slides","summary":" Definitive cancer diagnosis and management depend upon the extraction of\ninformation from microscopy images by pathologists. These images contain\ncomplex information requiring time-consuming expert human interpretation that\nis prone to human bias. Supervised deep learning approaches have proven\npowerful for classification tasks, but they are inherently limited by the cost\nand quality of annotations used for training these models. 
To address this\nlimitation of supervised methods, we developed Histomorphological Phenotype\nLearning (HPL), a fully self-supervised methodology that requires no\nexpert labels or annotations and operates via the automatic discovery of\ndiscriminatory image features in small image tiles. Tiles are grouped into\nmorphologically similar clusters which constitute a library of\nhistomorphological phenotypes, revealing trajectories from benign to malignant\ntissue via inflammatory and reactive phenotypes. These clusters have distinct\nfeatures which can be identified using orthogonal methods, linking histologic,\nmolecular and clinical phenotypes. Applied to lung cancer tissues, we show that\nthey align closely with patient survival, with histopathologically recognised\ntumor types and growth patterns, and with transcriptomic measures of\nimmunophenotype. We then demonstrate that these properties are maintained in a\nmulti-cancer study. These results show the clusters represent recurrent host\nresponses and modes of tumor growth emerging under natural selection. Code,\npre-trained models, learned embeddings, and documentation are available to the\ncommunity at\nhttps://github.com/AdalbertoCq/Histomorphological-Phenotype-Learning\n","authors":["Adalberto Claudio Quiros","Nicolas Coudray","Anna Yeaton","Xinyu Yang","Bojing Liu","Hortense Le","Luis Chiriboga","Afreen Karimkhan","Navneet Narula","David A. Moore","Christopher Y. Park","Harvey Pass","Andre L. Moreira","John Le Quesne","Aristotelis Tsirigos","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2205.01931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03763v2","updated":"2023-09-01T15:22:19Z","published":"2023-04-07T17:57:20Z","title":"Clutter Detection and Removal in 3D Scenes with View-Consistent\n Inpainting","summary":" Removing clutter from scenes is essential in many applications, ranging from\nprivacy-concerned content filtering to data augmentation. In this work, we\npresent an automatic system that removes clutter from 3D scenes and inpaints\nwith coherent geometry and texture. We propose techniques for its two key\ncomponents: 3D segmentation from shared properties and 3D inpainting, both of\nwhich are important problems. The definition of 3D scene clutter\n(frequently-moving objects) is not well captured by commonly-studied object\ncategories in computer vision. To tackle the lack of well-defined clutter\nannotations, we group noisy fine-grained labels, leverage virtual rendering,\nand impose an instance-level area-sensitive loss. Once clutter is removed, we\ninpaint geometry and texture in the resulting holes by merging inpainted RGB-D\nimages. This requires novel voting and pruning strategies that guarantee\nmulti-view consistency across individually inpainted images for mesh\nreconstruction. Experiments on the ScanNet and Matterport datasets show that our\nmethod outperforms baselines for clutter segmentation and 3D inpainting, both\nvisually and quantitatively.\n","authors":["Fangyin Wei","Thomas Funkhouser","Szymon Rusinkiewicz"],"pdf_url":"https://arxiv.org/pdf/2304.03763v2.pdf","comment":"18 pages. ICCV 2023. 
Project page:\n https://weify627.github.io/clutter/"},{"id":"http://arxiv.org/abs/2309.00514v1","updated":"2023-09-01T15:06:39Z","published":"2023-09-01T15:06:39Z","title":"A Machine Vision Method for Correction of Eccentric Error: Based on\n Adaptive Enhancement Algorithm","summary":" In the procedure of surface defect detection for large-aperture aspherical\noptical elements, it is of vital significance to adjust the optical axis of the\nelement to be coaxial with the mechanical spin axis accurately. Therefore, a\nmachine vision method for eccentric error correction is proposed in this paper.\nFocusing on the severe defocus blur of reference crosshair image caused by the\nimaging characteristic of the aspherical optical element, which may lead to the\nfailure of correction, an Adaptive Enhancement Algorithm (AEA) is proposed to\nstrengthen the crosshair image. AEA consists of the existing Guided Filter Dark\nChannel Dehazing Algorithm (GFA) and the proposed lightweight Multi-scale Densely\nConnected Network (MDC-Net). The enhancement effect of GFA is excellent but\ntime-consuming, while the enhancement effect of MDC-Net is slightly inferior but\nit runs in real time. As AEA will be executed dozens of times during each\ncorrection procedure, its real-time performance is very important. Therefore,\nby setting the empirical threshold of definition evaluation function SMD2, GFA\nand MDC-Net are respectively applied to highly and slightly blurred crosshair\nimages so as to ensure the enhancement effect while saving as much time as\npossible. AEA has certain robustness in runtime performance, taking\nan average time of 0.2721s and 0.0963s to execute GFA and MDC-Net separately on\nten 200x200-pixel Region of Interest (ROI) images with different degrees\nof blur. The eccentricity error can be reduced to within 10um by our\nmethod.\n","authors":["Fanyi Wang","Pin Cao","Yihui Zhang","Haotian Hu","Yongying Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00494v1","updated":"2023-09-01T14:40:25Z","published":"2023-09-01T14:40:25Z","title":"Multi-stage Deep Learning Artifact Reduction for Computed Tomography","summary":" In Computed Tomography (CT), an image of the interior structure of an object\nis computed from a set of acquired projection images. The quality of these\nreconstructed images is essential for accurate analysis, but this quality can\nbe degraded by a variety of imaging artifacts. To improve reconstruction\nquality, the acquired projection images are often processed by a pipeline\nconsisting of multiple artifact-removal steps applied in various image domains\n(e.g., outlier removal on projection images and denoising of reconstruction\nimages). These artifact-removal methods exploit the fact that certain artifacts\nare easier to remove in a certain domain compared with other domains.\n Recently, deep learning methods have shown promising results for artifact\nremoval for CT images. However, most existing deep learning methods for CT are\napplied as a post-processing method after reconstruction. Therefore, artifacts\nthat are relatively difficult to remove in the reconstruction domain may not be\neffectively removed by these methods. 
As an alternative, we propose a\nmulti-stage deep learning method for artifact removal, in which neural networks\nare applied to several domains, similar to a classical CT processing pipeline.\nWe show that the neural networks can be effectively trained in succession,\nresulting in easy-to-use and computationally efficient training. Experiments on\nboth simulated and real-world experimental datasets show that our method is\neffective in reducing artifacts and superior to deep learning-based\npost-processing.\n","authors":["Jiayang Shi","Daniel M. Pelt","K. Joost Batenburg"],"pdf_url":"https://arxiv.org/pdf/2309.00494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00474v1","updated":"2023-09-01T14:13:22Z","published":"2023-09-01T14:13:22Z","title":"Asymmetric double-winged multi-view clustering network for exploring\n Diverse and Consistent Information","summary":" In unsupervised scenarios, deep contrastive multi-view clustering (DCMVC) is\nbecoming a hot research spot, which aims to mine the potential relationships\nbetween different views. Most existing DCMVC algorithms focus on exploring the\nconsistency information for the deep semantic features, while ignoring the\ndiverse information on shallow features. To fill this gap, we propose a novel\nmulti-view clustering network termed CodingNet to explore the diverse and\nconsistent information simultaneously in this paper. Specifically, instead of\nutilizing the conventional auto-encoder, we design an asymmetric structure\nnetwork to extract shallow and deep features separately. Then, by aligning the\nsimilarity matrix on the shallow feature to the zero matrix, we ensure the\ndiversity for the shallow features, thus offering a better description of\nmulti-view data. Moreover, we propose a dual contrastive mechanism that\nmaintains consistency for deep features at both view-feature and pseudo-label\nlevels. Our framework's efficacy is validated through extensive experiments on\nsix widely used benchmark datasets, outperforming most state-of-the-art\nmulti-view clustering algorithms.\n","authors":["Qun Zheng","Xihong Yang","Siwei Wang","Xinru An","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2309.00474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00472v1","updated":"2023-09-01T14:11:19Z","published":"2023-09-01T14:11:19Z","title":"General and Practical Tuning Method for Off-the-Shelf Graph-Based Index:\n SISAP Indexing Challenge Report by Team UTokyo","summary":" Despite the efficacy of graph-based algorithms for Approximate Nearest\nNeighbor (ANN) searches, the optimal tuning of such systems remains unclear.\nThis study introduces a method to tune the performance of off-the-shelf\ngraph-based indexes, focusing on the dimension of vectors, database size, and\nentry points of graph traversal. We utilize a black-box optimization algorithm\nto perform integrated tuning to meet the required levels of recall and Queries\nPer Second (QPS). We applied our approach to Task A of the SISAP 2023 Indexing\nChallenge and got second place in the 10M and 30M tracks. It improves\nperformance substantially compared to brute force methods. 
This research offers\na universally applicable tuning method for graph-based indexes, extending\nbeyond the specific conditions of the competition to broader uses.\n","authors":["Yutaro Oguri","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2309.00472v1.pdf","comment":"Accepted paper on 2nd place solution of SISAP 2023 Indexing Challenge\n Task A"},{"id":"http://arxiv.org/abs/2309.00468v1","updated":"2023-09-01T14:09:10Z","published":"2023-09-01T14:09:10Z","title":"An Improved Encoder-Decoder Framework for Food EnergyEstimation","summary":" Dietary assessment is essential to maintaining a healthy lifestyle. Automatic\nimage-based dietary assessment is a growing field of research due to the\nincreasing prevalence of image capturing devices (e.g. mobile phones). In this\nwork, we estimate food energy from a single monocular image, a difficult task\ndue to the limited hard-to-extract amount of energy information present in an\nimage. To do so, we employ an improved encoder-decoder framework for energy\nestimation; the encoder transforms the image into a representation embedded\nwith food energy information in an easier-to-extract format, which the decoder\nthen extracts the energy information from. To implement our method, we compile\na high-quality food image dataset verified by registered dietitians containing\neating scene images, food-item segmentation masks, and ground truth calorie\nvalues. Our method improves upon previous caloric estimation methods by over\n10\\% and 30 kCal in terms of MAPE and MAE respectively.\n","authors":["Jack Ma","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.00468v1.pdf","comment":"Accepted for Madima'23 in ACM Multimedia"},{"id":"http://arxiv.org/abs/2309.00464v1","updated":"2023-09-01T14:02:44Z","published":"2023-09-01T14:02:44Z","title":"A Theoretical and Practical Framework for Evaluating Uncertainty\n Calibration in Object Detection","summary":" The proliferation of Deep Neural Networks has resulted in machine learning\nsystems becoming increasingly more present in various real-world applications.\nConsequently, there is a growing demand for highly reliable models in these\ndomains, making the problem of uncertainty calibration pivotal, when\nconsidering the future of deep learning. This is especially true when\nconsidering object detection systems, that are commonly present in\nsafety-critical application such as autonomous driving and robotics. For this\nreason, this work presents a novel theoretical and practical framework to\nevaluate object detection systems in the context of uncertainty calibration.\nThe robustness of the proposed uncertainty calibration metrics is shown through\na series of representative experiments. Code for the proposed uncertainty\ncalibration metrics at:\nhttps://github.com/pedrormconde/Uncertainty_Calibration_Object_Detection.\n","authors":["Pedro Conde","Rui L. Lopes","Cristiano Premebida"],"pdf_url":"https://arxiv.org/pdf/2309.00464v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2008.11516v2","updated":"2023-09-01T14:02:37Z","published":"2020-08-26T12:24:23Z","title":"Making a Case for 3D Convolutions for Object Segmentation in Videos","summary":" The task of object segmentation in videos is usually accomplished by\nprocessing appearance and motion information separately using standard 2D\nconvolutional networks, followed by a learned fusion of the two sources of\ninformation. 
On the other hand, 3D convolutional networks have been\nsuccessfully applied for video classification tasks, but have not been\nleveraged as effectively to problems involving dense per-pixel interpretation\nof videos compared to their 2D convolutional counterparts and lag behind the\naforementioned networks in terms of performance. In this work, we show that 3D\nCNNs can be effectively applied to dense video prediction tasks such as salient\nobject segmentation. We propose a simple yet effective encoder-decoder network\narchitecture consisting entirely of 3D convolutions that can be trained\nend-to-end using a standard cross-entropy loss. To this end, we leverage an\nefficient 3D encoder, and propose a 3D decoder architecture, that comprises\nnovel 3D Global Convolution layers and 3D Refinement modules. Our approach\noutperforms existing state-of-the-arts by a large margin on the DAVIS'16\nUnsupervised, FBMS and ViSal dataset benchmarks in addition to being faster,\nthus showing that our architecture can efficiently learn expressive\nspatio-temporal features and produce high quality video segmentation masks. We\nhave made our code and trained models publicly available at\nhttps://github.com/sabarim/3DC-Seg.\n","authors":["Sabarinath Mahadevan","Ali Athar","Aljoša Ošep","Sebastian Hennen","Laura Leal-Taixé","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2008.11516v2.pdf","comment":"BMVC '20"},{"id":"http://arxiv.org/abs/2208.00487v3","updated":"2023-09-01T14:02:16Z","published":"2022-07-31T18:17:04Z","title":"One Object at a Time: Accurate and Robust Structure From Motion for\n Robots","summary":" A gaze-fixating robot perceives distance to the fixated object and relative\npositions of surrounding objects immediately, accurately, and robustly. We show\nhow fixation, which is the act of looking at one object while moving, exploits\nregularities in the geometry of 3D space to obtain this information. These\nregularities introduce rotation-translation couplings that are not commonly\nused in structure from motion. To validate, we use a Franka Emika Robot with an\nRGB camera. We a) find that error in distance estimate is less than 5 mm at a\ndistance of 15 cm, and b) show how relative position can be used to find\nobstacles under challenging scenarios. We combine accurate distance estimates\nand obstacle information into a reactive robot behavior that is able to pick up\nobjects of unknown size, while impeded by unforeseen obstacles. Project page:\nhttps://oxidification.com/p/one-object-at-a-time/ .\n","authors":["Aravind Battaje","Oliver Brock"],"pdf_url":"https://arxiv.org/pdf/2208.00487v3.pdf","comment":"v3: Add link to project page v2: Update DOI v1: Accepted at 2022\n IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)"},{"id":"http://arxiv.org/abs/2308.13392v2","updated":"2023-09-01T13:58:57Z","published":"2023-08-25T14:08:07Z","title":"Self-Supervised Representation Learning with Cross-Context Learning\n between Global and Hypercolumn Features","summary":" Whilst contrastive learning yields powerful representations by matching\ndifferent augmented views of the same instance, it lacks the ability to capture\nthe similarities between different instances. One popular way to address this\nlimitation is by learning global features (after the global pooling) to capture\ninter-instance relationships based on knowledge distillation, where the global\nfeatures of the teacher are used to guide the learning of the global features\nof the student. 
Inspired by cross-modality learning, we extend this existing\nframework that only learns from global features by encouraging the global\nfeatures and intermediate layer features to learn from each other. This leads\nto our novel self-supervised framework: cross-context learning between global\nand hypercolumn features (CGH), that enforces the consistency of instance\nrelations between low- and high-level semantics. Specifically, we stack the\nintermediate feature maps to construct a hypercolumn representation so that we\ncan measure instance relations using two contexts (hypercolumn and global\nfeature) separately, and then use the relations of one context to guide the\nlearning of the other. This cross-context learning allows the model to learn\nfrom the differences between the two contexts. The experimental results on\nlinear classification and downstream tasks show that our method outperforms the\nstate-of-the-art methods.\n","authors":["Zheng Gao","Chen Feng","Ioannis Patras"],"pdf_url":"https://arxiv.org/pdf/2308.13392v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00460v1","updated":"2023-09-01T13:46:24Z","published":"2023-09-01T13:46:24Z","title":"dacl10k: Benchmark for Semantic Bridge Damage Segmentation","summary":" Reliably identifying reinforced concrete defects (RCDs)plays a crucial role\nin assessing the structural integrity, traffic safety, and long-term durability\nof concrete bridges, which represent the most common bridge type worldwide.\nNevertheless, available datasets for the recognition of RCDs are small in terms\nof size and class variety, which questions their usability in real-world\nscenarios and their role as a benchmark. Our contribution to this problem is\n\"dacl10k\", an exceptionally diverse RCD dataset for multi-label semantic\nsegmentation comprising 9,920 images deriving from real-world bridge\ninspections. dacl10k distinguishes 12 damage classes as well as 6 bridge\ncomponents that play a key role in the building assessment and recommending\nactions, such as restoration works, traffic load limitations or bridge\nclosures. In addition, we examine baseline models for dacl10k which are\nsubsequently evaluated. The best model achieves a mean intersection-over-union\nof 0.42 on the test set. dacl10k, along with our baselines, will be openly\naccessible to researchers and practitioners, representing the currently biggest\ndataset regarding number of images and class diversity for semantic\nsegmentation in the bridge inspection domain.\n","authors":["Johannes Flotzinger","Philipp J. Rösch","Thomas Braml"],"pdf_url":"https://arxiv.org/pdf/2309.00460v1.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.00451v1","updated":"2023-09-01T13:29:26Z","published":"2023-09-01T13:29:26Z","title":"Unsupervised bias discovery in medical image segmentation","summary":" It has recently been shown that deep learning models for anatomical\nsegmentation in medical images can exhibit biases against certain\nsub-populations defined in terms of protected attributes like sex or ethnicity.\nIn this context, auditing fairness of deep segmentation models becomes crucial.\nHowever, such audit process generally requires access to ground-truth\nsegmentation masks for the target population, which may not always be\navailable, especially when going from development to deployment. Here we\npropose a new method to anticipate model biases in biomedical image\nsegmentation in the absence of ground-truth annotations. 
Our unsupervised bias\ndiscovery method leverages the reverse classification accuracy framework to\nestimate segmentation quality. Through numerical experiments in synthetic and\nrealistic scenarios we show how our method is able to successfully anticipate\nfairness issues in the absence of ground-truth labels, constituting a novel and\nvaluable tool in this field.\n","authors":["Nicolás Gaggion","Rodrigo Echeveste","Lucas Mansilla","Diego H. Milone","Enzo Ferrante"],"pdf_url":"https://arxiv.org/pdf/2309.00451v1.pdf","comment":"Accepted for publication at FAIMI 2023 (Fairness of AI in Medical\n Imaging) at MICCAI"},{"id":"http://arxiv.org/abs/2003.08429v4","updated":"2023-09-01T13:25:14Z","published":"2020-03-18T18:40:52Z","title":"STEm-Seg: Spatio-temporal Embeddings for Instance Segmentation in Videos","summary":" Existing methods for instance segmentation in videos typically involve\nmulti-stage pipelines that follow the tracking-by-detection paradigm and model\na video clip as a sequence of images. Multiple networks are used to detect\nobjects in individual frames, and then associate these detections over time.\nHence, these methods are often non-end-to-end trainable and highly tailored to\nspecific tasks. In this paper, we propose a different approach that is\nwell-suited to a variety of tasks involving instance segmentation in videos. In\nparticular, we model a video clip as a single 3D spatio-temporal volume, and\npropose a novel approach that segments and tracks instances across space and\ntime in a single stage. Our problem formulation is centered around the idea of\nspatio-temporal embeddings which are trained to cluster pixels belonging to a\nspecific object instance over an entire video clip. To this end, we introduce\n(i) novel mixing functions that enhance the feature representation of\nspatio-temporal embeddings, and (ii) a single-stage, proposal-free network that\ncan reason about temporal context. Our network is trained end-to-end to learn\nspatio-temporal embeddings as well as parameters required to cluster these\nembeddings, thus simplifying inference. Our method achieves state-of-the-art\nresults across multiple datasets and tasks. Code and models are available at\nhttps://github.com/sabarim/STEm-Seg.\n","authors":["Ali Athar","Sabarinath Mahadevan","Aljoša Ošep","Laura Leal-Taixé","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2003.08429v4.pdf","comment":"ECCV 2020 28 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.00434v1","updated":"2023-09-01T13:02:19Z","published":"2023-09-01T13:02:19Z","title":"Improving the matching of deformable objects by learning to detect\n keypoints","summary":" We propose a novel learned keypoint detection method to increase the number\nof correct matches for the task of non-rigid image correspondence. By\nleveraging true correspondences acquired by matching annotated image pairs with\na specified descriptor extractor, we train an end-to-end convolutional neural\nnetwork (CNN) to find keypoint locations that are more appropriate to the\nconsidered descriptor. For that, we apply geometric and photometric warpings to\nimages to generate a supervisory signal, allowing the optimization of the\ndetector. Experiments demonstrate that our method enhances the Mean Matching\nAccuracy of numerous descriptors when used in conjunction with our detection\nmethod, while outperforming the state-of-the-art keypoint detectors on real\nimages of non-rigid objects by 20 p.p. 
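STEm-Seg, summarized above, trains per-pixel spatio-temporal embeddings so that pixels of the same instance cluster together across the whole clip. A toy pull/push objective over such embeddings is sketched below; it is a generic clustering loss with an assumed margin, not the paper's mixing functions or clustering parameters.

```python
import torch

def embedding_cluster_loss(embeddings, instance_ids, push_margin=1.0):
    """embeddings: (N, D) per-pixel embeddings flattened over an entire clip (T*H*W).
    instance_ids: (N,) integer instance labels; 0 is treated as background and ignored."""
    loss_pull, centers = 0.0, []
    ids = [i for i in instance_ids.unique().tolist() if i != 0]
    for i in ids:
        emb_i = embeddings[instance_ids == i]
        center = emb_i.mean(dim=0)
        centers.append(center)
        loss_pull = loss_pull + ((emb_i - center) ** 2).sum(dim=1).mean()  # pull pixels to their instance center
    loss_push = 0.0
    for a in range(len(centers)):
        for b in range(a + 1, len(centers)):
            dist = torch.norm(centers[a] - centers[b])
            loss_push = loss_push + torch.clamp(push_margin - dist, min=0.0) ** 2  # push different instances apart
    return loss_pull / max(len(ids), 1) + loss_push

# Dummy clip: 4 frames of 16x16 pixels, 8-D embeddings, two instances plus background.
emb = torch.randn(4 * 16 * 16, 8, requires_grad=True)
ids = torch.randint(0, 3, (4 * 16 * 16,))
embedding_cluster_loss(emb, ids).backward()
```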
We also apply our method on the complex\nreal-world task of object retrieval where our detector performs on par with the\nfinest keypoint detectors currently available for this task. The source code\nand trained models are publicly available at\nhttps://github.com/verlab/LearningToDetect_PRL_2023\n","authors":["Felipe Cadar"," Welerson","Vaishnavi Kanagasabapathi","Guilherme Potje","Renato Martins","Erickson R. Nascimento"],"pdf_url":"https://arxiv.org/pdf/2309.00434v1.pdf","comment":"This is the accepted version of the paper to appear at Pattern\n Recognition Letters (PRL). The final journal version will be available at\n https://doi.org/10.1016/j.patrec.2023.08.012"},{"id":"http://arxiv.org/abs/2306.16805v2","updated":"2023-09-01T12:53:44Z","published":"2023-06-29T09:35:53Z","title":"CLIPAG: Towards Generator-Free Text-to-Image Generation","summary":" Perceptually Aligned Gradients (PAG) refer to an intriguing property observed\nin robust image classification models, wherein their input gradients align with\nhuman perception and pose semantic meanings. While this phenomenon has gained\nsignificant research attention, it was solely studied in the context of\nunimodal vision-only architectures. In this work, we extend the study of PAG to\nVision-Language architectures, which form the foundations for diverse\nimage-text tasks and applications. Through an adversarial robustification\nfinetuning of CLIP, we demonstrate that robust Vision-Language models exhibit\nPAG in contrast to their vanilla counterparts. This work reveals the merits of\nCLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we\nshow that seamlessly integrating CLIPAG in a \"plug-n-play\" manner leads to\nsubstantial improvements in vision-language generative applications.\nFurthermore, leveraging its PAG property, CLIPAG enables text-to-image\ngeneration without any generative model, which typically requires huge\ngenerators.\n","authors":["Roy Ganz","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2306.16805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06443v2","updated":"2023-09-01T12:38:13Z","published":"2023-01-16T14:25:19Z","title":"Sparse resultant based minimal solvers in computer vision and their\n connection with the action matrix","summary":" Many computer vision applications require robust and efficient estimation of\ncamera geometry from a minimal number of input data measurements, i.e., solving\nminimal problems in a RANSAC framework. Minimal problems are usually formulated\nas complex systems of sparse polynomials. The systems usually are\noverdetermined and consist of polynomials with algebraically constrained\ncoefficients. Most state-of-the-art efficient polynomial solvers are based on\nthe action matrix method that has been automated and highly optimized in recent\nyears. On the other hand, the alternative theory of sparse resultants and\nNewton polytopes has been less successful for generating efficient solvers,\nprimarily because the polytopes do not respect the constraints on the\ncoefficients. Therefore, in this paper, we propose a simple iterative scheme to\ntest various subsets of the Newton polytopes and search for the most efficient\nsolver. Moreover, we propose to use an extra polynomial with a special form to\nfurther improve the solver efficiency via a Schur complement computation. We\nshow that for some camera geometry problems our extra polynomial-based method\nleads to smaller and more stable solvers than the state-of-the-art Grobner\nbasis-based solvers. 
The proposed method can be fully automated and\nincorporated into existing tools for automatic generation of efficient\npolynomial solvers. It provides a competitive alternative to popular Grobner\nbasis-based methods for minimal problems in computer vision. We also study the\nconditions under which the minimal solvers generated by the state-of-the-art\naction matrix-based methods and the proposed extra polynomial resultant-based\nmethod, are equivalent. Specifically we consider a step-by-step comparison\nbetween the approaches based on the action matrix and the sparse resultant,\nfollowed by a set of substitutions, which would lead to equivalent minimal\nsolvers.\n","authors":["Snehal Bhayani","Janne Heikkilä","Zuzana Kukelova"],"pdf_url":"https://arxiv.org/pdf/2301.06443v2.pdf","comment":"arXiv admin note: text overlap with arXiv:1912.10268"},{"id":"http://arxiv.org/abs/2309.00410v1","updated":"2023-09-01T12:07:40Z","published":"2023-09-01T12:07:40Z","title":"Selective Scene Text Removal","summary":" Scene text removal (STR) is the image transformation task to remove text\nregions in scene images. The conventional STR methods remove all scene text.\nThis means that the existing methods cannot select text to be removed. In this\npaper, we propose a novel task setting named selective scene text removal\n(SSTR) that removes only target words specified by the user. Although SSTR is a\nmore complex task than STR, the proposed multi-module structure enables\nefficient training for SSTR. Experimental results show that the proposed method\ncan remove target words as expected.\n","authors":["Hayato Mitani","Akisato Kimura","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2309.00410v1.pdf","comment":"12 pages, 8 figures, Accepted at the 34th British Machine Vision\n Conference"},{"id":"http://arxiv.org/abs/2309.00399v1","updated":"2023-09-01T11:15:50Z","published":"2023-09-01T11:15:50Z","title":"Fine-grained Recognition with Learnable Semantic Data Augmentation","summary":" Fine-grained image recognition is a longstanding computer vision challenge\nthat focuses on differentiating objects belonging to multiple subordinate\ncategories within the same meta-category. Since images belonging to the same\nmeta-category usually share similar visual appearances, mining discriminative\nvisual cues is the key to distinguishing fine-grained categories. Although\ncommonly used image-level data augmentation techniques have achieved great\nsuccess in generic image classification problems, they are rarely applied in\nfine-grained scenarios, because their random editing-region behavior is prone\nto destroy the discriminative visual cues residing in the subtle regions. In\nthis paper, we propose diversifying the training data at the feature-level to\nalleviate the discriminative region loss problem. Specifically, we produce\ndiversified augmented samples by translating image features along semantically\nmeaningful directions. The semantic directions are estimated with a covariance\nprediction network, which predicts a sample-wise covariance matrix to adapt to\nthe large intra-class variation inherent in fine-grained images. 
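The fine-grained recognition abstract above augments training data at the feature level by translating features along semantic directions drawn from a sample-wise covariance. A simplified sketch is shown below; it uses a diagonal covariance predicted per sample and omits the full covariance matrix and the meta-learning loop described in the abstract.

```python
import torch
import torch.nn as nn

class DiagonalSemanticAugment(nn.Module):
    """Predicts a per-sample diagonal covariance and perturbs features along it."""
    def __init__(self, feat_dim, strength=0.5):
        super().__init__()
        self.cov_head = nn.Linear(feat_dim, feat_dim)   # predicts per-dimension log-variances
        self.strength = strength

    def forward(self, feats):
        std = (0.5 * self.cov_head(feats)).exp()        # per-dimension standard deviation
        noise = torch.randn_like(feats) * std * self.strength
        return feats + noise                             # semantically translated features

feat_dim, num_classes = 256, 200
backbone_feats = torch.randn(16, feat_dim)               # features from any backbone
augment = DiagonalSemanticAugment(feat_dim)
classifier = nn.Linear(feat_dim, num_classes)
logits = classifier(augment(backbone_feats))
loss = nn.functional.cross_entropy(logits, torch.randint(0, num_classes, (16,)))
loss.backward()
```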
Furthermore,\nthe covariance prediction network is jointly optimized with the classification\nnetwork in a meta-learning manner to alleviate the degenerate solution problem.\nExperiments on four competitive fine-grained recognition benchmarks\n(CUB-200-2011, Stanford Cars, FGVC Aircrafts, NABirds) demonstrate that our\nmethod significantly improves the generalization performance on several popular\nclassification networks (e.g., ResNets, DenseNets, EfficientNets, RegNets and\nViT). Combined with a recently proposed method, our semantic data augmentation\napproach achieves state-of-the-art performance on the CUB-200-2011 dataset. The\nsource code will be released.\n","authors":["Yifan Pu","Yizeng Han","Yulin Wang","Junlan Feng","Chao Deng","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2309.00399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00398v1","updated":"2023-09-01T11:14:43Z","published":"2023-09-01T11:14:43Z","title":"VideoGen: A Reference-Guided Latent Diffusion Approach for High\n Definition Text-to-Video Generation","summary":" In this paper, we present VideoGen, a text-to-video generation approach,\nwhich can generate a high-definition video with high frame fidelity and strong\ntemporal consistency using reference-guided latent diffusion. We leverage an\noff-the-shelf text-to-image generation model, e.g., Stable Diffusion, to\ngenerate an image with high content quality from the text prompt, as a\nreference image to guide video generation. Then, we introduce an efficient\ncascaded latent diffusion module conditioned on both the reference image and\nthe text prompt, for generating latent video representations, followed by a\nflow-based temporal upsampling step to improve the temporal resolution.\nFinally, we map latent video representations into a high-definition video\nthrough an enhanced video decoder. During training, we use the first frame of a\nground-truth video as the reference image for training the cascaded latent\ndiffusion module. The main characterises of our approach include: the reference\nimage generated by the text-to-image model improves the visual fidelity; using\nit as the condition makes the diffusion model focus more on learning the video\ndynamics; and the video decoder is trained over unlabeled video data, thus\nbenefiting from high-quality easily-available videos. VideoGen sets a new\nstate-of-the-art in text-to-video generation in terms of both qualitative and\nquantitative evaluation.\n","authors":["Xin Li","Wenqing Chu","Ye Wu","Weihang Yuan","Fanglong Liu","Qi Zhang","Fu Li","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00398v1.pdf","comment":"8pages, 8figures"},{"id":"http://arxiv.org/abs/2306.07881v2","updated":"2023-09-01T11:09:36Z","published":"2023-06-13T16:18:51Z","title":"Viewset Diffusion: (0-)Image-Conditioned 3D Generative Models from 2D\n Data","summary":" We present Viewset Diffusion, a diffusion-based generator that outputs 3D\nobjects while only using multi-view 2D data for supervision. We note that there\nexists a one-to-one mapping between viewsets, i.e., collections of several 2D\nviews of an object, and 3D models. Hence, we train a diffusion model to\ngenerate viewsets, but design the neural network generator to reconstruct\ninternally corresponding 3D models, thus generating those too. 
We fit a\ndiffusion model to a large number of viewsets for a given category of objects.\nThe resulting generator can be conditioned on zero, one or more input views.\nConditioned on a single view, it performs 3D reconstruction accounting for the\nambiguity of the task and allowing to sample multiple solutions compatible with\nthe input. The model performs reconstruction efficiently, in a feed-forward\nmanner, and is trained using only rendering losses using as few as three views\nper viewset. Project page: szymanowiczs.github.io/viewset-diffusion.\n","authors":["Stanislaw Szymanowicz","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2306.07881v2.pdf","comment":"International Conference on Computer Vision 2023"},{"id":"http://arxiv.org/abs/2309.00385v1","updated":"2023-09-01T10:46:57Z","published":"2023-09-01T10:46:57Z","title":"Dense Voxel 3D Reconstruction Using a Monocular Event Camera","summary":" Event cameras are sensors inspired by biological systems that specialize in\ncapturing changes in brightness. These emerging cameras offer many advantages\nover conventional frame-based cameras, including high dynamic range, high frame\nrates, and extremely low power consumption. Due to these advantages, event\ncameras have increasingly been adapted in various fields, such as frame\ninterpolation, semantic segmentation, odometry, and SLAM. However, their\napplication in 3D reconstruction for VR applications is underexplored. Previous\nmethods in this field mainly focused on 3D reconstruction through depth map\nestimation. Methods that produce dense 3D reconstruction generally require\nmultiple cameras, while methods that utilize a single event camera can only\nproduce a semi-dense result. Other single-camera methods that can produce dense\n3D reconstruction rely on creating a pipeline that either incorporates the\naforementioned methods or other existing Structure from Motion (SfM) or\nMulti-view Stereo (MVS) methods. In this paper, we propose a novel approach for\nsolving dense 3D reconstruction using only a single event camera. To the best\nof our knowledge, our work is the first attempt in this regard. Our preliminary\nresults demonstrate that the proposed method can produce visually\ndistinguishable dense 3D reconstructions directly without requiring pipelines\nlike those used by existing methods. Additionally, we have created a synthetic\ndataset with $39,739$ object scans using an event camera simulator. This\ndataset will help accelerate other relevant research in this field.\n","authors":["Haodong Chen","Vera Chung","Li Tan","Xiaoming Chen"],"pdf_url":"https://arxiv.org/pdf/2309.00385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00378v1","updated":"2023-09-01T10:27:04Z","published":"2023-09-01T10:27:04Z","title":"Long-Term Memorability On Advertisements","summary":" Marketers spend billions of dollars on advertisements but to what end? At the\npurchase time, if customers cannot recognize a brand for which they saw an ad,\nthe money spent on the ad is essentially wasted. Despite its importance in\nmarketing, until now, there has been no study on the memorability of ads in the\nML literature. Most studies have been conducted on short-term recall (<5 mins)\non specific content types like object and action videos. 
On the other hand, the\nadvertising industry only cares about long-term memorability (a few hours or\nlonger), and advertisements are almost always highly multimodal, depicting a\nstory through its different modalities (text, images, and videos). With this\nmotivation, we conduct the first large scale memorability study consisting of\n1203 participants and 2205 ads covering 276 brands. Running statistical tests\nover different participant subpopulations and ad-types, we find many\ninteresting insights into what makes an ad memorable - both content and human\nfactors. For example, we find that brands which use commercials with fast\nmoving scenes are more memorable than those with slower scenes (p=8e-10) and\nthat people who use ad-blockers remember lower number of ads than those who\ndon't (p=5e-3). Further, with the motivation of simulating the memorability of\nmarketing materials for a particular audience, ultimately helping create one,\nwe present a novel model, Sharingan, trained to leverage real-world knowledge\nof LLMs and visual knowledge of visual encoders to predict the memorability of\na content. We test our model on all the prominent memorability datasets in\nliterature (both images and videos) and achieve state of the art across all of\nthem. We conduct extensive ablation studies across memory types, modality,\nbrand, and architectural choices to find insights into what drives memory.\n","authors":["Harini S I","Somesh Singh","Yaman K Singla","Aanisha Bhattacharyya","Veeky Baths","Changyou Chen","Rajiv Ratn Shah","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00372v1","updated":"2023-09-01T10:10:46Z","published":"2023-09-01T10:10:46Z","title":"On the Localization of Ultrasound Image Slices within Point Distribution\n Models","summary":" Thyroid disorders are most commonly diagnosed using high-resolution\nUltrasound (US). Longitudinal nodule tracking is a pivotal diagnostic protocol\nfor monitoring changes in pathological thyroid morphology. This task, however,\nimposes a substantial cognitive load on clinicians due to the inherent\nchallenge of maintaining a mental 3D reconstruction of the organ. We thus\npresent a framework for automated US image slice localization within a 3D shape\nrepresentation to ease how such sonographic diagnoses are carried out. Our\nproposed method learns a common latent embedding space between US image patches\nand the 3D surface of an individual's thyroid shape, or a statistical\naggregation in the form of a statistical shape model (SSM), via contrastive\nmetric learning. Using cross-modality registration and Procrustes analysis, we\nleverage features from our model to register US slices to a 3D mesh\nrepresentation of the thyroid shape. We demonstrate that our multi-modal\nregistration framework can localize images on the 3D surface topology of a\npatient-specific organ and the mean shape of an SSM. Experimental results\nindicate slice positions can be predicted within an average of 1.2 mm of the\nground-truth slice location on the patient-specific 3D anatomy and 4.6 mm on\nthe SSM, exemplifying its usefulness for slice localization during sonographic\nacquisitions. 
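The slice-localization abstract above registers ultrasound slices to a 3D shape using cross-modality features followed by Procrustes analysis. The sketch below shows generic rigid Procrustes alignment between matched 3D point sets with synthetic correspondences; it is not the paper's registration pipeline, only the alignment step it names.

```python
import numpy as np

def procrustes_rigid(source, target):
    """Least-squares rotation R and translation t with R @ source_i + t ≈ target_i (Kabsch)."""
    mu_s, mu_t = source.mean(axis=0), target.mean(axis=0)
    s_c, t_c = source - mu_s, target - mu_t
    u, _, vt = np.linalg.svd(t_c.T @ s_c)
    d = np.sign(np.linalg.det(u @ vt))               # avoid reflections
    rot = u @ np.diag([1.0, 1.0, d]) @ vt
    trans = mu_t - rot @ mu_s
    return rot, trans

# Synthetic check: recover a known rotation and translation from matched points.
rng = np.random.default_rng(0)
src = rng.normal(size=(50, 3))
angle = np.pi / 6
true_rot = np.array([[np.cos(angle), -np.sin(angle), 0.0],
                     [np.sin(angle),  np.cos(angle), 0.0],
                     [0.0, 0.0, 1.0]])
dst = src @ true_rot.T + np.array([0.1, -0.2, 0.3])
rot, trans = procrustes_rigid(src, dst)
assert np.allclose(rot, true_rot, atol=1e-6)
```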
Code is publically available:\n\\href{https://github.com/vuenc/slice-to-shape}{https://github.com/vuenc/slice-to-shape}\n","authors":["Lennart Bastian","Vincent Bürgin","Ha Young Kim","Alexander Baumann","Benjamin Busam","Mahdi Saleh","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2309.00372v1.pdf","comment":"ShapeMI Workshop @ MICCAI 2023; 12 pages 2 figures"},{"id":"http://arxiv.org/abs/2308.16891v2","updated":"2023-09-01T10:00:11Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural\nfeature $\\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v2.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2207.00026v4","updated":"2023-09-01T09:58:15Z","published":"2022-06-30T18:00:04Z","title":"LaserMix for Semi-Supervised LiDAR Semantic Segmentation","summary":" Densely annotating LiDAR point clouds is costly, which restrains the\nscalability of fully-supervised learning methods. In this work, we study the\nunderexplored semi-supervised learning (SSL) in LiDAR segmentation. Our core\nidea is to leverage the strong spatial cues of LiDAR point clouds to better\nexploit unlabeled data. We propose LaserMix to mix laser beams from different\nLiDAR scans, and then encourage the model to make consistent and confident\npredictions before and after mixing. Our framework has three appealing\nproperties: 1) Generic: LaserMix is agnostic to LiDAR representations (e.g.,\nrange view and voxel), and hence our SSL framework can be universally applied.\n2) Statistically grounded: We provide a detailed analysis to theoretically\nexplain the applicability of the proposed framework. 3) Effective:\nComprehensive experimental analysis on popular LiDAR segmentation datasets\n(nuScenes, SemanticKITTI, and ScribbleKITTI) demonstrates our effectiveness and\nsuperiority. Notably, we achieve competitive results over fully-supervised\ncounterparts with 2x to 5x fewer labels and improve the supervised-only\nbaseline significantly by 10.8% on average. 
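LaserMix, summarized above, mixes laser beams from two LiDAR scans and then encourages consistent predictions before and after mixing. A rough NumPy sketch of the mixing step is given below: it swaps alternating inclination-angle bins between two point clouds, where the bin count and alternating layout are illustrative assumptions rather than the paper's exact partitioning.

```python
import numpy as np

def lasermix(points_a, points_b, num_bins=6):
    """Swap alternating inclination-angle bins between two LiDAR scans.
    points_*: (N, 4) arrays of x, y, z, intensity."""
    def inclination(pts):
        r = np.linalg.norm(pts[:, :2], axis=1)
        return np.arctan2(pts[:, 2], r)              # angle above the sensor's horizontal plane

    lo = min(inclination(points_a).min(), inclination(points_b).min())
    hi = max(inclination(points_a).max(), inclination(points_b).max())
    edges = np.linspace(lo, hi + 1e-6, num_bins + 1)

    def even_bin_mask(pts):
        bins = np.digitize(inclination(pts), edges) - 1
        return bins % 2 == 0                          # points falling in "even" inclination bins

    even_a, even_b = even_bin_mask(points_a), even_bin_mask(points_b)
    mixed_1 = np.concatenate([points_a[even_a], points_b[~even_b]])   # even bins from A, odd bins from B
    mixed_2 = np.concatenate([points_b[even_b], points_a[~even_a]])   # and the complement
    return mixed_1, mixed_2

scan_a = np.random.randn(1000, 4)
scan_b = np.random.randn(1000, 4)
mix_1, mix_2 = lasermix(scan_a, scan_b)
```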
We hope this concise yet\nhigh-performing framework could facilitate future research in semi-supervised\nLiDAR segmentation. Code is publicly available.\n","authors":["Lingdong Kong","Jiawei Ren","Liang Pan","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2207.00026v4.pdf","comment":"CVPR 2023 (Highlight); 27 pages, 11 figures, 12 tables; Code at\n https://github.com/ldkong1205/LaserMix"},{"id":"http://arxiv.org/abs/2309.00359v1","updated":"2023-09-01T09:34:49Z","published":"2023-09-01T09:34:49Z","title":"Large Content And Behavior Models To Understand, Simulate, And Optimize\n Content And Behavior","summary":" Shannon, in his seminal paper introducing information theory, divided the\ncommunication into three levels: technical, semantic, and effectivenss. While\nthe technical level is concerned with accurate reconstruction of transmitted\nsymbols, the semantic and effectiveness levels deal with the inferred meaning\nand its effect on the receiver. Thanks to telecommunications, the first level\nproblem has produced great advances like the internet. Large Language Models\n(LLMs) make some progress towards the second goal, but the third level still\nremains largely untouched. The third problem deals with predicting and\noptimizing communication for desired receiver behavior. LLMs, while showing\nwide generalization capabilities across a wide range of tasks, are unable to\nsolve for this. One reason for the underperformance could be a lack of\n\"behavior tokens\" in LLMs' training corpora. Behavior tokens define receiver\nbehavior over a communication, such as shares, likes, clicks, purchases,\nretweets, etc. While preprocessing data for LLM training, behavior tokens are\noften removed from the corpora as noise. Therefore, in this paper, we make some\ninitial progress towards reintroducing behavior tokens in LLM training. The\ntrained models, other than showing similar performance to LLMs on content\nunderstanding tasks, show generalization capabilities on behavior simulation,\ncontent simulation, behavior understanding, and behavior domain adaptation.\nUsing a wide range of tasks on two corpora, we show results on all these\ncapabilities. We call these models Large Content and Behavior Models (LCBMs).\nFurther, to spur more research on LCBMs, we release our new Content Behavior\nCorpus (CBC), a repository containing communicator, message, and corresponding\nreceiver behavior.\n","authors":["Ashmit Khandelwal","Aditya Agrawal","Aanisha Bhattacharyya","Yaman K Singla","Somesh Singh","Uttaran Bhattacharya","Ishita Dasgupta","Stefano Petrangeli","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00350v1","updated":"2023-09-01T09:15:06Z","published":"2023-09-01T09:15:06Z","title":"How You Split Matters: Data Leakage and Subject Characteristics Studies\n in Longitudinal Brain MRI Analysis","summary":" Deep learning models have revolutionized the field of medical image analysis,\noffering significant promise for improved diagnostics and patient care.\nHowever, their performance can be misleadingly optimistic due to a hidden\npitfall called 'data leakage'. In this study, we investigate data leakage in 3D\nmedical imaging, specifically using 3D Convolutional Neural Networks (CNNs) for\nbrain MRI analysis. 
While 3D CNNs appear less prone to leakage than 2D\ncounterparts, improper data splitting during cross-validation (CV) can still\npose issues, especially with longitudinal imaging data containing repeated\nscans from the same subject. We explore the impact of different data splitting\nstrategies on model performance for longitudinal brain MRI analysis and\nidentify potential data leakage concerns. GradCAM visualization helps reveal\nshortcuts in CNN models caused by identity confounding, where the model learns\nto identify subjects along with diagnostic features. Our findings, consistent\nwith prior research, underscore the importance of subject-wise splitting and\nevaluating our model further on hold-out data from different subjects to ensure\nthe integrity and reliability of deep learning models in medical image\nanalysis.\n","authors":["Dewinda Julianensi Rumala"],"pdf_url":"https://arxiv.org/pdf/2309.00350v1.pdf","comment":"submitted to MICCAI FAIMI 2023"},{"id":"http://arxiv.org/abs/2305.16759v3","updated":"2023-09-01T09:13:10Z","published":"2023-05-26T09:21:56Z","title":"StyleHumanCLIP: Text-guided Garment Manipulation for StyleGAN-Human","summary":" This paper tackles text-guided control of StyleGAN for editing garments in\nfull-body human images. Existing StyleGAN-based methods suffer from handling\nthe rich diversity of garments and body shapes and poses. We propose a\nframework for text-guided full-body human image synthesis via an\nattention-based latent code mapper, which enables more disentangled control of\nStyleGAN than existing mappers. Our latent code mapper adopts an attention\nmechanism that adaptively manipulates individual latent codes on different\nStyleGAN layers under text guidance. In addition, we introduce feature-space\nmasking at inference time to avoid unwanted changes caused by text inputs. Our\nquantitative and qualitative evaluations reveal that our method can control\ngenerated images more faithfully to given texts than existing methods.\n","authors":["Takato Yoshikawa","Yuki Endo","Yoshihiro Kanamori"],"pdf_url":"https://arxiv.org/pdf/2305.16759v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00348v1","updated":"2023-09-01T09:10:04Z","published":"2023-09-01T09:10:04Z","title":"MuraNet: Multi-task Floor Plan Recognition with Relation Attention","summary":" The recognition of information in floor plan data requires the use of\ndetection and segmentation models. However, relying on several single-task\nmodels can result in ineffective utilization of relevant information when there\nare multiple tasks present simultaneously. To address this challenge, we\nintroduce MuraNet, an attention-based multi-task model for segmentation and\ndetection tasks in floor plan data. In MuraNet, we adopt a unified encoder\ncalled MURA as the backbone with two separated branches: an enhanced\nsegmentation decoder branch and a decoupled detection head branch based on\nYOLOX, for segmentation and detection tasks respectively. The architecture of\nMuraNet is designed to leverage the fact that walls, doors, and windows usually\nconstitute the primary structure of a floor plan's architecture. By jointly\ntraining the model on both detection and segmentation tasks, we believe MuraNet\ncan effectively extract and utilize relevant features for both tasks. Our\nexperiments on the CubiCasa5k public dataset show that MuraNet improves\nconvergence speed during training compared to single-task models like U-Net and\nYOLOv3. 
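The data-leakage study above stresses subject-wise splitting for longitudinal MRI, so that repeated scans of the same subject never end up on both sides of a train/test split. A minimal sketch with scikit-learn's GroupKFold follows; the file names, subject IDs, and labels are hypothetical placeholders.

```python
from sklearn.model_selection import GroupKFold

# Hypothetical longitudinal dataset: three scans per subject for ten subjects.
scans = [f"subj{s}_visit{v}.nii.gz" for s in range(10) for v in range(3)]
subjects = [s for s in range(10) for _ in range(3)]      # group label = subject ID
labels = [s % 2 for s in subjects]                       # dummy diagnostic labels

cv = GroupKFold(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(cv.split(scans, labels, groups=subjects)):
    train_subjects = {subjects[i] for i in train_idx}
    test_subjects = {subjects[i] for i in test_idx}
    assert train_subjects.isdisjoint(test_subjects)      # no subject appears in both splits
    print(f"fold {fold}: {len(train_idx)} train scans, {len(test_idx)} test scans")
```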
Moreover, we observe improvements in the average AP and IoU in\ndetection and segmentation tasks, respectively.Our ablation experiments\ndemonstrate that the attention-based unified backbone of MuraNet achieves\nbetter feature extraction in floor plan recognition tasks, and the use of\ndecoupled multi-head branches for different tasks further improves model\nperformance. We believe that our proposed MuraNet model can address the\ndisadvantages of single-task models and improve the accuracy and efficiency of\nfloor plan data recognition.\n","authors":["Lingxiao Huang","Jung-Hsuan Wu","Chiching Wei","Wilson Li"],"pdf_url":"https://arxiv.org/pdf/2309.00348v1.pdf","comment":"Document Analysis and Recognition - ICDAR 2023 Workshops. ICDAR 2023.\n Lecture Notes in Computer Science, vol 14193. Springer, Cham"},{"id":"http://arxiv.org/abs/2309.00347v1","updated":"2023-09-01T09:08:21Z","published":"2023-09-01T09:08:21Z","title":"Towards Contrastive Learning in Music Video Domain","summary":" Contrastive learning is a powerful way of learning multimodal representations\nacross various domains such as image-caption retrieval and audio-visual\nrepresentation learning. In this work, we investigate if these findings\ngeneralize to the domain of music videos. Specifically, we create a dual\nen-coder for the audio and video modalities and train it using a bidirectional\ncontrastive loss. For the experiments, we use an industry dataset containing\n550 000 music videos as well as the public Million Song Dataset, and evaluate\nthe quality of learned representations on the downstream tasks of music tagging\nand genre classification. Our results indicate that pre-trained networks\nwithout contrastive fine-tuning outperform our contrastive learning approach\nwhen evaluated on both tasks. To gain a better understanding of the reasons\ncontrastive learning was not successful for music videos, we perform a\nqualitative analysis of the learned representations, revealing why contrastive\nlearning might have difficulties uniting embeddings from two modalities. Based\non these findings, we outline possible directions for future work. To\nfacilitate the reproducibility of our results, we share our code and the\npre-trained model.\n","authors":["Karel Veldkamp","Mariya Hendriksen","Zoltán Szlávik","Alexander Keijser"],"pdf_url":"https://arxiv.org/pdf/2309.00347v1.pdf","comment":"6 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.01525v2","updated":"2023-09-01T08:52:02Z","published":"2023-08-03T04:04:03Z","title":"VisAlign: Dataset for Measuring the Degree of Alignment between AI and\n Humans in Visual Perception","summary":" AI alignment refers to models acting towards human-intended goals,\npreferences, or ethical principles. Given that most large-scale deep learning\nmodels act as black boxes and cannot be manually controlled, analyzing the\nsimilarity between models and humans can be a proxy measure for ensuring AI\nsafety. In this paper, we focus on the models' visual perception alignment with\nhumans, further referred to as AI-human visual alignment. Specifically, we\npropose a new dataset for measuring AI-human visual alignment in terms of image\nclassification, a fundamental task in machine perception. In order to evaluate\nAI-human visual alignment, a dataset should encompass samples with various\nscenarios that may arise in the real world and have gold human perception\nlabels. 
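The music-video study above trains a dual encoder for audio and video with a bidirectional contrastive loss. Below is a generic symmetric InfoNCE sketch of that objective; the temperature, batch size, and embedding dimension are assumptions, and the encoders themselves are stood in by random tensors.

```python
import torch
import torch.nn.functional as F

def bidirectional_contrastive_loss(audio_emb, video_emb, temperature=0.07):
    """Symmetric InfoNCE: each audio clip should match its own video, and vice versa."""
    audio_emb = F.normalize(audio_emb, dim=1)
    video_emb = F.normalize(video_emb, dim=1)
    logits = audio_emb @ video_emb.t() / temperature       # (B, B) similarity matrix
    targets = torch.arange(audio_emb.size(0), device=audio_emb.device)
    loss_a2v = F.cross_entropy(logits, targets)            # audio -> video direction
    loss_v2a = F.cross_entropy(logits.t(), targets)        # video -> audio direction
    return 0.5 * (loss_a2v + loss_v2a)

audio = torch.randn(32, 256, requires_grad=True)   # outputs of an audio encoder
video = torch.randn(32, 256, requires_grad=True)   # outputs of a video encoder
bidirectional_contrastive_loss(audio, video).backward()
```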
Our dataset consists of three groups of samples, namely Must-Act (i.e.,\nMust-Classify), Must-Abstain, and Uncertain, based on the quantity and clarity\nof visual information in an image and further divided into eight categories.\nAll samples have a gold human perception label; even Uncertain (severely\nblurry) sample labels were obtained via crowd-sourcing. The validity of our\ndataset is verified by sampling theory, statistical theories related to survey\ndesign, and experts in the related fields. Using our dataset, we analyze the\nvisual alignment and reliability of five popular visual perception models and\nseven abstention methods. Our code and data is available at\n\\url{https://github.com/jiyounglee-0523/VisAlign}.\n","authors":["Jiyoung Lee","Seungho Kim","Seunghyun Won","Joonseok Lee","Marzyeh Ghassemi","James Thorne","Jaeseok Choi","O-Kil Kwon","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2308.01525v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00339v1","updated":"2023-09-01T08:47:52Z","published":"2023-09-01T08:47:52Z","title":"Robust Point Cloud Processing through Positional Embedding","summary":" End-to-end trained per-point embeddings are an essential ingredient of any\nstate-of-the-art 3D point cloud processing such as detection or alignment.\nMethods like PointNet, or the more recent point cloud transformer -- and its\nvariants -- all employ learned per-point embeddings. Despite impressive\nperformance, such approaches are sensitive to out-of-distribution (OOD) noise\nand outliers. In this paper, we explore the role of an analytical per-point\nembedding based on the criterion of bandwidth. The concept of bandwidth enables\nus to draw connections with an alternate per-point embedding -- positional\nembedding, particularly random Fourier features. We present compelling robust\nresults across downstream tasks such as point cloud classification and\nregistration with several categories of OOD noise.\n","authors":["Jianqiao Zheng","Xueqian Li","Sameera Ramasinghe","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2309.00339v1.pdf","comment":"18 pages, 13 figures, 5 tables"},{"id":"http://arxiv.org/abs/2309.00331v1","updated":"2023-09-01T08:35:24Z","published":"2023-09-01T08:35:24Z","title":"Human trajectory prediction using LSTM with Attention mechanism","summary":" In this paper, we propose a human trajectory prediction model that combines a\nLong Short-Term Memory (LSTM) network with an attention mechanism. To do that,\nwe use attention scores to determine which parts of the input data the model\nshould focus on when making predictions. Attention scores are calculated for\neach input feature, with a higher score indicating the greater significance of\nthat feature in predicting the output. Initially, these scores are determined\nfor the target human position, velocity, and their neighboring individual's\npositions and velocities. By using attention scores, our model can prioritize\nthe most relevant information in the input data and make more accurate\npredictions. We extract attention scores from our attention mechanism and\nintegrate them into the trajectory prediction module to predict human future\ntrajectories. To achieve this, we introduce a new neural layer that processes\nattention scores after extracting them and concatenates them with positional\ninformation. We evaluate our approach on the publicly available ETH and UCY\ndatasets and measure its performance using the final displacement error (FDE)\nand average displacement error (ADE) metrics. 
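The trajectory-prediction abstract above scores each input feature (the target's position and velocity plus neighbours' positions and velocities) with attention and concatenates the scores with positional information before the LSTM. A toy sketch of that idea appears below, with made-up feature dimensions and prediction horizon.

```python
import torch
import torch.nn as nn

class AttentionLSTMPredictor(nn.Module):
    """Toy trajectory predictor: per-feature attention scores plus positions fed to an LSTM."""
    def __init__(self, feat_dim=12, hidden=64, horizon=12):
        super().__init__()
        self.attn = nn.Linear(feat_dim, feat_dim)        # one score per input feature
        self.lstm = nn.LSTM(feat_dim + feat_dim + 2, hidden, batch_first=True)
        self.head = nn.Linear(hidden, horizon * 2)       # future (x, y) offsets
        self.horizon = horizon

    def forward(self, feats, positions):
        # feats: (B, T, feat_dim) positions/velocities of target and neighbours
        # positions: (B, T, 2) target (x, y) positions
        scores = torch.softmax(self.attn(feats), dim=-1)          # per-feature attention scores
        x = torch.cat([feats * scores, scores, positions], dim=-1)
        out, _ = self.lstm(x)
        return self.head(out[:, -1]).view(-1, self.horizon, 2)    # predicted future trajectory

model = AttentionLSTMPredictor()
feats = torch.randn(8, 8, 12)          # 8 pedestrians, 8 observed time steps
positions = torch.randn(8, 8, 2)
future = model(feats, positions)       # (8, 12, 2)
```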
We show that our modified\nalgorithm performs better than the Social LSTM in predicting the future\ntrajectory of pedestrians in crowded spaces. Specifically, our model achieves\nan improvement of 6.2% in ADE and 6.3% in FDE compared to the Social LSTM\nresults in the literature.\n","authors":["Amin Manafi Soltan Ahmadi","Samaneh Hoseini Semnani"],"pdf_url":"https://arxiv.org/pdf/2309.00331v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.08242v3","updated":"2023-09-01T08:12:30Z","published":"2023-08-16T09:16:05Z","title":"Contrastive Learning for Lane Detection via Cross-Similarity","summary":" Detecting road lanes is challenging due to intricate markings vulnerable to\nunfavorable conditions. Lane markings have strong shape priors, but their\nvisibility is easily compromised. Factors like lighting, weather, vehicles,\npedestrians, and aging colors challenge the detection. A large amount of data\nis required to train a lane detection approach that can withstand natural\nvariations caused by low visibility. This is because there are numerous lane\nshapes and natural variations that exist. Our solution, Contrastive Learning\nfor Lane Detection via cross-similarity (CLLD), is a self-supervised learning\nmethod that tackles this challenge by enhancing lane detection models\nresilience to real-world conditions that cause lane low visibility. CLLD is a\nnovel multitask contrastive learning that trains lane detection approaches to\ndetect lane markings even in low visible situations by integrating local\nfeature contrastive learning (CL) with our new proposed operation\ncross-similarity. Local feature CL focuses on extracting features for small\nimage parts, which is necessary to localize lane segments, while\ncross-similarity captures global features to detect obscured lane segments\nusing their surrounding. We enhance cross-similarity by randomly masking parts\nof input images for augmentation. Evaluated on benchmark datasets, CLLD\noutperforms state-of-the-art contrastive learning, especially in\nvisibility-impairing conditions like shadows. Compared to supervised learning,\nCLLD excels in scenarios like shadows and crowded scenes.\n","authors":["Ali Zoljodi","Sadegh Abadijou","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2308.08242v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.00442v2","updated":"2023-09-01T08:01:36Z","published":"2023-08-01T10:37:12Z","title":"FLatten Transformer: Vision Transformer using Focused Linear Attention","summary":" The quadratic computation complexity of self-attention has been a persistent\nchallenge when applying Transformer models to vision tasks. Linear attention,\non the other hand, offers a much more efficient alternative with its linear\ncomplexity by approximating the Softmax operation through carefully designed\nmapping functions. However, current linear attention approaches either suffer\nfrom significant performance degradation or introduce additional computation\noverhead from the mapping functions. In this paper, we propose a novel Focused\nLinear Attention module to achieve both high efficiency and expressiveness.\nSpecifically, we first analyze the factors contributing to the performance\ndegradation of linear attention from two perspectives: the focus ability and\nfeature diversity. 
To overcome these limitations, we introduce a simple yet\neffective mapping function and an efficient rank restoration module to enhance\nthe expressiveness of self-attention while maintaining low computation\ncomplexity. Extensive experiments show that our linear attention module is\napplicable to a variety of advanced vision Transformers, and achieves\nconsistently improved performances on multiple benchmarks. Code is available at\nhttps://github.com/LeapLabTHU/FLatten-Transformer.\n","authors":["Dongchen Han","Xuran Pan","Yizeng Han","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.00442v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.00314v1","updated":"2023-09-01T07:55:53Z","published":"2023-09-01T07:55:53Z","title":"ARFA: An Asymmetric Receptive Field Autoencoder Model for Spatiotemporal\n Prediction","summary":" Spatiotemporal prediction aims to generate future sequences by paradigms\nlearned from historical contexts. It holds significant importance in numerous\ndomains, including traffic flow prediction and weather forecasting. However,\nexisting methods face challenges in handling spatiotemporal correlations, as\nthey commonly adopt encoder and decoder architectures with identical receptive\nfields, which adversely affects prediction accuracy. This paper proposes an\nAsymmetric Receptive Field Autoencoder (ARFA) model to address this issue.\nSpecifically, we design corresponding sizes of receptive field modules tailored\nto the distinct functionalities of the encoder and decoder. In the encoder, we\nintroduce a large kernel module for global spatiotemporal feature extraction.\nIn the decoder, we develop a small kernel module for local spatiotemporal\ninformation reconstruction. To address the scarcity of meteorological\nprediction data, we constructed the RainBench, a large-scale radar echo dataset\nspecific to the unique precipitation characteristics of inland regions in China\nfor precipitation prediction. Experimental results demonstrate that ARFA\nachieves consistent state-of-the-art performance on two mainstream\nspatiotemporal prediction datasets and our RainBench dataset, affirming the\neffectiveness of our approach. This work not only explores a novel method from\nthe perspective of receptive fields but also provides data support for\nprecipitation prediction, thereby advancing future research in spatiotemporal\nprediction.\n","authors":["Wenxuan Zhang","Xuechao Zou","Li Wu","Jianqiang Huang","Xiaoying Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00314v1.pdf","comment":"0 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.00310v1","updated":"2023-09-01T07:52:08Z","published":"2023-09-01T07:52:08Z","title":"Fusing Monocular Images and Sparse IMU Signals for Real-time Human\n Motion Capture","summary":" Either RGB images or inertial signals have been used for the task of motion\ncapture (mocap), but combining them together is a new and interesting topic. We\nbelieve that the combination is complementary and able to solve the inherent\ndifficulties of using one modality input, including occlusions, extreme\nlighting/texture, and out-of-view for visual mocap and global drifts for\ninertial mocap. To this end, we propose a method that fuses monocular images\nand sparse IMUs for real-time human motion capture. Our method contains a dual\ncoordinate strategy to fully explore the IMU signals with different goals in\nmotion capture. 
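The FLatten Transformer abstract above replaces Softmax attention with a linear attention built from a mapping function. The sketch below shows generic linear attention with the common elu(x)+1 feature map; the paper's focused mapping function and rank-restoration module are not reproduced here.

```python
import torch
import torch.nn.functional as F

def linear_attention(q, k, v, eps=1e-6):
    """O(N * d^2) attention: softmax(Q K^T) V is replaced by phi(Q) (phi(K)^T V)."""
    phi_q = F.elu(q) + 1.0                    # simple positive feature map
    phi_k = F.elu(k) + 1.0
    kv = torch.einsum("bnd,bne->bde", phi_k, v)                             # key/value summary
    z = 1.0 / (torch.einsum("bnd,bd->bn", phi_q, phi_k.sum(dim=1)) + eps)   # row-wise normalizer
    return torch.einsum("bnd,bde,bn->bne", phi_q, kv, z)

q = torch.randn(2, 196, 64)   # e.g. 14x14 visual tokens, head dimension 64
k = torch.randn(2, 196, 64)
v = torch.randn(2, 196, 64)
out = linear_attention(q, k, v)   # (2, 196, 64); no 196x196 attention matrix is materialized
```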
To be specific, besides one branch transforming the IMU signals\nto the camera coordinate system to combine with the image information, there is\nanother branch to learn from the IMU signals in the body root coordinate system\nto better estimate body poses. Furthermore, a hidden state feedback mechanism\nis proposed for both two branches to compensate for their own drawbacks in\nextreme input cases. Thus our method can easily switch between the two kinds of\nsignals or combine them in different cases to achieve a robust mocap. %The two\ndivided parts can help each other for better mocap results under different\nconditions. Quantitative and qualitative results demonstrate that by delicately\ndesigning the fusion method, our technique significantly outperforms the\nstate-of-the-art vision, IMU, and combined methods on both global orientation\nand local pose estimation. Our codes are available for research at\nhttps://shaohua-pan.github.io/robustcap-page/.\n","authors":["Shaohua Pan","Qi Ma","Xinyu Yi","Weifeng Hu","Xiong Wang","Xingkang Zhou","Jijunnan Li","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2309.00310v1.pdf","comment":"Accepted by SIGGRAPH ASIA 2023. Project page:\n https://shaohua-pan.github.io/robustcap-page/"},{"id":"http://arxiv.org/abs/2309.00305v1","updated":"2023-09-01T07:29:44Z","published":"2023-09-01T07:29:44Z","title":"Efficient Surrogate Models for Materials Science Simulations: Machine\n Learning-based Prediction of Microstructure Properties","summary":" Determining, understanding, and predicting the so-called structure-property\nrelation is an important task in many scientific disciplines, such as\nchemistry, biology, meteorology, physics, engineering, and materials science.\nStructure refers to the spatial distribution of, e.g., substances, material, or\nmatter in general, while property is a resulting characteristic that usually\ndepends in a non-trivial way on spatial details of the structure.\nTraditionally, forward simulations models have been used for such tasks.\nRecently, several machine learning algorithms have been applied in these\nscientific fields to enhance and accelerate simulation models or as surrogate\nmodels. In this work, we develop and investigate the applications of six\nmachine learning techniques based on two different datasets from the domain of\nmaterials science: data from a two-dimensional Ising model for predicting the\nformation of magnetic domains and data representing the evolution of dual-phase\nmicrostructures from the Cahn-Hilliard model. We analyze the accuracy and\nrobustness of all models and elucidate the reasons for the differences in their\nperformances. The impact of including domain knowledge through tailored\nfeatures is studied, and general recommendations based on the availability and\nquality of training data are derived from this.\n","authors":["Binh Duong Nguyen","Pavlo Potapenko","Aytekin Dermici","Kishan Govinda","Stefan Sandfeld"],"pdf_url":"https://arxiv.org/pdf/2309.00305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02216v2","updated":"2023-09-01T07:26:08Z","published":"2023-04-05T04:07:54Z","title":"Industrial Anomaly Detection with Domain Shift: A Real-world Dataset and\n Masked Multi-scale Reconstruction","summary":" Industrial anomaly detection (IAD) is crucial for automating industrial\nquality inspection. The diversity of the datasets is the foundation for\ndeveloping comprehensive IAD algorithms. 
Existing IAD datasets focus on the\ndiversity of data categories, overlooking the diversity of domains within the\nsame data category. In this paper, to bridge this gap, we propose the\nAero-engine Blade Anomaly Detection (AeBAD) dataset, consisting of two\nsub-datasets: the single-blade dataset and the video anomaly detection dataset\nof blades. Compared to existing datasets, AeBAD has the following two\ncharacteristics: 1.) The target samples are not aligned and at different\nscales. 2.) There is a domain shift between the distribution of normal samples\nin the test set and the training set, where the domain shifts are mainly caused\nby the changes in illumination and view. Based on this dataset, we observe that\ncurrent state-of-the-art (SOTA) IAD methods exhibit limitations when the domain\nof normal samples in the test set undergoes a shift. To address this issue, we\npropose a novel method called masked multi-scale reconstruction (MMR), which\nenhances the model's capacity to deduce causality among patches in normal\nsamples by a masked reconstruction task. MMR achieves superior performance\ncompared to SOTA methods on the AeBAD dataset. Furthermore, MMR achieves\ncompetitive performance with SOTA methods to detect the anomalies of different\ntypes on the MVTec AD dataset. Code and dataset are available at\nhttps://github.com/zhangzilongc/MMR.\n","authors":["Zilong Zhang","Zhibin Zhao","Xingwu Zhang","Chuang Sun","Xuefeng Chen"],"pdf_url":"https://arxiv.org/pdf/2304.02216v2.pdf","comment":"Accept by Computers in Industry"},{"id":"http://arxiv.org/abs/2302.05629v2","updated":"2023-09-01T07:09:55Z","published":"2023-02-11T08:58:55Z","title":"Improving Differentiable Architecture Search via Self-Distillation","summary":" Differentiable Architecture Search (DARTS) is a simple yet efficient Neural\nArchitecture Search (NAS) method. During the search stage, DARTS trains a\nsupernet by jointly optimizing architecture parameters and network parameters.\nDuring the evaluation stage, DARTS discretizes the supernet to derive the\noptimal architecture based on architecture parameters. However, recent research\nhas shown that during the training process, the supernet tends to converge\ntowards sharp minima rather than flat minima. This is evidenced by the higher\nsharpness of the loss landscape of the supernet, which ultimately leads to a\nperformance gap between the supernet and the optimal architecture. In this\npaper, we propose Self-Distillation Differentiable Neural Architecture Search\n(SD-DARTS) to alleviate the discretization gap. We utilize self-distillation to\ndistill knowledge from previous steps of the supernet to guide its training in\nthe current step, effectively reducing the sharpness of the supernet's loss and\nbridging the performance gap between the supernet and the optimal architecture.\nFurthermore, we introduce the concept of voting teachers, where multiple\nprevious supernets are selected as teachers, and their output probabilities are\naggregated through voting to obtain the final teacher prediction. 
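The SD-DARTS abstract above aggregates the output probabilities of several previous supernets by voting to form the teacher for self-distillation. A small sketch follows, using probability averaging as the voting rule (an assumption) and a temperature-scaled KL loss for the distillation step.

```python
import torch
import torch.nn.functional as F

def voting_teacher_distillation(student_logits, teacher_logits_list, temperature=2.0):
    """Average the softened probabilities of earlier snapshots and distill them into the student."""
    with torch.no_grad():
        teacher_probs = torch.stack(
            [F.softmax(t / temperature, dim=1) for t in teacher_logits_list]
        ).mean(dim=0)                                     # "voted" teacher prediction
    log_student = F.log_softmax(student_logits / temperature, dim=1)
    return F.kl_div(log_student, teacher_probs, reduction="batchmean") * temperature ** 2

student = torch.randn(16, 10, requires_grad=True)         # current supernet outputs
teachers = [torch.randn(16, 10) for _ in range(3)]        # snapshots from previous training steps
voting_teacher_distillation(student, teachers).backward()
```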
Experimental\nresults on real datasets demonstrate the advantages of our novel\nself-distillation-based NAS method compared to state-of-the-art alternatives.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2302.05629v2.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2303.08440v2","updated":"2023-09-01T07:04:30Z","published":"2023-03-15T08:28:06Z","title":"Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models","summary":" Diffusion models have become a popular approach for image generation and\nreconstruction due to their numerous advantages. However, most diffusion-based\ninverse problem-solving methods only deal with 2D images, and even recently\npublished 3D methods do not fully exploit the 3D distribution prior. To address\nthis, we propose a novel approach using two perpendicular pre-trained 2D\ndiffusion models to solve the 3D inverse problem. By modeling the 3D data\ndistribution as a product of 2D distributions sliced in different directions,\nour method effectively addresses the curse of dimensionality. Our experimental\nresults demonstrate that our method is highly effective for 3D medical image\nreconstruction tasks, including MRI Z-axis super-resolution, compressed sensing\nMRI, and sparse-view CT. Our method can generate high-quality voxel volumes\nsuitable for medical applications.\n","authors":["Suhyeon Lee","Hyungjin Chung","Minyoung Park","Jonghyuk Park","Wi-Sun Ryu","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2303.08440v2.pdf","comment":"ICCV23 poster. 15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.00297v1","updated":"2023-09-01T07:03:27Z","published":"2023-09-01T07:03:27Z","title":"Fine-Grained Spatiotemporal Motion Alignment for Contrastive Video\n Representation Learning","summary":" As the most essential property in a video, motion information is critical to\na robust and generalized video representation. To inject motion dynamics,\nrecent works have adopted frame difference as the source of motion information\nin video contrastive learning, considering the trade-off between quality and\ncost. However, existing works align motion features at the instance level,\nwhich suffers from spatial and temporal weak alignment across modalities. In\nthis paper, we present a \\textbf{Fi}ne-grained \\textbf{M}otion\n\\textbf{A}lignment (FIMA) framework, capable of introducing well-aligned and\nsignificant motion information. Specifically, we first develop a dense\ncontrastive learning framework in the spatiotemporal domain to generate\npixel-level motion supervision. Then, we design a motion decoder and a\nforeground sampling strategy to eliminate the weak alignments in terms of time\nand space. Moreover, a frame-level motion contrastive loss is presented to\nimprove the temporal diversity of the motion features. Extensive experiments\ndemonstrate that the representations learned by FIMA possess great\nmotion-awareness capabilities and achieve state-of-the-art or competitive\nresults on downstream tasks across UCF101, HMDB51, and Diving48 datasets. 
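The 3D-imaging abstract above models the 3D data distribution as a product of 2D distributions sliced along perpendicular directions, which amounts to combining slice-wise scores from two 2D models (log-densities of a product add). The schematic sketch below uses dummy convolutional stand-ins for the pre-trained 2D diffusion models and only illustrates the slice-wise combination, not the full sampler.

```python
import torch
import torch.nn as nn

def combined_3d_score(volume, score_xy, score_xz):
    """volume: (D, H, W). Apply 2D score models slice-wise along two perpendicular axes and sum."""
    # Axial slices: treat each depth index as an independent (H, W) image.
    s_xy = score_xy(volume.unsqueeze(1)).squeeze(1)                     # (D, H, W)
    # Perpendicular slices: permute so the second axis becomes the slicing axis.
    vol_xz = volume.permute(1, 0, 2)                                    # (H, D, W)
    s_xz = score_xz(vol_xz.unsqueeze(1)).squeeze(1).permute(1, 0, 2)    # back to (D, H, W)
    return s_xy + s_xz

# Stand-in "score networks"; real usage would plug in the pre-trained 2D diffusion models.
score_xy = nn.Conv2d(1, 1, 3, padding=1)
score_xz = nn.Conv2d(1, 1, 3, padding=1)
volume = torch.randn(32, 64, 64)
score = combined_3d_score(volume, score_xy, score_xz)   # (32, 64, 64)
```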
Code\nis available at \\url{https://github.com/ZMHH-H/FIMA}.\n","authors":["Minghao Zhu","Xiao Lin","Ronghao Dang","Chengju Liu","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.00297v1.pdf","comment":"ACM MM 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2309.00287v1","updated":"2023-09-01T06:47:13Z","published":"2023-09-01T06:47:13Z","title":"Fast Diffusion EM: a diffusion model for blind inverse problems with\n application to deconvolution","summary":" Using diffusion models to solve inverse problems is a growing field of\nresearch. Current methods assume the degradation to be known and provide\nimpressive results in terms of restoration quality and diversity. In this work,\nwe leverage the efficiency of those models to jointly estimate the restored\nimage and unknown parameters of the degradation model. In particular, we\ndesigned an algorithm based on the well-known Expectation-Minimization (EM)\nestimation method and diffusion models. Our method alternates between\napproximating the expected log-likelihood of the inverse problem using samples\ndrawn from a diffusion model and a maximization step to estimate unknown model\nparameters. For the maximization step, we also introduce a novel blur kernel\nregularization based on a Plug \\& Play denoiser. Diffusion models are long to\nrun, thus we provide a fast version of our algorithm. Extensive experiments on\nblind image deblurring demonstrate the effectiveness of our method when\ncompared to other state-of-the-art approaches.\n","authors":["Charles Laroche","Andrés Almansa","Eva Coupete"],"pdf_url":"https://arxiv.org/pdf/2309.00287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13970v2","updated":"2023-09-01T06:42:53Z","published":"2023-08-26T22:54:45Z","title":"FAM: fast adaptive federated meta-learning","summary":" In this work, we propose a fast adaptive federated meta-learning (FAM)\nframework for collaboratively learning a single global model, which can then be\npersonalized locally on individual clients. Federated learning enables multiple\nclients to collaborate to train a model without sharing data. Clients with\ninsufficient data or data diversity participate in federated learning to learn\na model with superior performance. Nonetheless, learning suffers when data\ndistributions diverge. There is a need to learn a global model that can be\nadapted using client's specific information to create personalized models on\nclients is required. MRI data suffers from this problem, wherein, one, due to\ndata acquisition challenges, local data at a site is sufficient for training an\naccurate model and two, there is a restriction of data sharing due to privacy\nconcerns and three, there is a need for personalization of a learnt shared\nglobal model on account of domain shift across client sites. The global model\nis sparse and captures the common features in the MRI. This skeleton network is\ngrown on each client to train a personalized model by learning additional\nclient-specific parameters from local data. Experimental results show that the\npersonalization process at each client quickly converges using a limited number\nof epochs. The personalized client models outperformed the locally trained\nmodels, demonstrating the efficacy of the FAM mechanism. 
Additionally, the\nsparse parameter set to be communicated during federated learning drastically\nreduced communication overhead, which makes the scheme viable for networks with\nlimited resources.\n","authors":["Indrajeet Kumar Sinha","Shekhar Verma","Krishna Pratap Singh"],"pdf_url":"https://arxiv.org/pdf/2308.13970v2.pdf","comment":"13 Pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.00277v1","updated":"2023-09-01T06:21:02Z","published":"2023-09-01T06:21:02Z","title":"SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse\n Satellite Images","summary":" Digital surface model generation using traditional multi-view stereo matching\n(MVS) performs poorly over non-Lambertian surfaces, with asynchronous\nacquisitions, or at discontinuities. Neural radiance fields (NeRF) offer a new\nparadigm for reconstructing surface geometries using a continuous volumetric\nrepresentation. NeRF is self-supervised, does not require ground truth geometry\nfor training, and provides an elegant way to include in its representation\nphysical parameters about the scene, thus potentially remedying the challenging\nscenarios where MVS fails. However, NeRF and its variants require many views to\nproduce convincing scene geometries, which are rarely available in earth\nobservation satellite imaging. In this paper, we present SparseSat-NeRF (SpS-NeRF) - an\nextension of Sat-NeRF adapted to sparse satellite views. SpS-NeRF employs dense\ndepth supervision guided by a cross-correlation similarity metric provided by\ntraditional semi-global MVS matching. We demonstrate the effectiveness of our\napproach on stereo and tri-stereo Pleiades 1B/WorldView-3 images, and compare\nagainst NeRF and Sat-NeRF. The code is available at\nhttps://github.com/LulinZhang/SpS-NeRF\n","authors":["Lulin Zhang","Ewelina Rupnik"],"pdf_url":"https://arxiv.org/pdf/2309.00277v1.pdf","comment":"ISPRS Annals 2023"},{"id":"http://arxiv.org/abs/2308.16777v2","updated":"2023-09-01T05:57:47Z","published":"2023-08-31T14:55:30Z","title":"Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models","summary":" Zero-shot referring image segmentation is a challenging task because it aims\nto find an instance segmentation mask based on the given referring\ndescriptions, without training on this type of paired data. Current zero-shot\nmethods mainly focus on using pre-trained discriminative models (e.g., CLIP).\nHowever, we have observed that generative models (e.g., Stable Diffusion) have\npotentially understood the relationships between various visual elements and\ntext descriptions, which are rarely investigated in this task. In this work, we\nintroduce a novel Referring Diffusional segmentor (Ref-Diff) for this task,\nwhich leverages the fine-grained multi-modal information from generative\nmodels. We demonstrate that without a proposal generator, a generative model\nalone can achieve comparable performance to existing SOTA weakly-supervised\nmodels. When we combine both generative and discriminative models, our Ref-Diff\noutperforms these competing methods by a significant margin. This indicates\nthat generative models are also beneficial for this task and can complement\ndiscriminative models for better referring segmentation. 
Our code is publicly\navailable at https://github.com/kodenii/Ref-Diff.\n","authors":["Minheng Ni","Yabo Zhang","Kailai Feng","Xiaoming Li","Yiwen Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.16777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00265v1","updated":"2023-09-01T05:50:47Z","published":"2023-09-01T05:50:47Z","title":"Application of Machine Learning in Melanoma Detection and the\n Identification of 'Ugly Duckling' and Suspicious Naevi: A Review","summary":" Skin lesions known as naevi exhibit diverse characteristics such as size,\nshape, and colouration. The concept of an \"Ugly Duckling Naevus\" comes into\nplay when monitoring for melanoma, referring to a lesion with distinctive\nfeatures that sets it apart from other lesions in the vicinity. As lesions\nwithin the same individual typically share similarities and follow a\npredictable pattern, an ugly duckling naevus stands out as unusual and may\nindicate the presence of a cancerous melanoma. Computer-aided diagnosis (CAD)\nhas become a significant player in the research and development field, as it\ncombines machine learning techniques with a variety of patient analysis\nmethods. Its aim is to increase accuracy and simplify decision-making, all\nwhile responding to the shortage of specialized professionals. These automated\nsystems are especially important in skin cancer diagnosis where specialist\navailability is limited. As a result, their use could lead to life-saving\nbenefits and cost reductions within healthcare. Given the drastic change in\nsurvival when comparing early stage to late-stage melanoma, early detection is\nvital for effective treatment and patient outcomes. Machine learning (ML) and\ndeep learning (DL) techniques have gained popularity in skin cancer\nclassification, effectively addressing challenges, and providing results\nequivalent to that of specialists. This article extensively covers modern\nMachine Learning and Deep Learning algorithms for detecting melanoma and\nsuspicious naevi. It begins with general information on skin cancer and\ndifferent types of naevi, then introduces AI, ML, DL, and CAD. The article then\ndiscusses the successful applications of various ML techniques like\nconvolutional neural networks (CNN) for melanoma detection compared to\ndermatologists' performance. Lastly, it examines ML methods for UD naevus\ndetection and identifying suspicious naevi.\n","authors":["Fatima Al Zegair","Nathasha Naranpanawa","Brigid Betz-Stablein","Monika Janda","H. Peter Soyer","Shekhar S. Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.00265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10079v2","updated":"2023-09-01T05:28:31Z","published":"2023-08-19T17:59:12Z","title":"MeDM: Mediating Image Diffusion Models for Video-to-Video Translation\n with Temporal Correspondence Guidance","summary":" This study introduces an efficient and effective method, MeDM, that utilizes\npre-trained image Diffusion Models for video-to-video translation with\nconsistent temporal flow. The proposed framework can render videos from scene\nposition information, such as a normal G-buffer, or perform text-guided editing\non videos captured in real-world scenarios. We employ explicit optical flows to\nconstruct a practical coding that enforces physical constraints on generated\nframes and mediates independent frame-wise scores. By leveraging this coding,\nmaintaining temporal consistency in the generated videos can be framed as an\noptimization problem with a closed-form solution. 
To ensure compatibility with\nStable Diffusion, we also suggest a workaround for modifying observed-space\nscores in latent-space Diffusion Models. Notably, MeDM does not require\nfine-tuning or test-time optimization of the Diffusion Models. Through\nextensive qualitative, quantitative, and subjective experiments on various\nbenchmarks, the study demonstrates the effectiveness and superiority of the\nproposed approach. Project page can be found at https://medm2023.github.io\n","authors":["Ernie Chu","Tzuhsuan Huang","Shuo-Yen Lin","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10079v2.pdf","comment":"Project page: https://medm2023.github.io"},{"id":"http://arxiv.org/abs/2211.13854v2","updated":"2023-09-01T05:07:18Z","published":"2022-11-25T01:37:48Z","title":"ComCLIP: Training-Free Compositional Image and Text Matching","summary":" Contrastive Language-Image Pretraining (CLIP) has demonstrated great\nzero-shot performance for matching images and text. However, it is still\nchallenging to adapt vision-language pretrained models like CLIP to\ncompositional image and text matching -- a more challenging image and text\nmatching task requiring the model to understand compositional word concepts\nand visual components. Towards better compositional generalization in zero-shot\nimage and text matching, in this paper, we study the problem from a causal\nperspective: the erroneous semantics of individual entities are essentially\nconfounders that cause the matching failure. Therefore, we propose a novel\n\\textbf{\\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP\ndisentangles input images into subjects, objects, and action sub-images and\ncomposes CLIP's vision encoder and text encoder to perform evolving matching\nover compositional text embedding and sub-image embeddings. In this way,\nComCLIP can mitigate spurious correlations introduced by the pretrained CLIP\nmodels and dynamically evaluate the importance of each component. Experiments\non four compositional image-text matching datasets: SVO, ComVG, Winoground, and\nVL-checklist, and two general image-text retrieval datasets: Flickr30K, and\nMSCOCO demonstrate the effectiveness of our plug-and-play method, which boosts\nthe \\textbf{\\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even\nwithout further training or fine-tuning.\n","authors":["Kenan Jiang","Xuehai He","Ruize Xu","Xin Eric Wang"],"pdf_url":"https://arxiv.org/pdf/2211.13854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00252v1","updated":"2023-09-01T05:01:52Z","published":"2023-09-01T05:01:52Z","title":"Interpretable Medical Imagery Diagnosis with Self-Attentive\n Transformers: A Review of Explainable AI for Health Care","summary":" Recent advancements in artificial intelligence (AI) have facilitated its\nwidespread adoption in primary medical services, addressing the demand-supply\nimbalance in healthcare. Vision Transformers (ViT) have emerged as\nstate-of-the-art computer vision models, benefiting from self-attention\nmodules. However, compared to traditional machine-learning approaches,\ndeep-learning models are complex and are often treated as a \"black box\" that\ncan cause uncertainty regarding how they operate. Explainable Artificial\nIntelligence (XAI) refers to methods that explain and interpret machine\nlearning models' inner workings and how they come to decisions, which is\nespecially important in the medical domain to guide the healthcare\ndecision-making process. 
This review summarises recent ViT advancements and\ninterpretative approaches to understanding the decision-making process of ViT,\nenabling transparency in medical diagnosis applications.\n","authors":["Tin Lai"],"pdf_url":"https://arxiv.org/pdf/2309.00252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03998v3","updated":"2023-09-01T04:52:01Z","published":"2023-08-08T02:28:48Z","title":"Real-time Strawberry Detection Based on Improved YOLOv5s Architecture\n for Robotic Harvesting in open-field environment","summary":" This study proposed a YOLOv5-based custom object detection model to detect\nstrawberries in an outdoor environment. The original architecture of the\nYOLOv5s was modified by replacing the C3 module with the C2f module in the\nbackbone network, which provided a better feature gradient flow. Secondly, the\nSpatial Pyramid Pooling Fast in the final layer of the backbone network of\nYOLOv5s was combined with Cross Stage Partial Net to improve the generalization\nability over the strawberry dataset in this study. The proposed architecture\nwas named YOLOv5s-Straw. The RGB image dataset of the strawberry canopy with\nthree maturity classes (immature, nearly mature, and mature) was collected in an\nopen-field environment and augmented through a series of operations including\nbrightness reduction, brightness increase, and noise adding. To verify the\nsuperiority of the proposed method for strawberry detection in an open-field\nenvironment, four competitive detection models (YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s) were trained and tested under the same computational\nenvironment and compared with YOLOv5s-Straw. The results showed that the\nhighest mean average precision of 80.3% was achieved using the proposed\narchitecture, whereas YOLOv3-tiny, YOLOv5s,\nYOLOv5s-C2f, and YOLOv8s achieved 73.4%, 77.8%, 79.8%, and 79.3%, respectively.\nSpecifically, the average precision of YOLOv5s-Straw was 82.1% in the immature\nclass, 73.5% in the nearly mature class, and 86.6% in the mature class, which\nwere 2.3% and 3.7% higher, respectively, than those of the latest YOLOv8s. The\nmodel included 8.6*10^6 network parameters with an inference speed of 18ms per\nimage, while YOLOv8s had a slower inference speed of\n21.0ms and heavier parameters of 11.1*10^6, which indicates that the proposed\nmodel is fast enough for real-time strawberry detection and localization for\nrobotic picking.\n","authors":["Zixuan He","Salik Ram Khanal","Xin Zhang","Manoj Karkee","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03998v3.pdf","comment":"20 pages; 15 figures"},{"id":"http://arxiv.org/abs/2308.12538v2","updated":"2023-09-01T04:51:13Z","published":"2023-08-24T03:50:37Z","title":"Mutual-Guided Dynamic Network for Image Fusion","summary":" Image fusion aims to generate a high-quality image from multiple images\ncaptured under varying conditions. The key problem of this task is to preserve\ncomplementary information while filtering out irrelevant information for the\nfused result. However, existing methods address this problem by leveraging\nstatic convolutional neural networks (CNNs), suffering from two inherent limitations\nduring feature extraction, i.e., being unable to handle spatial-variant\ncontents and lacking guidance from multiple inputs. 
In this paper, we propose a\nnovel mutual-guided dynamic network (MGDN) for image fusion, which allows for\neffective information utilization across different locations and inputs.\nSpecifically, we design a mutual-guided dynamic filter (MGDF) for adaptive\nfeature extraction, composed of a mutual-guided cross-attention (MGCA) module\nand a dynamic filter predictor, where the former incorporates additional\nguidance from different inputs and the latter generates spatial-variant kernels\nfor different locations. In addition, we introduce a parallel feature fusion\n(PFF) module to effectively fuse local and global information of the extracted\nfeatures. To further reduce the redundancy among the extracted features while\nsimultaneously preserving their shared structural information, we devise a\nnovel loss function that combines the minimization of normalized mutual\ninformation (NMI) with an estimated gradient mask. Experimental results on five\nbenchmark datasets demonstrate that our proposed method outperforms existing\nmethods on four image fusion tasks. The code and model are publicly available\nat: https://github.com/Guanys-dar/MGDN.\n","authors":["Yuanshen Guan","Ruikang Xu","Mingde Yao","Lizhi Wang","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.12538v2.pdf","comment":"ACMMM 2023 accepted"},{"id":"http://arxiv.org/abs/2309.00248v1","updated":"2023-09-01T04:42:03Z","published":"2023-09-01T04:42:03Z","title":"DiffuGen: Adaptable Approach for Generating Labeled Image Datasets using\n Stable Diffusion Models","summary":" Generating high-quality labeled image datasets is crucial for training\naccurate and robust machine learning models in the field of computer vision.\nHowever, the process of manually labeling real images is often time-consuming\nand costly. To address these challenges associated with dataset generation, we\nintroduce \"DiffuGen,\" a simple and adaptable approach that harnesses the power\nof stable diffusion models to create labeled image datasets efficiently. By\nleveraging stable diffusion models, our approach not only ensures the quality\nof generated datasets but also provides a versatile solution for label\ngeneration. In this paper, we present the methodology behind DiffuGen, which\ncombines the capabilities of diffusion models with two distinct labeling\ntechniques: unsupervised and supervised. Distinctively, DiffuGen employs prompt\ntemplating for adaptable image generation and textual inversion to enhance\ndiffusion model capabilities.\n","authors":["Michael Shenoda","Edward Kim"],"pdf_url":"https://arxiv.org/pdf/2309.00248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1908.11314v5","updated":"2023-09-01T04:37:22Z","published":"2019-08-29T15:54:06Z","title":"Variational Denoising Network: Toward Blind Noise Modeling and Removal","summary":" Blind image denoising is an important yet very challenging problem in\ncomputer vision due to the complicated acquisition process of real images. In\nthis work we propose a new variational inference method, which integrates both\nnoise estimation and image denoising into a unique Bayesian framework, for\nblind image denoising. Specifically, an approximate posterior, parameterized by\ndeep neural networks, is presented by taking the intrinsic clean image and\nnoise variances as latent variables conditioned on the input noisy image. 
This\nposterior provides explicit parametric forms for all its involved\nhyper-parameters, and thus can be easily implemented for blind image denoising\nwith automatic noise estimation for the test noisy image. On one hand, as other\ndata-driven deep learning methods, our method, namely variational denoising\nnetwork (VDN), can perform denoising efficiently due to its explicit form of\nposterior expression. On the other hand, VDN inherits the advantages of\ntraditional model-driven approaches, especially the good generalization\ncapability of generative models. VDN has good interpretability and can be\nflexibly utilized to estimate and remove complicated non-i.i.d. noise collected\nin real scenarios. Comprehensive experiments are performed to substantiate the\nsuperiority of our method in blind image denoising.\n","authors":["Zongsheng Yue","Hongwei Yong","Qian Zhao","Lei Zhang","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/1908.11314v5.pdf","comment":"Correct a minor typo"},{"id":"http://arxiv.org/abs/2304.02013v2","updated":"2023-09-01T04:20:25Z","published":"2023-04-04T17:59:22Z","title":"NPC: Neural Point Characters from Video","summary":" High-fidelity human 3D models can now be learned directly from videos,\ntypically by combining a template-based surface model with neural\nrepresentations. However, obtaining a template surface requires expensive\nmulti-view capture systems, laser scans, or strictly controlled conditions.\nPrevious methods avoid using a template but rely on a costly or ill-posed\nmapping from observation to canonical space. We propose a hybrid point-based\nrepresentation for reconstructing animatable characters that does not require\nan explicit surface model, while being generalizable to novel poses. For a\ngiven video, our method automatically produces an explicit set of 3D points\nrepresenting approximate canonical geometry, and learns an articulated\ndeformation model that produces pose-dependent point transformations. The\npoints serve both as a scaffold for high-frequency neural features and an\nanchor for efficiently mapping between observation and canonical space. We\ndemonstrate on established benchmarks that our representation overcomes\nlimitations of prior work operating in either canonical or in observation\nspace. Moreover, our automatic point extraction approach enables learning\nmodels of human and animal characters alike, matching the performance of the\nmethods using rigged surface templates despite being more general. Project\nwebsite: https://lemonatsu.github.io/npc/\n","authors":["Shih-Yang Su","Timur Bagautdinov","Helge Rhodin"],"pdf_url":"https://arxiv.org/pdf/2304.02013v2.pdf","comment":"Project website: https://lemonatsu.github.io/npc/"},{"id":"http://arxiv.org/abs/2308.14221v3","updated":"2023-09-01T04:16:20Z","published":"2023-08-27T22:45:24Z","title":"High-Resolution Document Shadow Removal via A Large-Scale Real-World\n Dataset and A Frequency-Aware Shadow Erasing Net","summary":" Shadows often occur when we capture the documents with casual equipment,\nwhich influences the visual quality and readability of the digital copies.\nDifferent from the algorithms for natural shadow removal, the algorithms in\ndocument shadow removal need to preserve the details of fonts and figures in\nhigh-resolution input. Previous works ignore this problem and remove the\nshadows via approximate attention and small datasets, which might not work in\nreal-world situations. 
We handle high-resolution document shadow removal\ndirectly via a larger-scale real-world dataset and a carefully designed\nfrequency-aware network. As for the dataset, we acquire over 7k couples of\nhigh-resolution (2462 x 3699) images of real-world document pairs with various\nsamples under different lighting circumstances, which is 10 times larger than\nexisting datasets. As for the design of the network, we decouple the\nhigh-resolution images in the frequency domain, where the low-frequency details\nand high-frequency boundaries can be effectively learned via the carefully\ndesigned network structure. Powered by our network and dataset, the proposed\nmethod clearly shows a better performance than previous methods in terms of\nvisual quality and numerical results. The code, models, and dataset are\navailable at: https://github.com/CXH-Research/DocShadow-SD7K\n","authors":["Zinuo Li","Xuhang Chen","Chi-Man Pun","Xiaodong Cun"],"pdf_url":"https://arxiv.org/pdf/2308.14221v3.pdf","comment":"Accepted by International Conference on Computer Vision 2023 (ICCV\n 2023)"},{"id":"http://arxiv.org/abs/2304.05961v2","updated":"2023-09-01T04:09:37Z","published":"2023-04-12T16:32:34Z","title":"SpectralDiff: A Generative Framework for Hyperspectral Image\n Classification with Diffusion Models","summary":" Hyperspectral Image (HSI) classification is an important issue in remote\nsensing field with extensive applications in earth science. In recent years, a\nlarge number of deep learning-based HSI classification methods have been\nproposed. However, existing methods have limited ability to handle\nhigh-dimensional, highly redundant, and complex data, making it challenging to\ncapture the spectral-spatial distributions of data and relationships between\nsamples. To address this issue, we propose a generative framework for HSI\nclassification with diffusion models (SpectralDiff) that effectively mines the\ndistribution information of high-dimensional and highly redundant data by\niteratively denoising and explicitly constructing the data generation process,\nthus better reflecting the relationships between samples. The framework\nconsists of a spectral-spatial diffusion module, and an attention-based\nclassification module. The spectral-spatial diffusion module adopts forward and\nreverse spectral-spatial diffusion processes to achieve adaptive construction\nof sample relationships without requiring prior knowledge of graphical\nstructure or neighborhood information. It captures spectral-spatial\ndistribution and contextual information of objects in HSI and mines\nunsupervised spectral-spatial diffusion features within the reverse diffusion\nprocess. Finally, these features are fed into the attention-based\nclassification module for per-pixel classification. The diffusion features can\nfacilitate cross-sample perception via reconstruction distribution, leading to\nimproved classification performance. Experiments on three public HSI datasets\ndemonstrate that the proposed method can achieve better performance than\nstate-of-the-art methods. 
For the sake of reproducibility, the source code of\nSpectralDiff will be publicly available at\nhttps://github.com/chenning0115/SpectralDiff.\n","authors":["Ning Chen","Jun Yue","Leyuan Fang","Shaobo Xia"],"pdf_url":"https://arxiv.org/pdf/2304.05961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16258v2","updated":"2023-09-01T03:50:47Z","published":"2023-08-30T18:31:51Z","title":"Robust Principles: Architectural Design Principles for Adversarially\n Robust CNNs","summary":" Our research aims to unify existing works' diverging opinions on how\narchitectural components affect the adversarial robustness of CNNs. To\naccomplish our goal, we synthesize a suite of three generalizable robust\narchitectural design principles: (a) optimal range for depth and width\nconfigurations, (b) preferring convolutional over patchify stem stage, and (c)\nrobust residual block design through adopting squeeze and excitation blocks and\nnon-parametric smooth activation functions. Through extensive experiments\nacross a wide spectrum of dataset scales, adversarial training methods, model\nparameters, and network design spaces, our principles consistently and markedly\nimprove AutoAttack accuracy: 1-3 percentage points (pp) on CIFAR-10 and\nCIFAR-100, and 4-9 pp on ImageNet. The code is publicly available at\nhttps://github.com/poloclub/robust-principles.\n","authors":["ShengYun Peng","Weilin Xu","Cory Cornelius","Matthew Hull","Kevin Li","Rahul Duggal","Mansi Phute","Jason Martin","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2308.16258v2.pdf","comment":"Published at BMVC'23"},{"id":"http://arxiv.org/abs/2309.00233v1","updated":"2023-09-01T03:34:12Z","published":"2023-09-01T03:34:12Z","title":"Object-Centric Multiple Object Tracking","summary":" Unsupervised object-centric learning methods allow the partitioning of scenes\ninto entities without additional localization information and are excellent\ncandidates for reducing the annotation burden of multiple-object tracking (MOT)\npipelines. Unfortunately, they lack two key properties: objects are often split\ninto parts and are not consistently tracked over time. In fact,\nstate-of-the-art models achieve pixel-level accuracy and temporal consistency\nby relying on supervised object detection with additional ID labels for the\nassociation through time. This paper proposes a video object-centric model for\nMOT. It consists of an index-merge module that adapts the object-centric slots\ninto detection outputs and an object memory module that builds complete object\nprototypes to handle occlusions. Benefiting from object-centric learning, we\nonly require sparse detection labels (0%-6.25%) for object localization and\nfeature binding. Relying on our self-supervised\nExpectation-Maximization-inspired loss for object association, our approach\nrequires no ID labels. 
Our experiments significantly narrow the gap between the\nexisting object-centric model and the fully supervised state-of-the-art and\noutperform several unsupervised trackers.\n","authors":["Zixu Zhao","Jiaze Wang","Max Horn","Yizhuo Ding","Tong He","Zechen Bai","Dominik Zietlow","Carl-Johann Simon-Gabriel","Bing Shuai","Zhuowen Tu","Thomas Brox","Bernt Schiele","Yanwei Fu","Francesco Locatello","Zheng Zhang","Tianjun Xiao"],"pdf_url":"https://arxiv.org/pdf/2309.00233v1.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2306.08075v2","updated":"2023-09-01T03:30:19Z","published":"2023-06-13T18:45:32Z","title":"BPKD: Boundary Privileged Knowledge Distillation For Semantic\n Segmentation","summary":" Current knowledge distillation approaches in semantic segmentation tend to\nadopt a holistic approach that treats all spatial locations equally. However,\nfor dense prediction, students' predictions on edge regions are highly\nuncertain due to contextual information leakage, requiring higher spatial\nsensitivity knowledge than the body regions. To address this challenge, this\npaper proposes a novel approach called boundary-privileged knowledge\ndistillation (BPKD). BPKD distills the knowledge of the teacher model's body\nand edges separately to the compact student model. Specifically, we employ two\ndistinct loss functions: (i) edge loss, which aims to distinguish between\nambiguous classes at the pixel level in edge regions; (ii) body loss, which\nutilizes shape constraints and selectively attends to the inner-semantic\nregions. Our experiments demonstrate that the proposed BPKD method provides\nextensive refinements and aggregation for edge and body regions. Additionally,\nthe method achieves state-of-the-art distillation performance for semantic\nsegmentation on three popular benchmark datasets, highlighting its\neffectiveness and generalization ability. BPKD shows consistent improvements\nacross a diverse array of lightweight segmentation structures, including both\nCNNs and transformers, underscoring its architecture-agnostic adaptability. The\ncode is available at \\url{https://github.com/AkideLiu/BPKD}.\n","authors":["Liyang Liu","Zihan Wang","Minh Hieu Phan","Bowen Zhang","Jinchao Ge","Yifan Liu"],"pdf_url":"https://arxiv.org/pdf/2306.08075v2.pdf","comment":"17 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.16477v2","updated":"2023-09-01T03:14:03Z","published":"2023-08-31T05:43:46Z","title":"PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction","summary":" Vectorized high-definition map online construction has garnered considerable\nattention in the field of autonomous driving research. Most existing approaches\nmodel changeable map elements using a fixed number of points, or predict local\nmaps in a two-stage autoregressive manner, which may miss essential details and\nlead to error accumulation. Towards precise map element learning, we propose a\nsimple yet effective architecture named PivotNet, which adopts unified\npivot-based map representations and is formulated as a direct set prediction\nparadigm. Concretely, we first propose a novel point-to-line mask module to\nencode both the subordinate and geometrical point-line priors in the network.\nThen, a well-designed pivot dynamic matching module is proposed to model the\ntopology in dynamic point sequences by introducing the concept of sequence\nmatching. 
Furthermore, to supervise the position and topology of the vectorized\npoint predictions, we propose a dynamic vectorized sequence loss. Extensive\nexperiments and ablations show that PivotNet is remarkably superior to other\nSOTAs by 5.9 mAP at least. The code will be available soon.\n","authors":["Wenjie Ding","Limeng Qiao","Xi Qiu","Chi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16477v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2309.00227v1","updated":"2023-09-01T03:03:50Z","published":"2023-09-01T03:03:50Z","title":"What Makes Good Open-Vocabulary Detector: A Disassembling Perspective","summary":" Open-vocabulary detection (OVD) is a new object detection paradigm, aiming to\nlocalize and recognize unseen objects defined by an unbounded vocabulary. This\nis challenging since traditional detectors can only learn from pre-defined\ncategories and thus fail to detect and localize objects outside the pre-defined\nvocabulary. To handle the challenge, OVD leverages pre-trained cross-modal VLMs,\nsuch as CLIP, ALIGN, etc. Previous works mainly focus on the open vocabulary\nclassification part, with less attention on the localization part. We argue\nthat for a good OVD detector, both classification and localization should be\nstudied in parallel for the novel object categories. We show in this work that\nimprovements in localization and cross-modal classification complement each\nother, and jointly compose a good OVD detector. We analyze three families of\nOVD methods with different design emphases. We first propose a vanilla\nmethod, i.e., cropping a bounding box obtained by a localizer and resizing it\nas input to CLIP. We next introduce another approach, which combines a standard\ntwo-stage object detector with CLIP. A two-stage object detector includes a\nvisual backbone, a region proposal network (RPN), and a region of interest\n(RoI) head. We decouple RPN and ROI head (DRR) and use RoIAlign to extract\nmeaningful features. In this case, it avoids resizing objects. To further\naccelerate the training time and reduce the model parameters, we couple RPN and\nROI head (CRR) as the third approach. We conduct extensive experiments on these\nthree types of approaches in different settings. On the OVD-COCO benchmark, DRR\nobtains the best performance and achieves 35.8 Novel AP$_{50}$, an absolute 2.8\ngain over the previous state-of-the-art (SOTA). For OVD-LVIS, DRR surpasses the\nprevious SOTA by 1.9 AP$_{50}$ in rare categories. We also provide an object\ndetection dataset called PID and provide a baseline on PID.\n","authors":["Jincheng Li","Chunyu Xie","Xiaoyu Wu","Bin Wang","Dawei Leng"],"pdf_url":"https://arxiv.org/pdf/2309.00227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16466v2","updated":"2023-09-01T02:35:11Z","published":"2023-08-31T05:20:48Z","title":"Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation\n with Meta-Learning","summary":" While the Segment Anything Model (SAM) excels in semantic segmentation for\ngeneral-purpose images, its performance significantly deteriorates when applied\nto medical images, primarily attributable to insufficient representation of\nmedical images in its training dataset. Nonetheless, gathering comprehensive\ndatasets and training models that are universally applicable is particularly\nchallenging due to the long-tail problem common in medical images. To address\nthis gap, here we present a Self-Sampling Meta SAM (SSM-SAM) framework for\nfew-shot medical image segmentation. 
Our innovation lies in the design of three\nkey modules: 1) an online fast gradient descent optimizer, further optimized by\na meta-learner, which ensures swift and robust adaptation to new tasks; 2) a\nSelf-Sampling module designed to provide well-aligned visual prompts for\nimproved attention allocation; and 3) a robust attention-based decoder\nspecifically designed for medical few-shot learning to capture relationships\nbetween different slices. Extensive experiments on a popular abdominal CT\ndataset and an MRI dataset demonstrate that the proposed method achieves\nsignificant improvements over state-of-the-art methods in few-shot\nsegmentation, with average improvements of 10.21% and 1.80% in terms of DSC,\nrespectively. In conclusion, we present a novel approach for rapid online\nadaptation in interactive image segmentation, adapting to a new organ in just\n0.83 minutes. Code is publicly available on GitHub upon acceptance.\n","authors":["Yiming Zhang","Tianang Leng","Kun Han","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13925v4","updated":"2023-09-01T02:33:26Z","published":"2023-07-26T02:46:50Z","title":"EasyNet: An Easy Network for 3D Industrial Anomaly Detection","summary":" 3D anomaly detection is an emerging and vital computer vision task in\nindustrial manufacturing (IM). Recently, many advanced algorithms have been\npublished, but most of them cannot meet the needs of IM. There are several\ndisadvantages: i) they are difficult to deploy on production lines since their\nalgorithms heavily rely on large pre-trained models; ii) they hugely increase\nstorage overhead due to overuse of memory banks; iii) their inference speed\ncannot reach real time. To overcome these issues, we propose an easy\nand deployment-friendly network (called EasyNet) without using pre-trained\nmodels and memory banks: firstly, we design a multi-scale multi-modality\nfeature encoder-decoder to accurately reconstruct the segmentation maps of\nanomalous regions and encourage the interaction between RGB images and depth\nimages; secondly, we adopt a multi-modality anomaly segmentation network to\nachieve a precise anomaly map; thirdly, we propose an attention-based\ninformation entropy fusion module for feature fusion during inference, making\nit suitable for real-time deployment. Extensive experiments show that EasyNet\nachieves an anomaly detection AUROC of 92.6% without using pre-trained models\nand memory banks. In addition, EasyNet is faster than existing methods, with a\nhigh frame rate of 94.55 FPS on a Tesla V100 GPU.\n","authors":["Ruitao Chen","Guoyang Xie","Jiaqi Liu","Jinbao Wang","Ziqi Luo","Jinfan Wang","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.13925v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00216v1","updated":"2023-09-01T02:27:05Z","published":"2023-09-01T02:27:05Z","title":"Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation","summary":" Facial sketch synthesis (FSS) aims to generate a vivid sketch portrait from a\ngiven facial photo. Existing FSS methods merely rely on 2D representations of\nfacial semantics or appearance. However, professional human artists usually use\noutlines or shadings to convey 3D geometry. Thus, facial 3D geometry (e.g. depth\nmap) is extremely important for FSS. Besides, different artists may use diverse\ndrawing techniques and create multiple styles of sketches; but the style is\nglobally consistent in a sketch. 
Inspired by such observations, in this paper,\nwe propose a novel Human-Inspired Dynamic Adaptation (HIDA) method. Specifically,\nwe propose to dynamically modulate neuron activations based on a joint\nconsideration of both facial 3D geometry and 2D appearance, as well as globally\nconsistent style control. Besides, we use deformable convolutions at\ncoarse scales to align deep features for generating abstract and distinct\noutlines. Experiments show that HIDA can generate high-quality sketches in\nmultiple styles, and significantly outperforms previous methods over a large\nrange of challenging faces. Besides, HIDA allows precise style control of the\nsynthesized sketch, and generalizes well to natural scenes and other artistic\nstyles. Our code and results have been released online at:\nhttps://github.com/AiArt-HDU/HIDA.\n","authors":["Fei Gao","Yifan Zhu","Chang Jiang","Nannan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00216v1.pdf","comment":"To appear on ICCV'23"},{"id":"http://arxiv.org/abs/2303.13121v2","updated":"2023-09-01T02:26:18Z","published":"2023-03-23T09:23:11Z","title":"DetOFA: Efficient Training of Once-for-All Networks for Object Detection\n by Using Pre-trained Supernet and Path Filter","summary":" We address the challenge of training a large supernet for the object\ndetection task, using a relatively small amount of training data. Specifically,\nwe propose an efficient supernet-based neural architecture search (NAS) method\nthat uses transfer learning and search space pruning. First, the supernet is\npre-trained on a classification task, for which large datasets are available.\nSecond, the search space defined by the supernet is pruned by removing\ncandidate models that are predicted to perform poorly. To effectively remove\nthe candidates over a wide range of resource constraints, we particularly\ndesign a performance predictor, called path filter, which can accurately\npredict the relative performance of the models that satisfy similar resource\nconstraints. Hence, supernet training is more focused on the best-performing\ncandidates. Our path filter handles prediction for paths with different\nresource budgets. Compared to once-for-all, our proposed method reduces the\ncomputational cost of the optimal network architecture by 30% and 63%, while\nyielding a better accuracy-floating point operations Pareto front (0.85 and 0.45\npoints of improvement on average precision for Pascal VOC and COCO,\nrespectively).\n","authors":["Yuiko Sakuma","Masato Ishii","Takuya Narihira"],"pdf_url":"https://arxiv.org/pdf/2303.13121v2.pdf","comment":"Accepted to ICCV workshop 2023"},{"id":"http://arxiv.org/abs/2309.00215v1","updated":"2023-09-01T02:19:41Z","published":"2023-09-01T02:19:41Z","title":"Towards Addressing the Misalignment of Object Proposal Evaluation for\n Vision-Language Tasks via Semantic Grounding","summary":" Object proposal generation serves as a standard pre-processing step in\nVision-Language (VL) tasks (image captioning, visual question answering, etc.).\nThe performance of object proposals generated for VL tasks is currently\nevaluated across all available annotations, a protocol that we show is\nmisaligned - higher scores do not necessarily correspond to improved\nperformance on downstream VL tasks. Our work serves as a study of this\nphenomenon and explores the effectiveness of semantic grounding to mitigate its\neffects. 
To this end, we propose evaluating object proposals against only a\nsubset of available annotations, selected by thresholding an annotation\nimportance score. Importance of object annotations to VL tasks is quantified by\nextracting relevant semantic information from text describing the image. We\nshow that our method is consistent and demonstrates greatly improved alignment\nwith annotations selected by image captioning metrics and human annotation when\ncompared against existing techniques. Lastly, we compare current detectors used\nin the Scene Graph Generation (SGG) benchmark as a use case, which serves as an\nexample of when traditional object proposal evaluation techniques are\nmisaligned.\n","authors":["Joshua Feinglass","Yezhou Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00215v1.pdf","comment":"Accepted to WACV 2024 (Round 1)"},{"id":"http://arxiv.org/abs/2308.11070v2","updated":"2023-09-01T01:55:14Z","published":"2023-08-21T22:31:54Z","title":"Temporal-Distributed Backdoor Attack Against Video Based Action\n Recognition","summary":" Deep neural networks (DNNs) have achieved tremendous success in various\napplications including video action recognition, yet remain vulnerable to\nbackdoor attacks (Trojans). The backdoor-compromised model will mis-classify to\nthe target class chosen by the attacker when a test instance (from a non-target\nclass) is embedded with a specific trigger, while maintaining high accuracy on\nattack-free instances. Although there are extensive studies on backdoor attacks\nagainst image data, the susceptibility of video-based systems under backdoor\nattacks remains largely unexplored. Current studies are direct extensions of\napproaches proposed for image data, e.g., the triggers are independently\nembedded within the frames, which tend to be detectable by existing defenses.\nIn this paper, we introduce a simple yet effective backdoor attack against\nvideo data. Our proposed attack, adding perturbations in a transformed domain,\nplants an imperceptible, temporally distributed trigger across the video\nframes, and is shown to be resilient to existing defensive strategies. The\neffectiveness of the proposed attack is demonstrated by extensive experiments\nwith various well-known models on two video recognition benchmarks, UCF101 and\nHMDB51, and a sign language recognition benchmark, Greek Sign Language (GSL)\ndataset. We delve into the impact of several influential factors on our\nproposed attack and identify an intriguing effect termed \"collateral damage\"\nthrough extensive studies.\n","authors":["Xi Li","Songhe Wang","Ruiquan Huang","Mahanth Gowda","George Kesidis"],"pdf_url":"https://arxiv.org/pdf/2308.11070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00206v1","updated":"2023-09-01T01:48:21Z","published":"2023-09-01T01:48:21Z","title":"Gap and Overlap Detection in Automated Fiber Placement","summary":" The identification and correction of manufacturing defects, particularly gaps\nand overlaps, are crucial for ensuring high-quality composite parts produced\nthrough Automated Fiber Placement (AFP). These imperfections are the most\ncommonly observed issues that can significantly impact the overall quality of\nthe composite parts. Manual inspection is both time-consuming and\nlabor-intensive, making it an inefficient approach. To overcome this challenge,\nthe implementation of an automated defect detection system serves as the\noptimal solution. 
In this paper, we introduce a novel method that uses an\nOptical Coherence Tomography (OCT) sensor and computer vision techniques to\ndetect and locate gaps and overlaps in composite parts. Our approach involves\ngenerating a depth map image of the composite surface that highlights the\nelevation of composite tapes (or tows) on the surface. By detecting the\nboundaries of each tow, our algorithm can compare consecutive tows and identify\ngaps or overlaps that may exist between them. Any gaps or overlaps exceeding a\npredefined tolerance threshold are considered manufacturing defects. To\nevaluate the performance of our approach, we compare the detected defects with\nthe ground truth annotated by experts. The results demonstrate a high level of\naccuracy and efficiency in gap and overlap segmentation.\n","authors":["Assef Ghamisi","Homayoun Najjaran"],"pdf_url":"https://arxiv.org/pdf/2309.00206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00199v1","updated":"2023-09-01T01:40:39Z","published":"2023-09-01T01:40:39Z","title":"Diffusion Model with Clustering-based Conditioning for Food Image\n Generation","summary":" Image-based dietary assessment serves as an efficient and accurate solution\nfor recording and analyzing nutrition intake using eating occasion images as\ninput. Deep learning-based techniques are commonly used to perform image\nanalysis such as food classification, segmentation, and portion size\nestimation, which rely on large amounts of food images with annotations for\ntraining. However, such data dependency poses significant barriers to\nreal-world applications, because acquiring a substantial, diverse, and balanced\nset of food images can be challenging. One potential solution is to use\nsynthetic food images for data augmentation. Although existing work has\nexplored the use of generative adversarial networks (GAN) based structures for\ngeneration, the quality of synthetic food images still remains subpar. In\naddition, while diffusion-based generative models have shown promising results\nfor general image generation tasks, the generation of food images can be\nchallenging due to the substantial intra-class variance. In this paper, we\ninvestigate the generation of synthetic food images based on the conditional\ndiffusion model and propose an effective clustering-based training framework,\nnamed ClusDiff, for generating high-quality and representative food images. The\nproposed method is evaluated on the Food-101 dataset and shows improved\nperformance when compared with existing image generation works. We also\ndemonstrate that the synthetic food images generated by ClusDiff can help\naddress the severe class imbalance issue in long-tailed food classification\nusing the VFN-LT dataset.\n","authors":["Yue Han","Jiangpeng He","Mridul Gupta","Edward J. Delp","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.00199v1.pdf","comment":"Accepted for 31st ACM International Conference on Multimedia: 8th\n International Workshop on Multimedia Assisted Dietary Management (MADiMa\n 2023)"},{"id":"http://arxiv.org/abs/2309.00188v1","updated":"2023-09-01T01:01:13Z","published":"2023-09-01T01:01:13Z","title":"DARC: Distribution-Aware Re-Coloring Model for Generalizable Nucleus\n Segmentation","summary":" Nucleus segmentation is usually the first step in pathological image analysis\ntasks. Generalizable nucleus segmentation refers to the problem of training a\nsegmentation model that is robust to domain gaps between the source and target\ndomains. 
The domain gaps are usually believed to be caused by the varied image\nacquisition conditions, e.g., different scanners, tissues, or staining\nprotocols. In this paper, we argue that domain gaps can also be caused by\ndifferent foreground (nucleus)-background ratios, as this ratio significantly\naffects feature statistics that are critical to normalization layers. We\npropose a Distribution-Aware Re-Coloring (DARC) model that handles the above\nchallenges from two perspectives. First, we introduce a re-coloring method that\nrelieves dramatic image color variations between different domains. Second, we\npropose a new instance normalization method that is robust to the variation in\nforeground-background ratios. We evaluate the proposed methods on two H$\\&$E\nstained image datasets, named CoNSeP and CPM17, and two IHC stained image\ndatasets, called DeepLIIF and BC-DeepLIIF. Extensive experimental results\njustify the effectiveness of our proposed DARC model. Codes are available at\n\\url{https://github.com/csccsccsccsc/DARC\n","authors":["Shengcong Chen","Changxing Ding","Dacheng Tao","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2309.00188v1.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.00187v1","updated":"2023-09-01T00:59:43Z","published":"2023-09-01T00:59:43Z","title":"Vision-aided nonlinear control framework for shake table tests","summary":" The structural response under the earthquake excitations can be simulated by\nscaled-down model shake table tests or full-scale model shake table tests. In\nthis paper, adaptive control theory is used as a nonlinear shake table control\nalgorithm which considers the inherent nonlinearity of the shake table system\nand the Control-Structural Interaction (CSI) effect that the linear controller\ncannot consider, such as the Proportional-Integral-Derivative (PID) controller.\nThe mass of the specimen can be assumed as an unknown variation and the unknown\nparameter will be replaced by an estimated value in the proposed control\nframework. The signal generated by the control law of the adaptive control\nmethod will be implemented by a loop-shaping controller. To verify the\nstability and feasibility of the proposed control framework, a simulation of a\nbare shake table and experiments with a bare shake table with a two-story frame\nwere carried out. This study randomly selects Earthquake recordings from the\nPacific Earthquake Engineering Research Center (PEER) database. The simulation\nand experimental results show that the proposed control framework can be\neffectively used in shake table control.\n","authors":["Zhongwei Chen","T. Y. Yang","Yifei Xiao","Xiao Pan","Wanyan Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00187v1.pdf","comment":"10 pages, 7 figures, accepted in the Canadian Conference - Pacific\n Conference on Earthquake Engineering 2023, Vancouver, British Columbia"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.00550v1","updated":"2023-09-01T15:59:14Z","published":"2023-09-01T15:59:14Z","title":"NeMig -- A Bilingual News Collection and Knowledge Graph about Migration","summary":" News recommendation plays a critical role in shaping the public's worldviews\nthrough the way in which it filters and disseminates information about\ndifferent topics. Given the crucial impact that media plays in opinion\nformation, especially for sensitive topics, understanding the effects of\npersonalized recommendation beyond accuracy has become essential in today's\ndigital society. 
In this work, we present NeMig, a bilingual news collection on\nthe topic of migration, and corresponding rich user data. In comparison to\nexisting news recommendation datasets, which comprise a large variety of\nmonolingual news, NeMig covers articles on a single controversial topic,\npublished in both Germany and the US. We annotate the sentiment polarization of\nthe articles and the political leanings of the media outlets, in addition to\nextracting subtopics and named entities disambiguated through Wikidata. These\nfeatures can be used to analyze the effects of algorithmic news curation beyond\naccuracy-based performance, such as recommender biases and the creation of\nfilter bubbles. We construct domain-specific knowledge graphs from the news\ntext and metadata, thus encoding knowledge-level connections between articles.\nImportantly, while existing datasets include only click behavior, we collect\nuser socio-demographic and political information in addition to explicit click\nfeedback. We demonstrate the utility of NeMig through experiments on the tasks\nof news recommenders benchmarking, analysis of biases in recommenders, and news\ntrends analysis. NeMig aims to provide a useful resource for the news\nrecommendation community and to foster interdisciplinary research into the\nmultidimensional effects of algorithmic news curation.\n","authors":["Andreea Iana","Mehwish Alam","Alexander Grote","Nevena Nikolajevic","Katharina Ludwig","Philipp Müller","Christof Weinhardt","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2309.00550v1.pdf","comment":"Accepted at the 11th International Workshop on News Recommendation\n and Analytics (INRA 2023) in conjunction with ACM RecSys 2023"},{"id":"http://arxiv.org/abs/2308.16505v2","updated":"2023-09-01T15:40:16Z","published":"2023-08-31T07:36:44Z","title":"Recommender AI Agent: Integrating Large Language Models for Interactive\n Recommendations","summary":" Recommender models excel at providing domain-specific item recommendations by\nleveraging extensive user behavior data. Despite their ability to act as\nlightweight domain experts, they struggle to perform versatile tasks such as\nproviding explanations and engaging in conversations. On the other hand, large\nlanguage models (LLMs) represent a significant step towards artificial general\nintelligence, showcasing remarkable capabilities in instruction comprehension,\ncommonsense reasoning, and human interaction. However, LLMs lack the knowledge\nof domain-specific item catalogs and behavioral patterns, particularly in areas\nthat diverge from general world knowledge, such as online e-commerce.\nFinetuning LLMs for each domain is neither economic nor efficient.\n In this paper, we bridge the gap between recommender models and LLMs,\ncombining their respective strengths to create a versatile and interactive\nrecommender system. We introduce an efficient framework called InteRecAgent,\nwhich employs LLMs as the brain and recommender models as tools. We first\noutline a minimal set of essential tools required to transform LLMs into\nInteRecAgent. We then propose an efficient workflow within InteRecAgent for\ntask execution, incorporating key components such as a memory bus, dynamic\ndemonstration-augmented task planning, and reflection. InteRecAgent enables\ntraditional recommender systems, such as those ID-based matrix factorization\nmodels, to become interactive systems with a natural language interface through\nthe integration of LLMs. 
Experimental results on several public datasets show\nthat InteRecAgent achieves satisfying performance as a conversational\nrecommender system, outperforming general-purpose LLMs.\n","authors":["Xu Huang","Jianxun Lian","Yuxuan Lei","Jing Yao","Defu Lian","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.16505v2.pdf","comment":"16 pages, 15 figures, 4 tables"},{"id":"http://arxiv.org/abs/2304.06182v3","updated":"2023-09-01T14:54:57Z","published":"2023-04-12T22:46:52Z","title":"GNNUERS: Fairness Explanation in GNNs for Recommendation via\n Counterfactual Reasoning","summary":" Nowadays, research into personalization has been focusing on explainability\nand fairness. Several approaches proposed in recent works are able to explain\nindividual recommendations in a post-hoc manner or by explanation paths.\nHowever, explainability techniques applied to unfairness in recommendation have\nbeen limited to finding user/item features mostly related to biased\nrecommendations. In this paper, we devised a novel algorithm that leverages\ncounterfactuality methods to discover user unfairness explanations in the form\nof user-item interactions. In our counterfactual framework, interactions are\nrepresented as edges in a bipartite graph, with users and items as nodes. Our\nBipartite Graph Explainer perturbs the topological structure to find an altered\nversion (counterfactual explanation) that minimizes the disparity in utility\nbetween the protected and unprotected demographic groups. Experiments on four\nreal-world graphs coming from various domains showed that our method can\nsystematically explain user unfairness on three state-of-the-art GNN-based\nrecommendation models. Moreover, an empirical evaluation of the perturbed\nnetwork uncovered relevant patterns that justify the nature of the unfairness\ndiscovered by the generated explanations. The source code and the preprocessed\ndata sets are available at https://github.com/jackmedda/RS-BGExplainer.\n","authors":["Giacomo Medda","Francesco Fabbri","Mirko Marras","Ludovico Boratto","Gianni Fenu"],"pdf_url":"https://arxiv.org/pdf/2304.06182v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00472v1","updated":"2023-09-01T14:11:19Z","published":"2023-09-01T14:11:19Z","title":"General and Practical Tuning Method for Off-the-Shelf Graph-Based Index:\n SISAP Indexing Challenge Report by Team UTokyo","summary":" Despite the efficacy of graph-based algorithms for Approximate Nearest\nNeighbor (ANN) searches, the optimal tuning of such systems remains unclear.\nThis study introduces a method to tune the performance of off-the-shelf\ngraph-based indexes, focusing on the dimension of vectors, database size, and\nentry points of graph traversal. We utilize a black-box optimization algorithm\nto perform integrated tuning to meet the required levels of recall and Queries\nPer Second (QPS). We applied our approach to Task A of the SISAP 2023 Indexing\nChallenge and got second place in the 10M and 30M tracks. It improves\nperformance substantially compared to brute force methods. 
This research offers\na universally applicable tuning method for graph-based indexes, extending\nbeyond the specific conditions of the competition to broader uses.\n","authors":["Yutaro Oguri","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2309.00472v1.pdf","comment":"Accepted paper on 2nd place solution of SISAP 2023 Indexing Challenge\n Task A"},{"id":"http://arxiv.org/abs/2309.00356v1","updated":"2023-09-01T09:22:33Z","published":"2023-09-01T09:22:33Z","title":"Explainable Active Learning for Preference Elicitation","summary":" Gaining insights into the preferences of new users and subsequently\npersonalizing recommendations necessitate managing user interactions\nintelligently, namely, posing pertinent questions to elicit valuable\ninformation effectively. In this study, our focus is on a specific scenario of\nthe cold-start problem, where the recommendation system lacks adequate user\npresence or access to other users' data is restricted, obstructing the use of\nuser profiling methods that utilize existing data in the system. We employ Active\nLearning (AL) to solve the addressed problem with the objective of maximizing\ninformation acquisition with minimal user effort. AL operates by selecting\ninformative data from a large unlabeled set, querying an oracle to label them,\nand eventually updating a machine learning (ML) model. We operate AL in an\nintegrated process of unsupervised, semi-supervised, and supervised ML within\nan explanatory preference elicitation process. It harvests user feedback (given\nfor the system's explanations on the presented items) over informative samples\nto update an underlying ML model estimating user preferences. The designed user\ninteraction facilitates personalizing the system by incorporating user feedback\ninto the ML model and also enhances user trust by refining the system's\nexplanations on recommendations. We implement the proposed preference\nelicitation methodology for food recommendation. We conducted human experiments\nto assess its efficacy in the short term and also experimented with several AL\nstrategies over synthetic user profiles that we created for two food datasets,\naiming for long-term performance analysis. The experimental results demonstrate\nthe efficiency of the proposed preference elicitation with limited user-labeled\ndata while also enhancing user trust through accurate explanations.\n","authors":["Furkan Cantürk","Reyhan Aydoğan"],"pdf_url":"https://arxiv.org/pdf/2309.00356v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2309.00347v1","updated":"2023-09-01T09:08:21Z","published":"2023-09-01T09:08:21Z","title":"Towards Contrastive Learning in Music Video Domain","summary":" Contrastive learning is a powerful way of learning multimodal representations\nacross various domains such as image-caption retrieval and audio-visual\nrepresentation learning. In this work, we investigate if these findings\ngeneralize to the domain of music videos. Specifically, we create a dual\nencoder for the audio and video modalities and train it using a bidirectional\ncontrastive loss. For the experiments, we use an industry dataset containing\n550 000 music videos as well as the public Million Song Dataset, and evaluate\nthe quality of learned representations on the downstream tasks of music tagging\nand genre classification. Our results indicate that pre-trained networks\nwithout contrastive fine-tuning outperform our contrastive learning approach\nwhen evaluated on both tasks. 
To gain a better understanding of the reasons\ncontrastive learning was not successful for music videos, we perform a\nqualitative analysis of the learned representations, revealing why contrastive\nlearning might have difficulties uniting embeddings from two modalities. Based\non these findings, we outline possible directions for future work. To\nfacilitate the reproducibility of our results, we share our code and the\npre-trained model.\n","authors":["Karel Veldkamp","Mariya Hendriksen","Zoltán Szlávik","Alexander Keijser"],"pdf_url":"https://arxiv.org/pdf/2309.00347v1.pdf","comment":"6 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.11288v2","updated":"2023-09-01T07:17:54Z","published":"2023-08-22T08:57:44Z","title":"Test Time Embedding Normalization for Popularity Bias Mitigation","summary":" Popularity bias is a widespread problem in the field of recommender systems,\nwhere popular items tend to dominate recommendation results. In this work, we\npropose 'Test Time Embedding Normalization' as a simple yet effective strategy\nfor mitigating popularity bias, which surpasses the performance of the previous\nmitigation approaches by a significant margin. Our approach utilizes the\nnormalized item embedding during the inference stage to control the influence\nof embedding magnitude, which is highly correlated with item popularity.\nThrough extensive experiments, we show that our method combined with the\nsampled softmax loss effectively reduces popularity bias compare to previous\napproaches for bias mitigation. We further investigate the relationship between\nuser and item embeddings and find that the angular similarity between\nembeddings distinguishes preferable and non-preferable items regardless of\ntheir popularity. The analysis explains the mechanism behind the success of our\napproach in eliminating the impact of popularity bias. Our code is available at\nhttps://github.com/ml-postech/TTEN.\n","authors":["Dain Kim","Jinhyeok Park","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11288v2.pdf","comment":"5 pages, CIKM 2023"},{"id":"http://arxiv.org/abs/2010.01600v3","updated":"2023-09-01T00:03:47Z","published":"2020-10-04T15:20:05Z","title":"Sparseness-constrained Nonnegative Tensor Factorization for Detecting\n Topics at Different Time Scales","summary":" Temporal data (such as news articles or Twitter feeds) often consists of a\nmixture of long-lasting trends and popular but short-lasting topics of\ninterest. A truly successful topic modeling strategy should be able to detect\nboth types of topics and clearly locate them in time. In this paper, we first\nshow that nonnegative CANDECOMP/PARAFAC decomposition (NCPD) is able to\ndiscover topics of variable persistence automatically. Then, we propose\nsparseness-constrained NCPD (S-NCPD) and its online variant in order to\nactively control the length of the learned topics effectively and efficiently.\nFurther, we propose quantitative ways to measure the topic length and\ndemonstrate the ability of S-NCPD (as well as its online variant) to discover\nshort and long-lasting temporal topics in a controlled manner in semi-synthetic\nand real-world data including news headlines. 
We also demonstrate that the\nonline variant of S-NCPD reduces the reconstruction error more rapidly than\nS-NCPD.\n","authors":["Lara Kassab","Alona Kryshchenko","Hanbaek Lyu","Denali Molitor","Deanna Needell","Elizaveta Rebrova","Jiahong Yuan"],"pdf_url":"https://arxiv.org/pdf/2010.01600v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.00615v1","updated":"2023-09-01T17:59:47Z","published":"2023-09-01T17:59:47Z","title":"Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D\n Understanding, Generation, and Instruction Following","summary":" We introduce Point-Bind, a 3D multi-modality model aligning point clouds with\n2D image, language, audio, and video. Guided by ImageBind, we construct a joint\nembedding space between 3D and multi-modalities, enabling many promising\napplications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D\nopen-world understanding. On top of this, we further present Point-LLM, the\nfirst 3D large language model (LLM) following 3D multi-modal instructions. By\nparameter-efficient fine-tuning techniques, Point-LLM injects the semantics of\nPoint-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction\ndata, but exhibits superior 3D and multi-modal question-answering capacity. We\nhope our work may cast a light on the community for extending 3D point clouds\nto multi-modality applications. Code is available at\nhttps://github.com/ZiyuGuo99/Point-Bind_Point-LLM.\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Yiwen Tang","Xianzheng Ma","Jiaming Han","Kexin Chen","Peng Gao","Xianzhi Li","Hongsheng Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2309.00615v1.pdf","comment":"Work in progress. Code is available at\n https://github.com/ZiyuGuo99/Point-Bind_Point-LLM"},{"id":"http://arxiv.org/abs/2309.00614v1","updated":"2023-09-01T17:59:44Z","published":"2023-09-01T17:59:44Z","title":"Baseline Defenses for Adversarial Attacks Against Aligned Language\n Models","summary":" As Large Language Models quickly become ubiquitous, their security\nvulnerabilities are critical to understand. Recent work shows that text\noptimizers can produce jailbreaking prompts that bypass moderation and\nalignment. Drawing from the rich body of work on adversarial machine learning,\nwe approach these attacks with three questions: What threat models are\npractically useful in this domain? How do baseline defense techniques perform\nin this new domain? How does LLM security differ from computer vision?\n We evaluate several baseline defense strategies against leading adversarial\nattacks on LLMs, discussing the various settings in which each is feasible and\neffective. Particularly, we look at three types of defenses: detection\n(perplexity based), input preprocessing (paraphrase and retokenization), and\nadversarial training. 
We discuss white-box and gray-box settings, as well as\nthe robustness-performance trade-off for each of the defenses considered.\nSurprisingly, we find much more success with filtering and preprocessing than\nwe would expect from other domains, such as vision, providing a first\nindication that the relative strengths of these defenses may be weighed\ndifferently in these domains.\n","authors":["Neel Jain","Avi Schwarzschild","Yuxin Wen","Gowthami Somepalli","John Kirchenbauer","Ping-yeh Chiang","Micah Goldblum","Aniruddha Saha","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2309.00614v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2309.00613v1","updated":"2023-09-01T17:59:29Z","published":"2023-09-01T17:59:29Z","title":"Iterative Multi-granular Image Editing using Diffusion Models","summary":" Recent advances in text-guided image synthesis have dramatically changed how\ncreative professionals generate artistic and aesthetically pleasing visual\nassets. To fully support such creative endeavors, the process should possess\nthe ability to: 1) iteratively edit the generations and 2) control the spatial\nreach of desired changes (global, local or anything in between). We formalize\nthis pragmatic problem setting as Iterative Multi-granular Editing. While there\nhas been substantial progress with diffusion-based models for image synthesis\nand editing, they are all one shot (i.e., no iterative editing capabilities)\nand do not naturally yield multi-granular control (i.e., covering the full\nspectrum of local-to-global edits). To overcome these drawbacks, we propose\nEMILIE: Iterative Multi-granular Image Editor. EMILIE introduces a novel latent\niteration strategy, which re-purposes a pre-trained diffusion model to\nfacilitate iterative editing. This is complemented by a gradient control\noperation for multi-granular control. We introduce a new benchmark dataset to\nevaluate our newly proposed setting. We conduct exhaustive quantitative and\nqualitative evaluation against recent state-of-the-art approaches adapted to\nour task, to bring out the mettle of EMILIE. We hope our work will attract\nattention to this newly identified, pragmatic problem setting.\n","authors":["K J Joseph","Prateksha Udhayanan","Tripti Shukla","Aishwarya Agarwal","Srikrishna Karanam","Koustava Goswami","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2309.00613v1.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2309.00612v1","updated":"2023-09-01T17:59:06Z","published":"2023-09-01T17:59:06Z","title":"Bayesian deep learning for cosmic volumes with modified gravity","summary":" The new generation of galaxy surveys will provide unprecedented data allowing\nus to test gravity at cosmological scales. A robust cosmological analysis of\nthe large-scale structure demands exploiting the nonlinear information encoded\nin the cosmic web. Machine Learning techniques provide such tools; however, they do\nnot provide an a priori assessment of uncertainties. This study aims at extracting\ncosmological parameters from modified gravity (MG) simulations through deep\nneural networks endowed with uncertainty estimations. We implement Bayesian\nneural networks (BNNs) with an enriched approximate posterior distribution\nconsidering two cases: one with a single Bayesian last layer (BLL), and another\none with Bayesian layers at all levels (FullB). 
We train both BNNs with\nreal-space density fields and power-spectra from a suite of 2000 dark matter\nonly particle mesh $N$-body simulations including modified gravity models\nrelying on MG-PICOLA covering 256 $h^{-1}$ Mpc side cubical volumes with\n128$^3$ particles. BNNs excel in accurately predicting parameters for\n$\\Omega_m$ and $\\sigma_8$ and their respective correlation with the MG\nparameter. We find out that BNNs yield well-calibrated uncertainty estimates\novercoming the over- and under-estimation issues in traditional neural\nnetworks. We observe that the presence of MG parameter leads to a significant\ndegeneracy with $\\sigma_8$ being one of the possible explanations of the poor\nMG predictions. Ignoring MG, we obtain a deviation of the relative errors in\n$\\Omega_m$ and $\\sigma_8$ by at least $30\\%$. Moreover, we report consistent\nresults from the density field and power spectra analysis, and comparable\nresults between BLL and FullB experiments which permits us to save computing\ntime by a factor of two. This work contributes in setting the path to extract\ncosmological parameters from complete small cosmic volumes towards the highly\nnonlinear regime.\n","authors":["Jorge Enrique García-Farieta","Héctor J Hortúa","Francisco-Shu Kitaura"],"pdf_url":"https://arxiv.org/pdf/2309.00612v1.pdf","comment":"13 pages, 7 figures and 7 tables"},{"id":"http://arxiv.org/abs/2309.00608v1","updated":"2023-09-01T17:54:14Z","published":"2023-09-01T17:54:14Z","title":"Copiloting the Copilots: Fusing Large Language Models with Completion\n Engines for Automated Program Repair","summary":" During Automated Program Repair (APR), it can be challenging to synthesize\ncorrect patches for real-world systems in general-purpose programming\nlanguages. Recent Large Language Models (LLMs) have been shown to be helpful\n\"copilots\" in assisting developers with various coding tasks, and have also\nbeen directly applied for patch synthesis. However, most LLMs treat programs as\nsequences of tokens, meaning that they are ignorant of the underlying semantics\nconstraints of the target programming language. This results in plenty of\nstatically invalid generated patches, impeding the practicality of the\ntechnique. Therefore, we propose Repilot, a framework to further copilot the AI\n\"copilots\" (i.e., LLMs) by synthesizing more valid patches during the repair\nprocess. Our key insight is that many LLMs produce outputs autoregressively\n(i.e., token by token), resembling human writing programs, which can be\nsignificantly boosted and guided through a Completion Engine. Repilot\nsynergistically synthesizes a candidate patch through the interaction between\nan LLM and a Completion Engine, which 1) prunes away infeasible tokens\nsuggested by the LLM and 2) proactively completes the token based on the\nsuggestions provided by the Completion Engine. Our evaluation on a subset of\nthe widely-used Defects4j 1.2 and 2.0 datasets shows that Repilot fixes 66 and\n50 bugs, respectively, surpassing the best-performing baseline by 14 and 16\nbugs fixed. 
More importantly, Repilot is capable of producing more valid and\ncorrect patches than the base LLM when given the same generation budget.\n","authors":["Yuxiang Wei","Chunqiu Steven Xia","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08468v3","updated":"2023-09-01T17:37:42Z","published":"2023-02-16T18:23:22Z","title":"LEVER: Learning to Verify Language-to-Code Generation with Execution","summary":" The advent of large language models trained on code (code LLMs) has led to\nsignificant progress in language-to-code generation. State-of-the-art\napproaches in this area combine LLM decoding with sample pruning and reranking\nusing test cases or heuristics based on the execution results. However, it is\nchallenging to obtain test cases for many real-world language-to-code\napplications, and heuristics cannot adequately capture the semantic features of the\nexecution results, such as data type and value range, which often indicate the\ncorrectness of the program. In this work, we propose LEVER, a simple approach\nto improve language-to-code generation by learning to verify the generated\nprograms with their execution results. Specifically, we train verifiers to\ndetermine whether a program sampled from the LLMs is correct or not based on\nthe natural language input, the program itself and its execution results. The\nsampled programs are reranked by combining the verification score with the LLM\ngeneration probability, and marginalizing over programs with the same execution\nresults. On four datasets across the domains of table QA, math QA and basic\nPython programming, LEVER consistently improves over the base code LLMs (4.6% to\n10.9% with code-davinci-002) and achieves new state-of-the-art results on all\nof them.\n","authors":["Ansong Ni","Srini Iyer","Dragomir Radev","Ves Stoyanov","Wen-tau Yih","Sida I. Wang","Xi Victoria Lin"],"pdf_url":"https://arxiv.org/pdf/2302.08468v3.pdf","comment":"ICML'23; code available at https://github.com/niansong1996/lever"},{"id":"http://arxiv.org/abs/2308.09709v2","updated":"2023-09-01T17:25:21Z","published":"2023-08-18T17:58:36Z","title":"Neural-network quantum state study of the long-range antiferromagnetic\n Ising chain","summary":" We investigate quantum phase transitions in the transverse field Ising chain\nwith algebraically decaying long-range antiferromagnetic interactions by using\nthe variational Monte Carlo method with the restricted Boltzmann machine being\nemployed as a trial wave function ansatz. In the finite-size scaling analysis\nwith the order parameter and the second R\\'enyi entropy, we find that the\ncentral charge deviates from 1/2 at a small decay exponent $\\alpha_\\mathrm{LR}$\nin contrast to the critical exponents staying very close to the short-range\n(SR) Ising values regardless of $\\alpha_\\mathrm{LR}$ examined, supporting the\npreviously proposed scenario of conformal invariance breakdown. To identify the\nthreshold of the Ising universality and the conformal symmetry, we perform two\nadditional tests for the universal Binder ratio and the conformal field theory\n(CFT) description of the correlation function. 
It turns out that both indicate\na noticeable deviation from the SR Ising class at $\\alpha_\\mathrm{LR} < 2$.\nHowever, a closer look at the scaled correlation function for\n$\\alpha_\\mathrm{LR} \\ge 2$ shows a gradual change from the asymptotic line of\nthe CFT verified at $\\alpha_\\mathrm{LR} = 3$, providing a rough estimate of the\nthreshold being in the range of $2 \\lesssim \\alpha_\\mathrm{LR} < 3$.\n","authors":["Jicheol Kim","Dongkyu Kim","Dong-Hee Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00591v1","updated":"2023-09-01T17:12:43Z","published":"2023-09-01T17:12:43Z","title":"Fast and Regret Optimal Best Arm Identification: Fundamental Limits and\n Low-Complexity Algorithms","summary":" This paper considers a stochastic multi-armed bandit (MAB) problem with dual\nobjectives: (i) quick identification and commitment to the optimal arm, and\n(ii) reward maximization throughout a sequence of $T$ consecutive rounds.\nThough each objective has been individually well-studied, i.e., best arm\nidentification for (i) and regret minimization for (ii), the simultaneous\nrealization of both objectives remains an open problem, despite its practical\nimportance. This paper introduces \\emph{Regret Optimal Best Arm Identification}\n(ROBAI) which aims to achieve these dual objectives. To solve ROBAI with both\npre-determined stopping time and adaptive stopping time requirements, we\npresent the $\\mathsf{EOCP}$ algorithm and its variants respectively, which not\nonly achieve asymptotic optimal regret in both Gaussian and general bandits,\nbut also commit to the optimal arm in $\\mathcal{O}(\\log T)$ rounds with\npre-determined stopping time and $\\mathcal{O}(\\log^2 T)$ rounds with adaptive\nstopping time. We further characterize lower bounds on the commitment time\n(equivalent to sample complexity) of ROBAI, showing that $\\mathsf{EOCP}$ and\nits variants are sample optimal with pre-determined stopping time, and almost\nsample optimal with adaptive stopping time. Numerical results confirm our\ntheoretical analysis and reveal an interesting ``over-exploration'' phenomenon\ncarried by classic $\\mathsf{UCB}$ algorithms, such that $\\mathsf{EOCP}$ has\nsmaller regret even though it stops exploration much earlier than\n$\\mathsf{UCB}$ ($\\mathcal{O}(\\log T)$ versus $\\mathcal{O}(T)$), which suggests\nover-exploration is unnecessary and potentially harmful to system performance.\n","authors":["Qining Zhang","Lei Ying"],"pdf_url":"https://arxiv.org/pdf/2309.00591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10248v2","updated":"2023-09-01T17:07:29Z","published":"2023-08-20T12:21:05Z","title":"Activation Addition: Steering Language Models Without Optimization","summary":" Reliably controlling the behavior of large language models is a pressing open\nproblem. Existing methods include supervised finetuning, reinforcement learning\nfrom human feedback, prompt engineering, and guided decoding. We instead\ninvestigate activation engineering: modifying activations at inference time to\npredictably alter model behavior. In particular, we bias the forward pass with\nan added 'steering vector' implicitly specified through natural language.\n Unlike past work which learned these steering vectors, our Activation\nAddition (ActAdd) method computes them by taking the activation differences\nthat result from pairs of prompts. We demonstrate ActAdd on GPT-2 on\nOpenWebText and ConceptNet. 
Our inference-time approach yields control over\nhigh-level properties of output and preserves off-target model performance. It\ninvolves far less compute and implementation effort than finetuning, allows\nusers to provide natural language specifications, and its overhead scales\nnaturally with model size.\n","authors":["Alexander Matt Turner","Lisa Thiergart","David Udell","Gavin Leech","Ulisse Mini","Monte MacDiarmid"],"pdf_url":"https://arxiv.org/pdf/2308.10248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00585v1","updated":"2023-09-01T17:01:04Z","published":"2023-09-01T17:01:04Z","title":"PolyGET: Accelerating Polymer Simulations by Accurate and Generalizable\n Forcefield with Equivariant Transformer","summary":" Polymer simulation with both accuracy and efficiency is a challenging task.\nMachine learning (ML) forcefields have been developed to achieve both the\naccuracy of ab initio methods and the efficiency of empirical force fields.\nHowever, existing ML force fields are usually limited to single-molecule\nsettings, and their simulations are not robust enough. In this paper, we\npresent PolyGET, a new framework for Polymer Forcefields with Generalizable\nEquivariant Transformers. PolyGET is designed to capture complex quantum\ninteractions between atoms and generalize across various polymer families,\nusing a deep learning model called Equivariant Transformers. We propose a new\ntraining paradigm that focuses exclusively on optimizing forces, which is\ndifferent from existing methods that jointly optimize forces and energy. This\nsimple force-centric objective function avoids competing objectives between\nenergy and forces, thereby allowing for learning a unified forcefield ML model\nover different polymer families. We evaluated PolyGET on a large-scale dataset\nof 24 distinct polymer types and demonstrated state-of-the-art performance in\nforce accuracy and robust MD simulations. Furthermore, PolyGET can simulate\nlarge polymers with high fidelity to the reference ab initio DFT method while\nbeing able to generalize to unseen polymers.\n","authors":["Rui Feng","Huan Tran","Aubrey Toland","Binghong Chen","Qi Zhu","Rampi Ramprasad","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00584v1","updated":"2023-09-01T17:00:48Z","published":"2023-09-01T17:00:48Z","title":"Laminar: A New Serverless Stream-based Framework with Semantic Code\n Search and Code Completion","summary":" This paper introduces Laminar, a novel serverless framework based on\ndispel4py, a parallel stream-based dataflow library. Laminar efficiently\nmanages streaming workflows and components through a dedicated registry,\noffering a seamless serverless experience. Leveraging large language models,\nLaminar enhances the framework with semantic code search, code summarization,\nand code completion. 
This contribution enhances serverless computing by\nsimplifying the execution of streaming computations, managing data streams more\nefficiently, and offering a valuable tool for both researchers and\npractitioners.\n","authors":["Zaynab Zahra","Zihao Li","Rosa Filgueira"],"pdf_url":"https://arxiv.org/pdf/2309.00584v1.pdf","comment":"13 pages, 10 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2309.00583v1","updated":"2023-09-01T16:59:21Z","published":"2023-09-01T16:59:21Z","title":"Geometry-Informed Neural Operator for Large-Scale 3D PDEs","summary":" We propose the geometry-informed neural operator (GINO), a highly efficient\napproach to learning the solution operator of large-scale partial differential\nequations with varying geometries. GINO uses a signed distance function and\npoint-cloud representations of the input shape and neural operators based on\ngraph and Fourier architectures to learn the solution operator. The graph\nneural operator handles irregular grids and transforms them into and from\nregular latent grids on which Fourier neural operator can be efficiently\napplied. GINO is discretization-convergent, meaning the trained model can be\napplied to arbitrary discretization of the continuous domain and it converges\nto the continuum operator as the discretization is refined. To empirically\nvalidate the performance of our method on large-scale simulation, we generate\nthe industry-standard aerodynamics dataset of 3D vehicle geometries with\nReynolds numbers as high as five million. For this large-scale 3D fluid\nsimulation, numerical methods are expensive to compute surface pressure. We\nsuccessfully trained GINO to predict the pressure on car surfaces using only\nfive hundred data points. The cost-accuracy experiments show a $26,000 \\times$\nspeed-up compared to optimized GPU-based computational fluid dynamics (CFD)\nsimulators on computing the drag coefficient. When tested on new combinations\nof geometries and boundary conditions (inlet velocities), GINO obtains a\none-fourth reduction in error rate compared to deep neural network approaches.\n","authors":["Zongyi Li","Nikola Borislavov Kovachki","Chris Choy","Boyi Li","Jean Kossaifi","Shourya Prakash Otta","Mohammad Amin Nabian","Maximilian Stadler","Christian Hundt","Kamyar Azizzadenesheli","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2309.00583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00578v1","updated":"2023-09-01T16:45:52Z","published":"2023-09-01T16:45:52Z","title":"Consistency of Lloyd's Algorithm Under Perturbations","summary":" In the context of unsupervised learning, Lloyd's algorithm is one of the most\nwidely used clustering algorithms. It has inspired a plethora of work\ninvestigating the correctness of the algorithm under various settings with\nground truth clusters. In particular, in 2016, Lu and Zhou have shown that the\nmis-clustering rate of Lloyd's algorithm on $n$ independent samples from a\nsub-Gaussian mixture is exponentially bounded after $O(\\log(n))$ iterations,\nassuming proper initialization of the algorithm. 
However, in many applications,\nthe true samples are unobserved and need to be learned from the data via\npre-processing pipelines such as spectral methods on appropriate data matrices.\nWe show that the mis-clustering rate of Lloyd's algorithm on perturbed samples\nfrom a sub-Gaussian mixture is also exponentially bounded after $O(\\log(n))$\niterations under the assumptions of proper initialization and that the\nperturbation is small relative to the sub-Gaussian noise. In canonical settings\nwith ground truth clusters, we derive bounds for algorithms such as\n$k$-means$++$ to find good initializations and thus leading to the correctness\nof clustering via the main result. We show the implications of the results for\npipelines measuring the statistical significance of derived clusters from data\nsuch as SigClust. We use these general results to derive implications in\nproviding theoretical guarantees on the misclustering rate for Lloyd's\nalgorithm in a host of applications, including high-dimensional time series,\nmulti-dimensional scaling, and community detection for sparse networks via\nspectral clustering.\n","authors":["Dhruv Patel","Hui Shen","Shankar Bhamidi","Yufeng Liu","Vladas Pipiras"],"pdf_url":"https://arxiv.org/pdf/2309.00578v1.pdf","comment":"Preprint version 1"},{"id":"http://arxiv.org/abs/2309.00570v1","updated":"2023-09-01T16:30:02Z","published":"2023-09-01T16:30:02Z","title":"Mechanism of feature learning in convolutional neural networks","summary":" Understanding the mechanism of how convolutional neural networks learn\nfeatures from image data is a fundamental problem in machine learning and\ncomputer vision. In this work, we identify such a mechanism. We posit the\nConvolutional Neural Feature Ansatz, which states that covariances of filters\nin any convolutional layer are proportional to the average gradient outer\nproduct (AGOP) taken with respect to patches of the input to that layer. We\npresent extensive empirical evidence for our ansatz, including identifying high\ncorrelation between covariances of filters and patch-based AGOPs for\nconvolutional layers in standard neural architectures, such as AlexNet, VGG,\nand ResNets pre-trained on ImageNet. We also provide supporting theoretical\nevidence. We then demonstrate the generality of our result by using the\npatch-based AGOP to enable deep feature learning in convolutional kernel\nmachines. We refer to the resulting algorithm as (Deep) ConvRFM and show that\nour algorithm recovers similar features to deep convolutional networks\nincluding the notable emergence of edge detectors. Moreover, we find that Deep\nConvRFM overcomes previously identified limitations of convolutional kernels,\nsuch as their inability to adapt to local signals in images and, as a result,\nleads to sizable performance improvement over fixed convolutional kernels.\n","authors":["Daniel Beaglehole","Adityanarayanan Radhakrishnan","Parthe Pandit","Mikhail Belkin"],"pdf_url":"https://arxiv.org/pdf/2309.00570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00569v1","updated":"2023-09-01T16:26:42Z","published":"2023-09-01T16:26:42Z","title":"Amyloid-Beta Axial Plane PET Synthesis from Structural MRI: An Image\n Translation Approach for Screening Alzheimer's Disease","summary":" In this work, an image translation model is implemented to produce synthetic\namyloid-beta PET images from structural MRI that are quantitatively accurate.\nImage pairs of amyloid-beta PET and structural MRI were used to train the\nmodel. 
We found that the synthetic PET images could be produced with a high\ndegree of similarity to the ground truth in terms of shape and contrast, with overall high SSIM\nand PSNR. This work demonstrates that performing structural-to-quantitative\nimage translation is feasible, enabling access to amyloid-beta information\nfrom MRI alone.\n","authors":["Fernando Vega","Abdoljalil Addeh","M. Ethan MacDonald"],"pdf_url":"https://arxiv.org/pdf/2309.00569v1.pdf","comment":"Abstract submitted and presented to the International Society of\n Magnetic Resonance in Medicine (ISMRM 2023), Toronto, Canada"},{"id":"http://arxiv.org/abs/2309.00564v1","updated":"2023-09-01T16:20:04Z","published":"2023-09-01T16:20:04Z","title":"Interpretation of High-Dimensional Linear Regression: Effects of\n Nullspace and Regularization Demonstrated on Battery Data","summary":" High-dimensional linear regression is important in many scientific fields.\nThis article considers discrete measured data of underlying smooth latent\nprocesses, as is often obtained from chemical or biological systems.\nInterpretation in high dimensions is challenging because the nullspace and its\ninterplay with regularization shape regression coefficients. The data's\nnullspace contains all coefficients that satisfy $\\mathbf{Xw}=\\mathbf{0}$, thus\nallowing very different coefficients to yield identical predictions. We\ndeveloped an optimization formulation to compare regression coefficients and\ncoefficients obtained by physical engineering knowledge to understand which\npart of the coefficient differences is close to the nullspace. This nullspace\nmethod is tested on a synthetic example and lithium-ion battery data. The case\nstudies show that regularization and z-scoring are design choices that, if\nchosen corresponding to prior physical knowledge, lead to interpretable\nregression results. Otherwise, the combination of the nullspace and\nregularization hinders interpretability and can make it impossible to obtain\nregression coefficients close to the true coefficients when there is a true\nunderlying linear model. Furthermore, we demonstrate that regression methods\nthat do not produce coefficients orthogonal to the nullspace, such as fused\nlasso, can improve interpretability. In conclusion, the insights gained from\nthe nullspace perspective help to make informed design choices for building\nregression models on high-dimensional data and reasoning about potential\nunderlying linear models, which are important for system optimization and\nimproving scientific understanding.\n","authors":["Joachim Schaeffer","Eric Lenz","William C. Chueh","Martin Z. Bazant","Rolf Findeisen","Richard D. Braatz"],"pdf_url":"https://arxiv.org/pdf/2309.00564v1.pdf","comment":"Manuscript: 14 pages, 7 figures; Supplementary Information: 4 pages,\n 2 figures; Code available: https://github.com/JoachimSchaeffer/HDRegAnalytics"},{"id":"http://arxiv.org/abs/2309.00557v1","updated":"2023-09-01T16:08:00Z","published":"2023-09-01T16:08:00Z","title":"Interactive and Concentrated Differential Privacy for Bandits","summary":" Bandits play a crucial role in interactive learning schemes and modern\nrecommender systems. However, these systems often rely on sensitive user data,\nmaking privacy a critical concern. This paper investigates privacy in bandits\nwith a trusted centralized decision-maker through the lens of interactive\nDifferential Privacy (DP). 
While bandits under pure $\\epsilon$-global DP have\nbeen well-studied, we contribute to the understanding of bandits under zero\nConcentrated DP (zCDP). We provide minimax and problem-dependent lower bounds\non regret for finite-armed and linear bandits, which quantify the cost of\n$\\rho$-global zCDP in these settings. These lower bounds reveal two hardness\nregimes based on the privacy budget $\\rho$ and suggest that $\\rho$-global zCDP\nincurs less regret than pure $\\epsilon$-global DP. We propose two $\\rho$-global\nzCDP bandit algorithms, AdaC-UCB and AdaC-GOPE, for finite-armed and linear\nbandits respectively. Both algorithms use a common recipe of Gaussian mechanism\nand adaptive episodes. We analyze the regret of these algorithms to show that\nAdaC-UCB achieves the problem-dependent regret lower bound up to multiplicative\nconstants, while AdaC-GOPE achieves the minimax regret lower bound up to\npoly-logarithmic factors. Finally, we provide experimental validation of our\ntheoretical results under different settings.\n","authors":["Achraf Azize","Debabrota Basu"],"pdf_url":"https://arxiv.org/pdf/2309.00557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00543v1","updated":"2023-09-01T15:52:32Z","published":"2023-09-01T15:52:32Z","title":"Curating Naturally Adversarial Datasets for Trustworthy AI in Healthcare","summary":" Deep learning models have shown promising predictive accuracy for time-series\nhealthcare applications. However, ensuring the robustness of these models is\nvital for building trustworthy AI systems. Existing research predominantly\nfocuses on robustness to synthetic adversarial examples, crafted by adding\nimperceptible perturbations to clean input data. However, these synthetic\nadversarial examples do not accurately reflect the most challenging real-world\nscenarios, especially in the context of healthcare data. Consequently,\nrobustness to synthetic adversarial examples may not necessarily translate to\nrobustness against naturally occurring adversarial examples, which is highly\ndesirable for trustworthy AI. We propose a method to curate datasets comprised\nof natural adversarial examples to evaluate model robustness. The method relies\non probabilistic labels obtained from automated weakly-supervised labeling that\ncombines noisy and cheap-to-obtain labeling heuristics. Based on these labels,\nour method adversarially orders the input data and uses this ordering to\nconstruct a sequence of increasingly adversarial datasets. Our evaluation on\nsix medical case studies and three non-medical case studies demonstrates the\nefficacy and statistical validity of our approach to generating naturally\nadversarial datasets\n","authors":["Sydney Pugh","Ivan Ruchkin","Insup Lee","James Weimer"],"pdf_url":"https://arxiv.org/pdf/2309.00543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00530v1","updated":"2023-09-01T15:31:26Z","published":"2023-09-01T15:31:26Z","title":"Adaptive function approximation based on the Discrete Cosine Transform\n (DCT)","summary":" This paper studies the cosine as basis function for the approximation of\nunivariate and continuous functions without memory. This work studies a\nsupervised learning to obtain the approximation coefficients, instead of using\nthe Discrete Cosine Transform (DCT). 
Due to the finite dynamics and\northogonality of the cosine basis functions, simple gradient algorithms, such\nas the Normalized Least Mean Squares (NLMS), can benefit from it and present a\ncontrolled and predictable convergence time and error misadjustment. Due to its\nsimplicity, the proposed technique ranks as the best in terms of learning\nquality versus complexity, and it is presented as an attractive technique to be\nused in more complex supervised learning systems. Simulations illustrate the\nperformance of the approach. This paper celebrates the 50th anniversary of the\npublication of the DCT by Nasir Ahmed in 1973.\n","authors":["Ana I. Pérez-Neira","Marc Martinez-Gost","Miguel Ángel Lagunas"],"pdf_url":"https://arxiv.org/pdf/2309.00530v1.pdf","comment":"Accepted paper in 26th International Conference on Circuits, Systems,\n Communications and Computers (CSCC)"},{"id":"http://arxiv.org/abs/2205.01931v3","updated":"2023-09-01T15:26:29Z","published":"2022-05-04T08:06:55Z","title":"Mapping the landscape of histomorphological cancer phenotypes using\n self-supervised learning on unlabeled, unannotated pathology slides","summary":" Definitive cancer diagnosis and management depend upon the extraction of\ninformation from microscopy images by pathologists. These images contain\ncomplex information requiring time-consuming expert human interpretation that\nis prone to human bias. Supervised deep learning approaches have proven\npowerful for classification tasks, but they are inherently limited by the cost\nand quality of annotations used for training these models. To address this\nlimitation of supervised methods, we developed Histomorphological Phenotype\nLearning (HPL), a fully self-supervised methodology that requires no\nexpert labels or annotations and operates via the automatic discovery of\ndiscriminatory image features in small image tiles. Tiles are grouped into\nmorphologically similar clusters which constitute a library of\nhistomorphological phenotypes, revealing trajectories from benign to malignant\ntissue via inflammatory and reactive phenotypes. These clusters have distinct\nfeatures which can be identified using orthogonal methods, linking histologic,\nmolecular and clinical phenotypes. Applied to lung cancer tissues, we show that\nthey align closely with patient survival, with histopathologically recognised\ntumor types and growth patterns, and with transcriptomic measures of\nimmunophenotype. We then demonstrate that these properties are maintained in a\nmulti-cancer study. These results show the clusters represent recurrent host\nresponses and modes of tumor growth emerging under natural selection. Code,\npre-trained models, learned embeddings, and documentation are available to the\ncommunity at\nhttps://github.com/AdalbertoCq/Histomorphological-Phenotype-Learning\n","authors":["Adalberto Claudio Quiros","Nicolas Coudray","Anna Yeaton","Xinyu Yang","Bojing Liu","Hortense Le","Luis Chiriboga","Afreen Karimkhan","Navneet Narula","David A. Moore","Christopher Y. Park","Harvey Pass","Andre L. Moreira","John Le Quesne","Aristotelis Tsirigos","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2205.01931v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00520v1","updated":"2023-09-01T15:18:05Z","published":"2023-09-01T15:18:05Z","title":"Online Distributed Learning over Random Networks","summary":" The recent deployment of multi-agent systems in a wide range of scenarios has\nenabled the solution of learning problems in a distributed fashion. 
In this\ncontext, agents are tasked with collecting local data and then cooperatively\ntrain a model, without directly sharing the data. While distributed learning\noffers the advantage of preserving agents' privacy, it also poses several\nchallenges in terms of designing and analyzing suitable algorithms. This work\nfocuses specifically on the following challenges motivated by practical\nimplementation: (i) online learning, where the local data change over time;\n(ii) asynchronous agent computations; (iii) unreliable and limited\ncommunications; and (iv) inexact local computations. To tackle these\nchallenges, we introduce the Distributed Operator Theoretical (DOT) version of\nthe Alternating Direction Method of Multipliers (ADMM), which we call the\nDOT-ADMM Algorithm. We prove that it converges with a linear rate for a large\nclass of convex learning problems (e.g., linear and logistic regression\nproblems) toward a bounded neighborhood of the optimal time-varying solution,\nand characterize how the neighborhood depends on~$\\text{(i)--(iv)}$. We\ncorroborate the theoretical analysis with numerical simulations comparing the\nDOT-ADMM Algorithm with other state-of-the-art algorithms, showing that only\nthe proposed algorithm exhibits robustness to (i)--(iv).\n","authors":["Nicola Bastianello","Diego Deplano","Mauro Franceschelli","Karl H. Johansson"],"pdf_url":"https://arxiv.org/pdf/2309.00520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04866v3","updated":"2023-09-01T15:10:54Z","published":"2023-05-04T23:23:47Z","title":"Causal Policy Gradient for Whole-Body Mobile Manipulation","summary":" Developing the next generation of household robot helpers requires combining\nlocomotion and interaction capabilities, which is generally referred to as\nmobile manipulation (MoMa). MoMa tasks are difficult due to the large action\nspace of the robot and the common multi-objective nature of the task, e.g.,\nefficiently reaching a goal while avoiding obstacles. Current approaches often\nsegregate tasks into navigation without manipulation and stationary\nmanipulation without locomotion by manually matching parts of the action space\nto MoMa sub-objectives (e.g. base actions for locomotion objectives and arm\nactions for manipulation). This solution prevents simultaneous combinations of\nlocomotion and interaction degrees of freedom and requires human domain\nknowledge for both partitioning the action space and matching the action parts\nto the sub-objectives. In this paper, we introduce Causal MoMa, a new framework\nto train policies for typical MoMa tasks that makes use of the most favorable\nsubspace of the robot's action space to address each sub-objective. Causal MoMa\nautomatically discovers the causal dependencies between actions and terms of\nthe reward function and exploits these dependencies in a causal policy learning\nprocedure that reduces gradient variance compared to previous state-of-the-art\npolicy gradient algorithms, improving convergence and results. We evaluate the\nperformance of Causal MoMa on three types of simulated robots across different\nMoMa tasks and demonstrate success in transferring the policies trained in\nsimulation directly to a real robot, where our agent is able to follow moving\ngoals and react to dynamic obstacles while simultaneously and synergistically\ncontrolling the whole-body: base, arm, and head. 
More information at\nhttps://sites.google.com/view/causal-moma.\n","authors":["Jiaheng Hu","Peter Stone","Roberto Martín-Martín"],"pdf_url":"https://arxiv.org/pdf/2305.04866v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00508v1","updated":"2023-09-01T14:53:51Z","published":"2023-09-01T14:53:51Z","title":"Structure and Gradient Dynamics Near Global Minima of Two-layer Neural\n Networks","summary":" Under mild assumptions, we investigate the structure of loss landscape of\ntwo-layer neural networks near global minima, determine the set of parameters\nwhich give perfect generalization, and fully characterize the gradient flows\naround it. With novel techniques, our work uncovers some simple aspects of the\ncomplicated loss landscape and reveals how model, target function, samples and\ninitialization affect the training dynamics differently. Based on these\nresults, we also explain why (overparametrized) neural networks could\ngeneralize well.\n","authors":["Leyang Zhang","Yaoyu Zhang","Tao Luo"],"pdf_url":"https://arxiv.org/pdf/2309.00508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00498v1","updated":"2023-09-01T14:42:27Z","published":"2023-09-01T14:42:27Z","title":"Application of Deep Learning Methods in Monitoring and Optimization of\n Electric Power Systems","summary":" This PhD thesis thoroughly examines the utilization of deep learning\ntechniques as a means to advance the algorithms employed in the monitoring and\noptimization of electric power systems. The first major contribution of this\nthesis involves the application of graph neural networks to enhance power\nsystem state estimation. The second key aspect of this thesis focuses on\nutilizing reinforcement learning for dynamic distribution network\nreconfiguration. The effectiveness of the proposed methods is affirmed through\nextensive experimentation and simulations.\n","authors":["Ognjen Kundacina"],"pdf_url":"https://arxiv.org/pdf/2309.00498v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2309.00494v1","updated":"2023-09-01T14:40:25Z","published":"2023-09-01T14:40:25Z","title":"Multi-stage Deep Learning Artifact Reduction for Computed Tomography","summary":" In Computed Tomography (CT), an image of the interior structure of an object\nis computed from a set of acquired projection images. The quality of these\nreconstructed images is essential for accurate analysis, but this quality can\nbe degraded by a variety of imaging artifacts. To improve reconstruction\nquality, the acquired projection images are often processed by a pipeline\nconsisting of multiple artifact-removal steps applied in various image domains\n(e.g., outlier removal on projection images and denoising of reconstruction\nimages). These artifact-removal methods exploit the fact that certain artifacts\nare easier to remove in a certain domain compared with other domains.\n Recently, deep learning methods have shown promising results for artifact\nremoval for CT images. However, most existing deep learning methods for CT are\napplied as a post-processing method after reconstruction. Therefore, artifacts\nthat are relatively difficult to remove in the reconstruction domain may not be\neffectively removed by these methods. 
As an alternative, we propose a\nmulti-stage deep learning method for artifact removal, in which neural networks\nare applied to several domains, similar to a classical CT processing pipeline.\nWe show that the neural networks can be effectively trained in succession,\nresulting in easy-to-use and computationally efficient training. Experiments on\nboth simulated and real-world experimental datasets show that our method is\neffective in reducing artifacts and superior to deep learning-based\npost-processing.\n","authors":["Jiayang Shi","Daniel M. Pelt","K. Joost Batenburg"],"pdf_url":"https://arxiv.org/pdf/2309.00494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00489v1","updated":"2023-09-01T14:30:04Z","published":"2023-09-01T14:30:04Z","title":"How Does Forecasting Affect the Convergence of DRL Techniques in O-RAN\n Slicing?","summary":" The success of immersive applications such as virtual reality (VR) gaming and\nmetaverse services depends on low latency and reliable connectivity. To provide\nseamless user experiences, the open radio access network (O-RAN) architecture\nand 6G networks are expected to play a crucial role. RAN slicing, a critical\ncomponent of the O-RAN paradigm, enables network resources to be allocated\nbased on the needs of immersive services, creating multiple virtual networks on\na single physical infrastructure. In the O-RAN literature, deep reinforcement\nlearning (DRL) algorithms are commonly used to optimize resource allocation.\nHowever, the practical adoption of DRL in live deployments has been sluggish.\nThis is primarily due to the slow convergence and performance instabilities\nsuffered by the DRL agents both upon initial deployment and when there are\nsignificant changes in network conditions. In this paper, we investigate the\nimpact of time series forecasting of traffic demands on the convergence of the\nDRL-based slicing agents. For that, we conduct an exhaustive experiment that\nsupports multiple services including real VR gaming traffic. We then propose a\nnovel forecasting-aided DRL approach and its respective O-RAN practical\ndeployment workflow to enhance DRL convergence. Our approach shows up to 22.8%,\n86.3%, and 300% improvements in the average initial reward value, convergence\nrate, and number of converged scenarios respectively, enhancing the\ngeneralizability of the DRL agents compared with the implemented baselines. The\nresults also indicate that our approach is robust against forecasting errors\nand that forecasting models do not have to be ideal.\n","authors":["Ahmad M. Nagib","Hatem Abou-Zeid","Hossam S. Hassanein"],"pdf_url":"https://arxiv.org/pdf/2309.00489v1.pdf","comment":"This article has been accepted for presentation in IEEE GLOBECOM 2023"},{"id":"http://arxiv.org/abs/2309.00483v1","updated":"2023-09-01T14:20:48Z","published":"2023-09-01T14:20:48Z","title":"Geometry-aware Line Graph Transformer Pre-training for Molecular\n Property Prediction","summary":" Molecular property prediction with deep learning has gained much attention\nover the past years. Owing to the scarcity of labeled molecules, there has been\ngrowing interest in self-supervised learning methods that learn generalizable\nmolecular representations from unlabeled data. Molecules are typically treated\nas 2D topological graphs in modeling, but it has been discovered that their 3D\ngeometry is of great importance in determining molecular functionalities. 
In\nthis paper, we propose the Geometry-aware line graph transformer (Galformer)\npre-training, a novel self-supervised learning framework that aims to enhance\nmolecular representation learning with 2D and 3D modalities. Specifically, we\nfirst design a dual-modality line graph transformer backbone to encode the\ntopological and geometric information of a molecule. The designed backbone\nincorporates effective structural encodings to capture graph structures from\nboth modalities. Then we devise two complementary pre-training tasks at the\ninter and intra-modality levels. These tasks provide properly supervised\ninformation and extract discriminative 2D and 3D knowledge from unlabeled\nmolecules. Finally, we evaluate Galformer against six state-of-the-art\nbaselines on twelve property prediction benchmarks via downstream fine-tuning.\nExperimental results show that Galformer consistently outperforms all baselines\non both classification and regression tasks, demonstrating its effectiveness.\n","authors":["Peizhen Bai","Xianyuan Liu","Haiping Lu"],"pdf_url":"https://arxiv.org/pdf/2309.00483v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.06961v2","updated":"2023-09-01T14:09:44Z","published":"2023-08-14T06:32:52Z","title":"Graph Structural Residuals: A Learning Approach to Diagnosis","summary":" Traditional model-based diagnosis relies on constructing explicit system\nmodels, a process that can be laborious and expertise-demanding. In this paper,\nwe propose a novel framework that combines concepts of model-based diagnosis\nwith deep graph structure learning. This data-driven approach leverages data to\nlearn the system's underlying structure and provide dynamic observations,\nrepresented by two distinct graph adjacency matrices. Our work facilitates a\nseamless integration of graph structure learning with model-based diagnosis by\nmaking three main contributions: (i) redefining the constructs of system\nrepresentation, observations, and faults (ii) introducing two distinct versions\nof a self-supervised graph structure learning model architecture and (iii)\ndemonstrating the potential of our data-driven diagnostic method through\nexperiments on a system of coupled oscillators.\n","authors":["Jan Lukas Augustin","Oliver Niggemann"],"pdf_url":"https://arxiv.org/pdf/2308.06961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14040v2","updated":"2023-09-01T14:00:39Z","published":"2023-03-24T14:51:06Z","title":"Euler Characteristic Tools For Topological Data Analysis","summary":" In this article, we study Euler characteristic techniques in topological data\nanalysis. Pointwise computing the Euler characteristic of a family of\nsimplicial complexes built from data gives rise to the so-called Euler\ncharacteristic profile. We show that this simple descriptor achieve\nstate-of-the-art performance in supervised tasks at a very low computational\ncost. Inspired by signal analysis, we compute hybrid transforms of Euler\ncharacteristic profiles. These integral transforms mix Euler characteristic\ntechniques with Lebesgue integration to provide highly efficient compressors of\ntopological signals. As a consequence, they show remarkable performances in\nunsupervised settings. On the qualitative side, we provide numerous heuristics\non the topological and geometric information captured by Euler profiles and\ntheir hybrid transforms. 
Finally, we prove stability results for these\ndescriptors as well as asymptotic guarantees in random settings.\n","authors":["Olympio Hacquard","Vadim Lebovici"],"pdf_url":"https://arxiv.org/pdf/2303.14040v2.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2309.00462v1","updated":"2023-09-01T13:53:33Z","published":"2023-09-01T13:53:33Z","title":"New metrics for analyzing continual learners","summary":" Deep neural networks have shown remarkable performance when trained on\nindependent and identically distributed data from a fixed set of classes.\nHowever, in real-world scenarios, it can be desirable to train models on a\ncontinuous stream of data where multiple classification tasks are presented\nsequentially. This scenario, known as Continual Learning (CL) poses challenges\nto standard learning algorithms which struggle to maintain knowledge of old\ntasks while learning new ones. This stability-plasticity dilemma remains\ncentral to CL and multiple metrics have been proposed to adequately measure\nstability and plasticity separately. However, none considers the increasing\ndifficulty of the classification task, which inherently results in performance\nloss for any model. In that sense, we analyze some limitations of current\nmetrics and identify the presence of setup-induced forgetting. Therefore, we\npropose new metrics that account for the task's increasing difficulty. Through\nexperiments on benchmark datasets, we demonstrate that our proposed metrics can\nprovide new insights into the stability-plasticity trade-off achieved by models\nin the continual learning environment.\n","authors":["Nicolas Michel","Giovanni Chierchia","Romain Negrel","Jean-François Bercher","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2309.00462v1.pdf","comment":"6 pages, presented at MIRU 2023"},{"id":"http://arxiv.org/abs/2003.08429v4","updated":"2023-09-01T13:25:14Z","published":"2020-03-18T18:40:52Z","title":"STEm-Seg: Spatio-temporal Embeddings for Instance Segmentation in Videos","summary":" Existing methods for instance segmentation in videos typically involve\nmulti-stage pipelines that follow the tracking-by-detection paradigm and model\na video clip as a sequence of images. Multiple networks are used to detect\nobjects in individual frames, and then associate these detections over time.\nHence, these methods are often non-end-to-end trainable and highly tailored to\nspecific tasks. In this paper, we propose a different approach that is\nwell-suited to a variety of tasks involving instance segmentation in videos. In\nparticular, we model a video clip as a single 3D spatio-temporal volume, and\npropose a novel approach that segments and tracks instances across space and\ntime in a single stage. Our problem formulation is centered around the idea of\nspatio-temporal embeddings which are trained to cluster pixels belonging to a\nspecific object instance over an entire video clip. To this end, we introduce\n(i) novel mixing functions that enhance the feature representation of\nspatio-temporal embeddings, and (ii) a single-stage, proposal-free network that\ncan reason about temporal context. Our network is trained end-to-end to learn\nspatio-temporal embeddings as well as parameters required to cluster these\nembeddings, thus simplifying inference. Our method achieves state-of-the-art\nresults across multiple datasets and tasks. 
Code and models are available at\nhttps://github.com/sabarim/STEm-Seg.\n","authors":["Ali Athar","Sabarinath Mahadevan","Aljoša Ošep","Laura Leal-Taixé","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2003.08429v4.pdf","comment":"ECCV 2020, 28 pages, 6 figures"},{"id":"http://arxiv.org/abs/2304.05257v2","updated":"2023-09-01T13:09:15Z","published":"2023-04-11T14:46:38Z","title":"Multi-granulariy Time-based Transformer for Knowledge Tracing","summary":" In this paper, we present a transformer architecture for predicting student\nperformance on standardized tests. Specifically, we leverage students'\nhistorical data, including their past test scores, study habits, and other\nrelevant information, to create a personalized model for each student. We then\nuse these models to predict their future performance on a given test. Applying\nthis model to the RIIID dataset, we demonstrate that using multiple\ngranularities for temporal features as the decoder input significantly improves\nmodel performance. Our results also show the effectiveness of our approach,\nwith substantial improvements over the LightGBM method. Our work contributes to\nthe growing field of AI in education, providing a scalable and accurate tool\nfor predicting student outcomes.\n","authors":["Tong Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.05257v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16805v2","updated":"2023-09-01T12:53:44Z","published":"2023-06-29T09:35:53Z","title":"CLIPAG: Towards Generator-Free Text-to-Image Generation","summary":" Perceptually Aligned Gradients (PAG) refer to an intriguing property observed\nin robust image classification models, wherein their input gradients align with\nhuman perception and pose semantic meanings. While this phenomenon has gained\nsignificant research attention, it was solely studied in the context of\nunimodal vision-only architectures. In this work, we extend the study of PAG to\nVision-Language architectures, which form the foundations for diverse\nimage-text tasks and applications. Through an adversarial robustification\nfinetuning of CLIP, we demonstrate that robust Vision-Language models exhibit\nPAG in contrast to their vanilla counterparts. This work reveals the merits of\nCLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we\nshow that seamlessly integrating CLIPAG in a \"plug-n-play\" manner leads to\nsubstantial improvements in vision-language generative applications.\nFurthermore, leveraging its PAG property, CLIPAG enables text-to-image\ngeneration without any generative model, which typically requires huge\ngenerators.\n","authors":["Roy Ganz","Michael Elad"],"pdf_url":"https://arxiv.org/pdf/2306.16805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16259v2","updated":"2023-09-01T12:40:29Z","published":"2023-08-30T18:34:55Z","title":"Materials Informatics Transformer: A Language Model for Interpretable\n Materials Properties Prediction","summary":" Recently, the remarkable capabilities of large language models (LLMs) have\nbeen illustrated across a variety of research domains such as natural language\nprocessing, computer vision, and molecular modeling. We extend this paradigm by\nutilizing LLMs for material property prediction by introducing our model\nMaterials Informatics Transformer (MatInFormer). Specifically, we introduce a\nnovel approach that involves learning the grammar of crystallography through\nthe tokenization of pertinent space group information. 
We further illustrate\nthe adaptability of MatInFormer by incorporating task-specific data pertaining\nto Metal-Organic Frameworks (MOFs). Through attention visualization, we uncover\nthe key features that the model prioritizes during property prediction. The\neffectiveness of our proposed model is empirically validated across 14 distinct\ndatasets, hereby underscoring its potential for high throughput screening\nthrough accurate material property prediction.\n","authors":["Hongshuo Huang","Rishikesh Magar","Changwen Xu","Amir Barati Farimani"],"pdf_url":"https://arxiv.org/pdf/2308.16259v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00428v1","updated":"2023-09-01T12:40:17Z","published":"2023-09-01T12:40:17Z","title":"A Locality-based Neural Solver for Optical Motion Capture","summary":" We present a novel locality-based learning method for cleaning and solving\noptical motion capture data. Given noisy marker data, we propose a new\nheterogeneous graph neural network which treats markers and joints as different\ntypes of nodes, and uses graph convolution operations to extract the local\nfeatures of markers and joints and transform them to clean motions. To deal\nwith anomaly markers (e.g. occluded or with big tracking errors), the key\ninsight is that a marker's motion shows strong correlations with the motions of\nits immediate neighboring markers but less so with other markers, a.k.a.\nlocality, which enables us to efficiently fill missing markers (e.g. due to\nocclusion). Additionally, we also identify marker outliers due to tracking\nerrors by investigating their acceleration profiles. Finally, we propose a\ntraining regime based on representation learning and data augmentation, by\ntraining the model on data with masking. The masking schemes aim to mimic the\noccluded and noisy markers often observed in the real data. Finally, we show\nthat our method achieves high accuracy on multiple metrics across various\ndatasets. Extensive comparison shows our method outperforms state-of-the-art\nmethods in terms of prediction accuracy of occluded marker position error by\napproximately 20%, which leads to a further error reduction on the\nreconstructed joint rotations and positions by 30%. The code and data for this\npaper are available at https://github.com/non-void/LocalMoCap.\n","authors":["Xiaoyu Pan","Bowen Zheng","Xinwei Jiang","Guanglong Xu","Xianli Gu","Jingxiang Li","Qilong Kou","He Wang","Tianjia Shao","Kun Zhou","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2309.00428v1.pdf","comment":"Siggraph Asia 2023 Conference Paper"},{"id":"http://arxiv.org/abs/2309.00422v1","updated":"2023-09-01T12:31:39Z","published":"2023-09-01T12:31:39Z","title":"Declarative Reasoning on Explanations Using Constraint Logic Programming","summary":" Explaining opaque Machine Learning (ML) models is an increasingly relevant\nproblem. Current explanation in AI (XAI) methods suffer several shortcomings,\namong others an insufficient incorporation of background knowledge, and a lack\nof abstraction and interactivity with the user. We propose REASONX, an\nexplanation method based on Constraint Logic Programming (CLP). REASONX can\nprovide declarative, interactive explanations for decision trees, which can be\nthe ML models under analysis or global/local surrogate models of any black-box\nmodel. 
Users can express background or common sense knowledge using linear\nconstraints and MILP optimization over features of factual and contrastive\ninstances, and interact with the answer constraints at different levels of\nabstraction through constraint projection. We present here the architecture of\nREASONX, which consists of a Python layer, closer to the user, and a CLP layer.\nREASONX's core execution engine is a Prolog meta-program with declarative\nsemantics in terms of logic theories.\n","authors":["Laura State","Salvatore Ruggieri","Franco Turini"],"pdf_url":"https://arxiv.org/pdf/2309.00422v1.pdf","comment":"European Conference on Logics in Artificial Intelligence (JELIA 2023)"},{"id":"http://arxiv.org/abs/2309.00417v1","updated":"2023-09-01T12:20:30Z","published":"2023-09-01T12:20:30Z","title":"Area-norm COBRA on Conditional Survival Prediction","summary":" The paper explores a different variation of combined regression strategy to\ncalculate the conditional survival function. We use regression based weak\nlearners to create the proposed ensemble technique. The proposed combined\nregression strategy uses proximity measure as area between two survival curves.\nThe proposed model shows a construction which ensures that it performs better\nthan the Random Survival Forest. The paper discusses a novel technique to\nselect the most important variable in the combined regression setup. We perform\na simulation study to show that our proposition for finding relevance of the\nvariables works quite well. We also use three real-life datasets to illustrate\nthe model.\n","authors":["Rahul Goswami","Arabin Kr. Dey"],"pdf_url":"https://arxiv.org/pdf/2309.00417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00416v1","updated":"2023-09-01T12:20:19Z","published":"2023-09-01T12:20:19Z","title":"Advancing Personalized Federated Learning: Group Privacy, Fairness, and\n Beyond","summary":" Federated learning (FL) is a framework for training machine learning models\nin a distributed and collaborative manner. During training, a set of\nparticipating clients process their data stored locally, sharing only the model\nupdates obtained by minimizing a cost function over their local inputs. FL was\nproposed as a stepping-stone towards privacy-preserving machine learning, but\nit has been shown vulnerable to issues such as leakage of private information,\nlack of personalization of the model, and the possibility of having a trained\nmodel that is fairer to some groups than to others. In this paper, we address\nthe triadic interaction among personalization, privacy guarantees, and fairness\nattained by models trained within the FL framework. Differential privacy and\nits variants have been studied and applied as cutting-edge standards for\nproviding formal privacy guarantees. However, clients in FL often hold very\ndiverse datasets representing heterogeneous communities, making it important to\nprotect their sensitive information while still ensuring that the trained model\nupholds the aspect of fairness for the users. To attain this objective, a\nmethod is put forth that introduces group privacy assurances through the\nutilization of $d$-privacy (aka metric privacy). 
$d$-privacy represents a\nlocalized form of differential privacy that relies on a metric-oriented\nobfuscation approach to maintain the original data's topological distribution.\nThis method, besides enabling personalized model training in a federated\napproach and providing formal privacy guarantees, possesses significantly\nbetter group fairness measured under a variety of standard metrics than a\nglobal model trained within a classical FL template. Theoretical justifications\nfor the applicability are provided, as well as experimental validation on\nreal-world datasets to illustrate the working of the proposed method.\n","authors":["Filippo Galli","Kangsoo Jung","Sayan Biswas","Catuscia Palamidessi","Tommaso Cucinotta"],"pdf_url":"https://arxiv.org/pdf/2309.00416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00410v1","updated":"2023-09-01T12:07:40Z","published":"2023-09-01T12:07:40Z","title":"Selective Scene Text Removal","summary":" Scene text removal (STR) is the image transformation task to remove text\nregions in scene images. The conventional STR methods remove all scene text.\nThis means that the existing methods cannot select text to be removed. In this\npaper, we propose a novel task setting named selective scene text removal\n(SSTR) that removes only target words specified by the user. Although SSTR is a\nmore complex task than STR, the proposed multi-module structure enables\nefficient training for SSTR. Experimental results show that the proposed method\ncan remove target words as expected.\n","authors":["Hayato Mitani","Akisato Kimura","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2309.00410v1.pdf","comment":"12 pages, 8 figures, Accepted at the 34th British Machine Vision\n Conference"},{"id":"http://arxiv.org/abs/2308.14705v2","updated":"2023-09-01T11:38:56Z","published":"2023-08-28T16:58:44Z","title":"Diversified Ensemble of Independent Sub-Networks for Robust\n Self-Supervised Representation Learning","summary":" Ensembling a neural network is a widely recognized approach to enhance model\nperformance, estimate uncertainty, and improve robustness in deep supervised\nlearning. However, deep ensembles often come with high computational costs and\nmemory demands. In addition, the efficiency of a deep ensemble is related to\ndiversity among the ensemble members which is challenging for large,\nover-parameterized deep neural networks. Moreover, ensemble learning has not\nyet seen such widespread adoption, and it remains a challenging endeavor for\nself-supervised or unsupervised representation learning. Motivated by these\nchallenges, we present a novel self-supervised training regime that leverages\nan ensemble of independent sub-networks, complemented by a new loss function\ndesigned to encourage diversity. Our method efficiently builds a sub-model\nensemble with high diversity, leading to well-calibrated estimates of model\nuncertainty, all achieved with minimal computational overhead compared to\ntraditional deep self-supervised ensembles. To evaluate the effectiveness of\nour approach, we conducted extensive experiments across various tasks,\nincluding in-distribution generalization, out-of-distribution detection,\ndataset corruption, and semi-supervised settings. The results demonstrate that\nour method significantly improves prediction reliability. 
Our approach not only\nachieves excellent accuracy but also enhances calibration, surpassing baseline\nperformance across a wide range of self-supervised architectures in computer\nvision, natural language processing, and genomics data.\n","authors":["Amirhossein Vahidi","Lisa Wimmer","Hüseyin Anil Gündüz","Bernd Bischl","Eyke Hüllermeier","Mina Rezaei"],"pdf_url":"https://arxiv.org/pdf/2308.14705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2003.06307v2","updated":"2023-09-01T11:18:38Z","published":"2020-03-10T05:42:44Z","title":"Communication-Efficient Distributed Deep Learning: A Comprehensive\n Survey","summary":" Distributed deep learning (DL) has become prevalent in recent years to reduce\ntraining time by leveraging multiple computing devices (e.g., GPUs/TPUs) due to\nlarger models and datasets. However, system scalability is limited by\ncommunication becoming the performance bottleneck. Addressing this\ncommunication issue has become a prominent research topic. In this paper, we\nprovide a comprehensive survey of the communication-efficient distributed\ntraining algorithms, focusing on both system-level and algorithmic-level\noptimizations. We first propose a taxonomy of data-parallel distributed\ntraining algorithms that incorporates four primary dimensions: communication\nsynchronization, system architectures, compression techniques, and parallelism\nof communication and computing tasks. We then investigate state-of-the-art\nstudies that address problems in these four dimensions. We also compare the\nconvergence rates of different algorithms to understand their convergence\nspeed. Additionally, we conduct extensive experiments to empirically compare\nthe convergence performance of various mainstream distributed training\nalgorithms. Based on our system-level communication cost analysis, theoretical\nand experimental convergence speed comparison, we provide readers with an\nunderstanding of which algorithms are more efficient under specific distributed\nenvironments. Our research also extrapolates potential directions for further\noptimizations.\n","authors":["Zhenheng Tang","Shaohuai Shi","Wei Wang","Bo Li","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2003.06307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.01078v2","updated":"2023-09-01T10:49:08Z","published":"2021-01-04T16:38:00Z","title":"Topology-aware Tensor Decomposition for Meta-graph Learning","summary":" Heterogeneous graphs generally refer to graphs with different types of nodes\nand edges. A common approach for extracting useful information from\nheterogeneous graphs is to use meta-graphs, which can be seen as a special kind\nof directed acyclic graph (DAG) with the same node and edge types as the\nheterogeneous graph. However, how to design proper meta-graphs is challenging.\nRecently, there have been many works on learning suitable meta-graphs from a\nheterogeneous graph. Existing methods generally introduce continuous weights\nfor edges that are independent of each other, which ignores the topological\nstructure of meta-graphs and can be ineffective. To address this issue, we\npropose a new tensor-based viewpoint on learning meta-graphs. Such a viewpoint\nnot only helps interpret the limitation of existing works by CANDECOMP/PARAFAC\n(CP) decomposition, but also inspires us to propose a topology-aware tensor\ndecomposition, called TENSUS, that reflects the structure of DAGs. 
The proposed\ntopology-aware tensor decomposition is easy to use and simple to implement, and\nit can be taken as a plug-in part to upgrade many existing works, including\nnode classification and recommendation on heterogeneous graphs. Experimental\nresults on different tasks demonstrate that the proposed method can\nsignificantly improve the state-of-the-arts for all these tasks.\n","authors":["Hansi Yang","Peiyu Zhang","Quanming Yao"],"pdf_url":"https://arxiv.org/pdf/2101.01078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00380v1","updated":"2023-09-01T10:32:21Z","published":"2023-09-01T10:32:21Z","title":"Learning multi-modal generative models with permutation-invariant\n encoders and tighter variational bounds","summary":" Devising deep latent variable models for multi-modal data has been a\nlong-standing theme in machine learning research. Multi-modal Variational\nAutoencoders (VAEs) have been a popular generative model class that learns\nlatent representations which jointly explain multiple modalities. Various\nobjective functions for such models have been suggested, often motivated as\nlower bounds on the multi-modal data log-likelihood or from\ninformation-theoretic considerations. In order to encode latent variables from\ndifferent modality subsets, Product-of-Experts (PoE) or Mixture-of-Experts\n(MoE) aggregation schemes have been routinely used and shown to yield different\ntrade-offs, for instance, regarding their generative quality or consistency\nacross multiple modalities. In this work, we consider a variational bound that\ncan tightly lower bound the data log-likelihood. We develop more flexible\naggregation schemes that generalise PoE or MoE approaches by combining encoded\nfeatures from different modalities based on permutation-invariant neural\nnetworks. Our numerical experiments illustrate trade-offs for multi-modal\nvariational bounds and various aggregation schemes. We show that tighter\nvariational bounds and more flexible aggregation models can become beneficial\nwhen one wants to approximate the true joint distribution over observed\nmodalities and latent variables in identifiable models.\n","authors":["Marcel Hirt","Domenico Campolo","Victoria Leong","Juan-Pablo Ortega"],"pdf_url":"https://arxiv.org/pdf/2309.00380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00379v1","updated":"2023-09-01T10:30:48Z","published":"2023-09-01T10:30:48Z","title":"Anomaly detection with semi-supervised classification based on risk\n estimators","summary":" A significant limitation of one-class classification anomaly detection\nmethods is their reliance on the assumption that unlabeled training data only\ncontains normal instances. To overcome this impractical assumption, we propose\ntwo novel classification-based anomaly detection methods. Firstly, we introduce\na semi-supervised shallow anomaly detection method based on an unbiased risk\nestimator. Secondly, we present a semi-supervised deep anomaly detection method\nutilizing a nonnegative (biased) risk estimator. We establish estimation error\nbounds and excess risk bounds for both risk minimizers. Additionally, we\npropose techniques to select appropriate regularization parameters that ensure\nthe nonnegativity of the empirical risk in the shallow model under specific\nloss functions. 
Our extensive experiments provide strong evidence of the\neffectiveness of the risk-based anomaly detection methods.\n","authors":["Le Thi Khanh Hien","Sukanya Patra","Souhaib Ben Taieb"],"pdf_url":"https://arxiv.org/pdf/2309.00379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10328v2","updated":"2023-09-01T10:16:59Z","published":"2023-08-20T17:52:02Z","title":"A Comprehensive Empirical Evaluation on Online Continual Learning","summary":" Online continual learning aims to get closer to a live learning experience by\nlearning directly on a stream of data with temporally shifting distribution and\nby storing a minimum amount of data from that stream. In this empirical\nevaluation, we evaluate various methods from the literature that tackle online\ncontinual learning. More specifically, we focus on the class-incremental\nsetting in the context of image classification, where the learner must learn\nnew classes incrementally from a stream of data. We compare these methods on\nthe Split-CIFAR100 and Split-TinyImagenet benchmarks, and measure their average\naccuracy, forgetting, stability, and quality of the representations, to\nevaluate various aspects of the algorithm at the end but also during the whole\ntraining period. We find that most methods suffer from stability and\nunderfitting issues. However, the learned representations are comparable to\ni.i.d. training under the same computational budget. No clear winner emerges\nfrom the results and basic experience replay, when properly tuned and\nimplemented, is a very strong baseline. We release our modular and extensible\ncodebase at https://github.com/AlbinSou/ocl_survey based on the avalanche\nframework to reproduce our results and encourage future research.\n","authors":["Albin Soutif--Cormerais","Antonio Carta","Andrea Cossu","Julio Hurtado","Hamed Hemati","Vincenzo Lomonaco","Joost Van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2308.10328v2.pdf","comment":"ICCV Visual Continual Learning Workshop 2023 accepted paper"},{"id":"http://arxiv.org/abs/2308.16891v2","updated":"2023-09-01T10:00:11Z","published":"2023-08-31T17:52:10Z","title":"GNFactor: Multi-Task Real Robot Learning with Generalizable Neural\n Feature Fields","summary":" It is a long-standing problem in robotics to develop agents capable of\nexecuting diverse manipulation tasks from visual observations in unstructured\nreal-world environments. To achieve this goal, the robot needs to have a\ncomprehensive understanding of the 3D structure and semantics of the scene. In\nthis work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for\nmulti-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural\nfeature $\\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural\nfield (GNF) as a reconstruction module and a Perceiver Transformer as a\ndecision-making module, leveraging a shared deep 3D voxel representation. To\nincorporate semantics in 3D, the reconstruction module utilizes a\nvision-language foundation model ($\\textit{e.g.}$, Stable Diffusion) to distill\nrich semantic information into the deep 3D voxel. We evaluate GNFactor on 3\nreal robot tasks and perform detailed ablations on 10 RLBench tasks with a\nlimited number of demonstrations. We observe a substantial improvement of\nGNFactor over current state-of-the-art methods in seen and unseen tasks,\ndemonstrating the strong generalization ability of GNFactor. 
Our project\nwebsite is https://yanjieze.com/GNFactor/ .\n","authors":["Yanjie Ze","Ge Yan","Yueh-Hua Wu","Annabella Macaluso","Yuying Ge","Jianglong Ye","Nicklas Hansen","Li Erran Li","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16891v2.pdf","comment":"CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/"},{"id":"http://arxiv.org/abs/2207.00026v4","updated":"2023-09-01T09:58:15Z","published":"2022-06-30T18:00:04Z","title":"LaserMix for Semi-Supervised LiDAR Semantic Segmentation","summary":" Densely annotating LiDAR point clouds is costly, which restrains the\nscalability of fully-supervised learning methods. In this work, we study the\nunderexplored semi-supervised learning (SSL) in LiDAR segmentation. Our core\nidea is to leverage the strong spatial cues of LiDAR point clouds to better\nexploit unlabeled data. We propose LaserMix to mix laser beams from different\nLiDAR scans, and then encourage the model to make consistent and confident\npredictions before and after mixing. Our framework has three appealing\nproperties: 1) Generic: LaserMix is agnostic to LiDAR representations (e.g.,\nrange view and voxel), and hence our SSL framework can be universally applied.\n2) Statistically grounded: We provide a detailed analysis to theoretically\nexplain the applicability of the proposed framework. 3) Effective:\nComprehensive experimental analysis on popular LiDAR segmentation datasets\n(nuScenes, SemanticKITTI, and ScribbleKITTI) demonstrates our effectiveness and\nsuperiority. Notably, we achieve competitive results over fully-supervised\ncounterparts with 2x to 5x fewer labels and improve the supervised-only\nbaseline significantly by 10.8% on average. We hope this concise yet\nhigh-performing framework could facilitate future research in semi-supervised\nLiDAR segmentation. Code is publicly available.\n","authors":["Lingdong Kong","Jiawei Ren","Liang Pan","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2207.00026v4.pdf","comment":"CVPR 2023 (Highlight); 27 pages, 11 figures, 12 tables; Code at\n https://github.com/ldkong1205/LaserMix"},{"id":"http://arxiv.org/abs/2309.00367v1","updated":"2023-09-01T09:47:33Z","published":"2023-09-01T09:47:33Z","title":"Where Did the Gap Go? Reassessing the Long-Range Graph Benchmark","summary":" The recent Long-Range Graph Benchmark (LRGB, Dwivedi et al. 2022) introduced\na set of graph learning tasks strongly dependent on long-range interaction\nbetween vertices. Empirical evidence suggests that on these tasks Graph\nTransformers significantly outperform Message Passing GNNs (MPGNNs). In this\npaper, we carefully reevaluate multiple MPGNN baselines as well as the Graph\nTransformer GPS (Ramp\\'a\\v{s}ek et al. 2022) on LRGB. Through a rigorous\nempirical analysis, we demonstrate that the reported performance gap is\noverestimated due to suboptimal hyperparameter choices. It is noteworthy that\nacross multiple datasets the performance gap completely vanishes after basic\nhyperparameter optimization. In addition, we discuss the impact of lacking\nfeature normalization for LRGB's vision datasets and highlight a spurious\nimplementation of LRGB's link prediction metric. 
The principal aim of our paper\nis to establish a higher standard of empirical rigor within the graph machine\nlearning community.\n","authors":["Jan Tönshoff","Martin Ritzert","Eran Rosenbluth","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2309.00367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00363v1","updated":"2023-09-01T09:40:36Z","published":"2023-09-01T09:40:36Z","title":"FederatedScope-LLM: A Comprehensive Package for Fine-tuning Large\n Language Models in Federated Learning","summary":" LLMs have demonstrated great capabilities in various NLP tasks. Different\nentities can further improve the performance of those LLMs on their specific\ndownstream tasks by fine-tuning LLMs. When several entities have similar\ninterested tasks, but their data cannot be shared because of privacy concerns\nregulations, federated learning (FL) is a mainstream solution to leverage the\ndata of different entities. However, fine-tuning LLMs in federated learning\nsettings still lacks adequate support from existing FL frameworks because it\nhas to deal with optimizing the consumption of significant communication and\ncomputational resources, data preparation for different tasks, and distinct\ninformation protection demands. This paper first discusses these challenges of\nfederated fine-tuning LLMs, and introduces our package FS-LLM as a main\ncontribution, which consists of the following components: (1) we build an\nend-to-end benchmarking pipeline, automizing the processes of dataset\npreprocessing, federated fine-tuning execution, and performance evaluation on\nfederated LLM fine-tuning; (2) we provide comprehensive federated\nparameter-efficient fine-tuning algorithm implementations and versatile\nprogramming interfaces for future extension in FL scenarios with low\ncommunication and computation costs, even without accessing the full model; (3)\nwe adopt several accelerating and resource-efficient operators for fine-tuning\nLLMs with limited resources and the flexible pluggable sub-routines for\ninterdisciplinary study. We conduct extensive experiments to validate the\neffectiveness of FS-LLM and benchmark advanced LLMs with state-of-the-art\nparameter-efficient fine-tuning algorithms in FL settings, which also yields\nvaluable insights into federated fine-tuning LLMs for the research community.\nTo facilitate further research and adoption, we release FS-LLM at\nhttps://github.com/alibaba/FederatedScope/tree/llm.\n","authors":["Weirui Kuang","Bingchen Qian","Zitao Li","Daoyuan Chen","Dawei Gao","Xuchen Pan","Yuexiang Xie","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.00363v1.pdf","comment":"Source code: https://github.com/alibaba/FederatedScope/tree/llm"},{"id":"http://arxiv.org/abs/2211.04088v3","updated":"2023-09-01T09:37:06Z","published":"2022-11-08T08:39:30Z","title":"A Penalty-Based Method for Communication-Efficient Decentralized Bilevel\n Programming","summary":" Bilevel programming has recently received attention in the literature, due to\nits wide range of applications, including reinforcement learning and\nhyper-parameter optimization. However, it is widely assumed that the underlying\nbilevel optimization problem is solved either by a single machine or in the\ncase of multiple machines connected in a star-shaped network, i.e., federated\nlearning setting. 
The latter approach suffers from a high communication cost on\nthe central node (e.g., parameter server) and exhibits privacy vulnerabilities.\nHence, it is of interest to develop methods that solve bilevel optimization\nproblems in a communication-efficient decentralized manner. To that end, this\npaper introduces a penalty function based decentralized algorithm with\ntheoretical guarantees for this class of optimization problems. Specifically, a\ndistributed alternating gradient-type algorithm for solving consensus bilevel\nprogramming over a decentralized network is developed. A key feature of the\nproposed algorithm is to estimate the hyper-gradient of the penalty function\nvia decentralized computation of matrix-vector products and few vector\ncommunications, which is then integrated within an alternating algorithm to\nobtain finite-time convergence analysis under different convexity assumptions.\nOur theoretical result highlights improvements in the iteration complexity of\ndecentralized bilevel optimization, all while making efficient use of vector\ncommunication. Empirical results on both synthetic and real datasets\ndemonstrate that the proposed method performs well in real-world settings.\n","authors":["Parvin Nazari","Ahmad Mousavi","Davoud Ataee Tarzanagh","George Michailidis"],"pdf_url":"https://arxiv.org/pdf/2211.04088v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00356v1","updated":"2023-09-01T09:22:33Z","published":"2023-09-01T09:22:33Z","title":"Explainable Active Learning for Preference Elicitation","summary":" Gaining insights into the preferences of new users and subsequently\npersonalizing recommendations necessitate managing user interactions\nintelligently, namely, posing pertinent questions to elicit valuable\ninformation effectively. In this study, our focus is on a specific scenario of\nthe cold-start problem, where the recommendation system lacks adequate user\npresence or access to other users' data is restricted, obstructing employing\nuser profiling methods utilizing existing data in the system. We employ Active\nLearning (AL) to solve the addressed problem with the objective of maximizing\ninformation acquisition with minimal user effort. AL operates for selecting\ninformative data from a large unlabeled set to inquire an oracle to label them\nand eventually updating a machine learning (ML) model. We operate AL in an\nintegrated process of unsupervised, semi-supervised, and supervised ML within\nan explanatory preference elicitation process. It harvests user feedback (given\nfor the system's explanations on the presented items) over informative samples\nto update an underlying ML model estimating user preferences. The designed user\ninteraction facilitates personalizing the system by incorporating user feedback\ninto the ML model and also enhances user trust by refining the system's\nexplanations on recommendations. We implement the proposed preference\nelicitation methodology for food recommendation. We conducted human experiments\nto assess its efficacy in the short term and also experimented with several AL\nstrategies over synthetic user profiles that we created for two food datasets,\naiming for long-term performance analysis. 
The experimental results demonstrate\nthe efficiency of the proposed preference elicitation with limited user-labeled\ndata while also enhancing user trust through accurate explanations.\n","authors":["Furkan Cantürk","Reyhan Aydoğan"],"pdf_url":"https://arxiv.org/pdf/2309.00356v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2210.15448v2","updated":"2023-09-01T09:18:17Z","published":"2022-10-19T15:04:44Z","title":"Neural Augmented Kalman Filtering with Bollinger Bands for Pairs Trading","summary":" Pairs trading is a family of trading techniques that determine their policies\nbased on monitoring the relationships between pairs of assets. A common pairs\ntrading approach relies on describing the pair-wise relationship as a linear\nState Space (SS) model with Gaussian noise. This representation facilitates\nextracting financial indicators with low complexity and latency using a Kalman\nFilter (KF), which are then processed using classic policies such as Bollinger\nBands (BB). However, such SS models are inherently approximated and mismatched,\noften degrading the revenue. In this work, we propose KalmanNet-aided Bollinger\nBands Pairs Trading (KBPT), a deep learning aided policy that augments the\noperation of KF-aided BB trading. KBPT is designed by formulating an extended\nSS model for pairs trading that approximates their relationship as holding\npartial co-integration. This SS model is utilized by a trading policy that\naugments KF-BB trading with a dedicated neural network based on the KalmanNet\narchitecture. The resulting KBPT is trained in a two-stage manner which first\ntunes the tracking algorithm in an unsupervised manner independently of the\ntrading task, followed by its adaptation to track the financial indicators to\nmaximize revenue while approximating BB with a differentiable mapping. KBPT\nthus leverages data to overcome the approximated nature of the SS model,\nconverting the KF-BB policy into a trainable model. We empirically demonstrate\nthat our proposed KBPT systematically yields improved revenue compared with\nmodel-based and data-driven benchmarks over various assets.\n","authors":["Amit Milstein","Haoran Deng","Guy Revach","Hai Morgenstern","Nir Shlezinger"],"pdf_url":"https://arxiv.org/pdf/2210.15448v2.pdf","comment":"Submitted to Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2309.00349v1","updated":"2023-09-01T09:15:04Z","published":"2023-09-01T09:15:04Z","title":"Bespoke Nanoparticle Synthesis and Chemical Knowledge Discovery Via\n Autonomous Experimentations","summary":" The optimization of nanomaterial synthesis using numerous synthetic variables\nis considered to be an extremely laborious task because the conventional\ncombinatorial explorations are prohibitively expensive. In this work, we report\nan autonomous experimentation platform developed for the bespoke design of\nnanoparticles (NPs) with targeted optical properties. This platform operates in\na closed-loop manner between a batch synthesis module of NPs and a UV-Vis\nspectroscopy module, based on the feedback of the AI optimization modeling.\nWith silver (Ag) NPs as a representative example, we demonstrate that the\nBayesian optimizer implemented with the early stopping criterion can\nefficiently produce Ag NPs precisely possessing the desired absorption spectra\nwithin only 200 iterations (when optimizing among five synthetic reagents). 
In\naddition to the outstanding material developmental efficiency, the analysis of\nsynthetic variables further reveals a novel chemistry involving the effects of\ncitrate in Ag NP synthesis. The amount of citrate is a key to controlling the\ncompetitions between spherical and plate-shaped NPs and, as a result, affects\nthe shapes of the absorption spectra as well. Our study highlights both\ncapabilities of the platform to enhance search efficiencies and to provide a\nnovel chemical knowledge by analyzing datasets accumulated from the autonomous\nexperimentations.\n","authors":["Hyuk Jun Yoo","Nayeon Kim","Heeseung Lee","Daeho Kim","Leslie Tiong Ching Ow","Hyobin Nam","Chansoo Kim","Seung Yong Lee","Kwan-Young Lee","Donghun Kim","Sang Soo Han"],"pdf_url":"https://arxiv.org/pdf/2309.00349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00330v1","updated":"2023-09-01T08:34:13Z","published":"2023-09-01T08:34:13Z","title":"Multitask Deep Learning for Accurate Risk Stratification and Prediction\n of Next Steps for Coronary CT Angiography Patients","summary":" Diagnostic investigation has an important role in risk stratification and\nclinical decision making of patients with suspected and documented Coronary\nArtery Disease (CAD). However, the majority of existing tools are primarily\nfocused on the selection of gatekeeper tests, whereas only a handful of systems\ncontain information regarding the downstream testing or treatment. We propose a\nmulti-task deep learning model to support risk stratification and down-stream\ntest selection for patients undergoing Coronary Computed Tomography Angiography\n(CCTA). The analysis included 14,021 patients who underwent CCTA between 2006\nand 2017. Our novel multitask deep learning framework extends the state-of-the\nart Perceiver model to deal with real-world CCTA report data. Our model\nachieved an Area Under the receiver operating characteristic Curve (AUC) of\n0.76 in CAD risk stratification, and 0.72 AUC in predicting downstream tests.\nOur proposed deep learning model can accurately estimate the likelihood of CAD\nand provide recommended downstream tests based on prior CCTA data. In clinical\npractice, the utilization of such an approach could bring a paradigm shift in\nrisk stratification and downstream management. Despite significant progress\nusing deep learning models for tabular data, they do not outperform gradient\nboosting decision trees, and further research is required in this area.\nHowever, neural networks appear to benefit more readily from multi-task\nlearning than tree-based models. This could offset the shortcomings of using\nsingle task learning approach when working with tabular data.\n","authors":["Juan Lu","Mohammed Bennamoun","Jonathon Stewart","JasonK. Eshraghian","Yanbin Liu","Benjamin Chow","Frank M. Sanfilippo","Girish Dwivedi"],"pdf_url":"https://arxiv.org/pdf/2309.00330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00329v1","updated":"2023-09-01T08:31:35Z","published":"2023-09-01T08:31:35Z","title":"Mi-Go: Test Framework which uses YouTube as Data Source for Evaluating\n Speech Recognition Models like OpenAI's Whisper","summary":" This article introduces Mi-Go, a novel testing framework aimed at evaluating\nthe performance and adaptability of general-purpose speech recognition machine\nlearning models across diverse real-world scenarios. 
The framework leverages\nYouTube as a rich and continuously updated data source, accounting for multiple\nlanguages, accents, dialects, speaking styles, and audio quality levels. To\ndemonstrate the effectiveness of the framework, the Whisper model, developed by\nOpenAI, was employed as a test object. The tests involve using a total of 124\nYouTube videos to test all Whisper model versions. The results underscore the\nutility of YouTube as a valuable testing platform for speech recognition\nmodels, ensuring their robustness, accuracy, and adaptability to diverse\nlanguages and acoustic conditions. Additionally, by contrasting the\nmachine-generated transcriptions against human-made subtitles, the Mi-Go\nframework can help pinpoint potential misuse of YouTube subtitles, like Search\nEngine Optimization.\n","authors":["Tomasz Wojnar","Jaroslaw Hryszko","Adam Roman"],"pdf_url":"https://arxiv.org/pdf/2309.00329v1.pdf","comment":"25 pages, 9 tables, 3 figures"},{"id":"http://arxiv.org/abs/2309.00325v1","updated":"2023-09-01T08:16:53Z","published":"2023-09-01T08:16:53Z","title":"Multi-fidelity reduced-order surrogate modeling","summary":" High-fidelity numerical simulations of partial differential equations (PDEs)\ngiven a restricted computational budget can significantly limit the number of\nparameter configurations considered and/or time window evaluated for modeling a\ngiven system. Multi-fidelity surrogate modeling aims to leverage less accurate,\nlower-fidelity models that are computationally inexpensive in order to enhance\npredictive accuracy when high-fidelity data are limited or scarce. However,\nlow-fidelity models, while often displaying important qualitative\nspatio-temporal features, fail to accurately capture the onset of instability\nand critical transients observed in the high-fidelity models, making them\nimpractical as surrogate models. To address this shortcoming, we present a new\ndata-driven strategy that combines dimensionality reduction with multi-fidelity\nneural network surrogates. The key idea is to generate a spatial basis by\napplying the classical proper orthogonal decomposition (POD) to high-fidelity\nsolution snapshots, and approximate the dynamics of the reduced states -\ntime-parameter-dependent expansion coefficients of the POD basis - using a\nmulti-fidelity long-short term memory (LSTM) network. By mapping low-fidelity\nreduced states to their high-fidelity counterpart, the proposed reduced-order\nsurrogate model enables the efficient recovery of full solution fields over\ntime and parameter variations in a non-intrusive manner. The generality and\nrobustness of this method is demonstrated by a collection of parametrized,\ntime-dependent PDE problems where the low-fidelity model can be defined by\ncoarser meshes and/or time stepping, as well as by misspecified physical\nfeatures. Importantly, the onset of instabilities and transients are well\ncaptured by this surrogate modeling technique.\n","authors":["Paolo Conti","Mengwu Guo","Andrea Manzoni","Attilio Frangi","Steven L. Brunton","J. Nathan Kutz"],"pdf_url":"https://arxiv.org/pdf/2309.00325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.03160v2","updated":"2023-09-01T08:13:48Z","published":"2022-08-05T13:34:33Z","title":"Almost-Orthogonal Layers for Efficient General-Purpose Lipschitz\n Networks","summary":" It is a highly desirable property for deep networks to be robust against\nsmall input changes. 
One popular way to achieve this property is by designing\nnetworks with a small Lipschitz constant. In this work, we propose a new\ntechnique for constructing such Lipschitz networks that has a number of\ndesirable properties: it can be applied to any linear network layer\n(fully-connected or convolutional), it provides formal guarantees on the\nLipschitz constant, it is easy to implement and efficient to run, and it can be\ncombined with any training objective and optimization method. In fact, our\ntechnique is the first one in the literature that achieves all of these\nproperties simultaneously. Our main contribution is a rescaling-based weight\nmatrix parametrization that guarantees each network layer to have a Lipschitz\nconstant of at most 1 and results in the learned weight matrices to be close to\northogonal. Hence we call such layers almost-orthogonal Lipschitz (AOL).\nExperiments and ablation studies in the context of image classification with\ncertified robust accuracy confirm that AOL layers achieve results that are on\npar with most existing methods. Yet, they are simpler to implement and more\nbroadly applicable, because they do not require computationally expensive\nmatrix orthogonalization or inversion steps as part of the network\narchitecture. We provide code at https://github.com/berndprach/AOL.\n","authors":["Bernd Prach","Christoph H. Lampert"],"pdf_url":"https://arxiv.org/pdf/2208.03160v2.pdf","comment":"- Corrected the results from competitor ECO. - Corrected a typo in\n the loss function equation"},{"id":"http://arxiv.org/abs/2301.11259v4","updated":"2023-09-01T07:50:44Z","published":"2023-01-26T17:52:56Z","title":"Domain-Agnostic Molecular Generation with Self-feedback","summary":" The generation of molecules with desired properties has gained tremendous\npopularity, revolutionizing the way scientists design molecular structures and\nproviding valuable support for chemical and drug design. However, despite the\npotential of language models in molecule generation, they face numerous\nchallenges such as the generation of syntactically or chemically flawed\nmolecules, narrow domain focus, and limitations in creating diverse and\ndirectionally feasible molecules due to a dearth of annotated data or external\nmolecular databases. To this end, we introduce MolGen, a pre-trained molecular\nlanguage model tailored specifically for molecule generation. MolGen acquires\nintrinsic structural and grammatical insights by reconstructing over 100\nmillion molecular SELFIES, while facilitating knowledge transfer between\ndifferent domains through domain-agnostic molecular prefix tuning. Moreover, we\npresent a self-feedback paradigm that inspires the pre-trained model to align\nwith the ultimate goal of producing molecules with desirable properties.\nExtensive experiments on well-known benchmarks confirm MolGen's optimization\ncapabilities, encompassing penalized logP, QED, and molecular docking\nproperties. Further analysis shows that MolGen can accurately capture molecule\ndistributions, implicitly learn their structural characteristics, and\nefficiently explore chemical space. The pre-trained model, codes, and datasets\nare publicly available for future research at https://github.com/zjunlp/MolGen.\n","authors":["Yin Fang","Ningyu Zhang","Zhuo Chen","Xiaohui Fan","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.11259v4.pdf","comment":"Work in progress. 
Add results of binding affinity"},{"id":"http://arxiv.org/abs/2303.06261v3","updated":"2023-09-01T07:49:21Z","published":"2023-03-11T00:53:49Z","title":"Interpretable Outlier Summarization","summary":" Outlier detection is critical in real applications to prevent financial\nfraud, defend against network intrusions, or detect imminent device failures. To\nreduce the human effort in evaluating outlier detection results and effectively\nturn the outliers into actionable insights, the users often expect a system to\nautomatically produce interpretable summarizations of subgroups of outlier\ndetection results. Unfortunately, to date no such systems exist. To fill this\ngap, we propose STAIR, which learns a compact set of human-understandable rules\nto summarize and explain the anomaly detection results. Rather than use the\nclassical decision tree algorithms to produce these rules, STAIR proposes a new\noptimization objective to produce a small number of rules with the least\ncomplexity, hence strong interpretability, to accurately summarize the\ndetection results. The learning algorithm of STAIR produces a rule set by\niteratively splitting the large rules and is optimal in maximizing this\nobjective in each iteration. Moreover, to effectively handle high dimensional,\nhighly complex data sets which are hard to summarize with simple rules, we\npropose a localized STAIR approach, called L-STAIR. Taking data locality into\nconsideration, it simultaneously partitions data and learns a set of localized\nrules for each partition. Our experimental study on many outlier benchmark\ndatasets shows that STAIR significantly reduces the complexity of the rules\nrequired to summarize the outlier detection results, making them more amenable for\nhumans to understand and evaluate compared to the decision tree methods.\n","authors":["Yu Wang","Lei Cao","Yizhou Yan","Samuel Madden"],"pdf_url":"https://arxiv.org/pdf/2303.06261v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.11749v2","updated":"2023-09-01T07:46:27Z","published":"2021-03-22T12:02:00Z","title":"Simulation comparisons between Bayesian and de-biased estimators in\n low-rank matrix completion","summary":" In this paper, we study the low-rank matrix completion problem, a class of\nmachine learning problems that aims at the prediction of missing entries in a\npartially observed matrix. Such problems appear in several challenging\napplications such as collaborative filtering, image processing, and genotype\nimputation. We compare the Bayesian approaches and a recently introduced\nde-biased estimator which provides a useful way to build confidence intervals\nof interest. From a theoretical viewpoint, the de-biased estimator comes with a\nsharp minimax-optimal rate of estimation error whereas the Bayesian approach\nreaches this rate with an additional logarithmic factor. Our simulation studies\nshow the interesting result that the de-biased estimator is just as\ngood as the Bayesian estimators. Moreover, Bayesian approaches are much more\nstable and can outperform the de-biased estimator in the case of small samples.\nIn addition, we also find that the empirical coverage rate of the confidence\nintervals obtained by the de-biased estimator for an entry is clearly lower\nthan that of the considered credible interval. 
These results suggest further\ntheoretical studies on the estimation error and the concentration of Bayesian\nmethods as they are quite limited up to present.\n","authors":["The Tien Mai"],"pdf_url":"https://arxiv.org/pdf/2103.11749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16737v2","updated":"2023-09-01T07:30:02Z","published":"2023-08-31T13:54:37Z","title":"Robust Networked Federated Learning for Localization","summary":" This paper addresses the problem of localization, which is inherently\nnon-convex and non-smooth in a federated setting where the data is distributed\nacross a multitude of devices. Due to the decentralized nature of federated\nenvironments, distributed learning becomes essential for scalability and\nadaptability. Moreover, these environments are often plagued by outlier data,\nwhich presents substantial challenges to conventional methods, particularly in\nmaintaining estimation accuracy and ensuring algorithm convergence. To mitigate\nthese challenges, we propose a method that adopts an $L_1$-norm robust\nformulation within a distributed sub-gradient framework, explicitly designed to\nhandle these obstacles. Our approach addresses the problem in its original\nform, without resorting to iterative simplifications or approximations,\nresulting in enhanced computational efficiency and improved estimation\naccuracy. We demonstrate that our method converges to a stationary point,\nhighlighting its effectiveness and reliability. Through numerical simulations,\nwe confirm the superior performance of our approach, notably in outlier-rich\nenvironments, which surpasses existing state-of-the-art localization methods.\n","authors":["Reza Mirzaeifard","Naveen K. D. Venkategowda","Stefan Werner"],"pdf_url":"https://arxiv.org/pdf/2308.16737v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00305v1","updated":"2023-09-01T07:29:44Z","published":"2023-09-01T07:29:44Z","title":"Efficient Surrogate Models for Materials Science Simulations: Machine\n Learning-based Prediction of Microstructure Properties","summary":" Determining, understanding, and predicting the so-called structure-property\nrelation is an important task in many scientific disciplines, such as\nchemistry, biology, meteorology, physics, engineering, and materials science.\nStructure refers to the spatial distribution of, e.g., substances, material, or\nmatter in general, while property is a resulting characteristic that usually\ndepends in a non-trivial way on spatial details of the structure.\nTraditionally, forward simulations models have been used for such tasks.\nRecently, several machine learning algorithms have been applied in these\nscientific fields to enhance and accelerate simulation models or as surrogate\nmodels. In this work, we develop and investigate the applications of six\nmachine learning techniques based on two different datasets from the domain of\nmaterials science: data from a two-dimensional Ising model for predicting the\nformation of magnetic domains and data representing the evolution of dual-phase\nmicrostructures from the Cahn-Hilliard model. We analyze the accuracy and\nrobustness of all models and elucidate the reasons for the differences in their\nperformances. 
The impact of including domain knowledge through tailored\nfeatures is studied, and general recommendations based on the availability and\nquality of training data are derived from this.\n","authors":["Binh Duong Nguyen","Pavlo Potapenko","Aytekin Dermici","Kishan Govinda","Stefan Sandfeld"],"pdf_url":"https://arxiv.org/pdf/2309.00305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16818v2","updated":"2023-09-01T07:27:52Z","published":"2023-08-31T15:49:21Z","title":"Irregular Traffic Time Series Forecasting Based on Asynchronous\n Spatio-Temporal Graph Convolutional Network","summary":" Accurate traffic forecasting at intersections governed by intelligent traffic\nsignals is critical for the advancement of an effective intelligent traffic\nsignal control system. However, due to the irregular traffic time series\nproduced by intelligent intersections, the traffic forecasting task becomes\nmuch more intractable and imposes three major new challenges: 1) asynchronous\nspatial dependency, 2) irregular temporal dependency among traffic data, and 3)\nvariable-length sequence to be predicted, which severely impede the performance\nof current traffic forecasting methods. To this end, we propose an Asynchronous\nSpatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the traffic\nstates of the lanes entering intelligent intersections in a future time window.\nSpecifically, by linking lanes via a traffic diffusion graph, we first propose\nan Asynchronous Graph Diffusion Network to model the asynchronous spatial\ndependency between the time-misaligned traffic state measurements of lanes.\nAfter that, to capture the temporal dependency within irregular traffic state\nsequence, a learnable personalized time encoding is devised to embed the\ncontinuous time for each lane. Then we propose a Transformable Time-aware\nConvolution Network that learns meta-filters to derive time-aware convolution\nfilters with transformable filter sizes for efficient temporal convolution on\nthe irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network\nconsisting of a state evolution unit and a semiautoregressive predictor is\ndesigned to effectively and efficiently predict variable-length traffic state\nsequences. Extensive experiments on two real-world datasets demonstrate the\neffectiveness of ASeer in six metrics.\n","authors":["Weijia Zhang","Le Zhang","Jindong Han","Hao Liu","Jingbo Zhou","Yu Mei","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.16818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11288v2","updated":"2023-09-01T07:17:54Z","published":"2023-08-22T08:57:44Z","title":"Test Time Embedding Normalization for Popularity Bias Mitigation","summary":" Popularity bias is a widespread problem in the field of recommender systems,\nwhere popular items tend to dominate recommendation results. In this work, we\npropose 'Test Time Embedding Normalization' as a simple yet effective strategy\nfor mitigating popularity bias, which surpasses the performance of the previous\nmitigation approaches by a significant margin. Our approach utilizes the\nnormalized item embedding during the inference stage to control the influence\nof embedding magnitude, which is highly correlated with item popularity.\nThrough extensive experiments, we show that our method combined with the\nsampled softmax loss effectively reduces popularity bias compare to previous\napproaches for bias mitigation. 
We further investigate the relationship between\nuser and item embeddings and find that the angular similarity between\nembeddings distinguishes preferable and non-preferable items regardless of\ntheir popularity. The analysis explains the mechanism behind the success of our\napproach in eliminating the impact of popularity bias. Our code is available at\nhttps://github.com/ml-postech/TTEN.\n","authors":["Dain Kim","Jinhyeok Park","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11288v2.pdf","comment":"5 pages, CIKM 2023"},{"id":"http://arxiv.org/abs/2302.05629v2","updated":"2023-09-01T07:09:55Z","published":"2023-02-11T08:58:55Z","title":"Improving Differentiable Architecture Search via Self-Distillation","summary":" Differentiable Architecture Search (DARTS) is a simple yet efficient Neural\nArchitecture Search (NAS) method. During the search stage, DARTS trains a\nsupernet by jointly optimizing architecture parameters and network parameters.\nDuring the evaluation stage, DARTS discretizes the supernet to derive the\noptimal architecture based on architecture parameters. However, recent research\nhas shown that during the training process, the supernet tends to converge\ntowards sharp minima rather than flat minima. This is evidenced by the higher\nsharpness of the loss landscape of the supernet, which ultimately leads to a\nperformance gap between the supernet and the optimal architecture. In this\npaper, we propose Self-Distillation Differentiable Neural Architecture Search\n(SD-DARTS) to alleviate the discretization gap. We utilize self-distillation to\ndistill knowledge from previous steps of the supernet to guide its training in\nthe current step, effectively reducing the sharpness of the supernet's loss and\nbridging the performance gap between the supernet and the optimal architecture.\nFurthermore, we introduce the concept of voting teachers, where multiple\nprevious supernets are selected as teachers, and their output probabilities are\naggregated through voting to obtain the final teacher prediction. Experimental\nresults on real datasets demonstrate the advantages of our novel\nself-distillation-based NAS method compared to state-of-the-art alternatives.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2302.05629v2.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2212.01168v2","updated":"2023-09-01T07:08:25Z","published":"2022-12-02T13:47:21Z","title":"Identifying Generalized Neural Representation Across Hamiltonian\n Manifolds via Meta-learning","summary":" Recent advancements in deep learning for physics have focused on discovering\nshared representations of target systems by incorporating physics priors or\ninductive biases into neural networks. However, these approaches are\nsystem-specific and do not allow for easy adaptation to new physical systems\ngoverned by different laws. For example, a neural network trained on a\nmass-spring system cannot accurately predict the behavior of a two-body system\nor any other system with different governing physics. In this work, we model\nour system with a graph neural network and employ a meta-learning algorithm to\nenable the model to gain experience over a distribution of tasks and make it\nadapt to new physics. Our approach aims to learn a general representation\nacross the various Hamiltonian manifolds, which is a common feature of the data\ndistribution of Hamiltonian systems. 
We train our model using a dataset of\ndifferent physical systems, each governed by its own inherent dynamics, and\nevaluate its performance on a new type of dynamical system with unknown\nphysics. Our results show that the meta-trained model effectively adapts to the\nnew system, which was unseen during the meta-training phase. Furthermore, we\nanalyze the representation learned by the meta-trained neural network to\nidentify a generalizable representation of Hamilton's equation that is shared\nacross various physical systems. Our findings suggest that the meta-learned\nmodel can capture the generalizable representation across Hamiltonian manifolds\ninherent in dynamical systems.\n","authors":["Yeongwoo Song","Hawoong Jeong"],"pdf_url":"https://arxiv.org/pdf/2212.01168v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2303.08440v2","updated":"2023-09-01T07:04:30Z","published":"2023-03-15T08:28:06Z","title":"Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models","summary":" Diffusion models have become a popular approach for image generation and\nreconstruction due to their numerous advantages. However, most diffusion-based\ninverse problem-solving methods only deal with 2D images, and even recently\npublished 3D methods do not fully exploit the 3D distribution prior. To address\nthis, we propose a novel approach using two perpendicular pre-trained 2D\ndiffusion models to solve the 3D inverse problem. By modeling the 3D data\ndistribution as a product of 2D distributions sliced in different directions,\nour method effectively addresses the curse of dimensionality. Our experimental\nresults demonstrate that our method is highly effective for 3D medical image\nreconstruction tasks, including MRI Z-axis super-resolution, compressed sensing\nMRI, and sparse-view CT. Our method can generate high-quality voxel volumes\nsuitable for medical applications.\n","authors":["Suhyeon Lee","Hyungjin Chung","Minyoung Park","Jonghyuk Park","Wi-Sun Ryu","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2303.08440v2.pdf","comment":"ICCV23 poster. 15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.00296v1","updated":"2023-09-01T07:03:05Z","published":"2023-09-01T07:03:05Z","title":"End-to-end Lidar-Driven Reinforcement Learning for Autonomous Racing","summary":" Reinforcement Learning (RL) has emerged as a transformative approach in the\ndomains of automation and robotics, offering powerful solutions to complex\nproblems that conventional methods struggle to address. In scenarios where the\nproblem definitions are elusive and challenging to quantify, learning-based\nsolutions such as RL become particularly valuable. One instance of such\ncomplexity can be found in the realm of car racing, a dynamic and unpredictable\nenvironment that demands sophisticated decision-making algorithms. This study\nfocuses on developing and training an RL agent to navigate a racing environment\nsolely using feedforward raw lidar and velocity data in a simulated context.\nThe agent's performance, trained in the simulation environment, is then\nexperimentally evaluated in a real-world racing scenario. 
This exploration\nunderlines the feasibility and potential benefits of RL algorithms in enhancing\nautonomous racing performance, especially in environments where prior map\ninformation is not available.\n","authors":["Meraj Mammadov"],"pdf_url":"https://arxiv.org/pdf/2309.00296v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2308.13970v2","updated":"2023-09-01T06:42:53Z","published":"2023-08-26T22:54:45Z","title":"FAM: fast adaptive federated meta-learning","summary":" In this work, we propose a fast adaptive federated meta-learning (FAM)\nframework for collaboratively learning a single global model, which can then be\npersonalized locally on individual clients. Federated learning enables multiple\nclients to collaborate to train a model without sharing data. Clients with\ninsufficient data or data diversity participate in federated learning to learn\na model with superior performance. Nonetheless, learning suffers when data\ndistributions diverge. There is a need to learn a global model that can be\nadapted using client-specific information to create personalized models on\nclients. MRI data suffers from this problem, wherein, one, due to\ndata acquisition challenges, local data at a site is insufficient for training an\naccurate model and two, there is a restriction of data sharing due to privacy\nconcerns and three, there is a need for personalization of a learnt shared\nglobal model on account of domain shift across client sites. The global model\nis sparse and captures the common features in the MRI. This skeleton network is\ngrown on each client to train a personalized model by learning additional\nclient-specific parameters from local data. Experimental results show that the\npersonalization process at each client quickly converges using a limited number\nof epochs. The personalized client models outperformed the locally trained\nmodels, demonstrating the efficacy of the FAM mechanism. Additionally, the\nsparse parameter set to be communicated during federated learning drastically\nreduced communication overhead, which makes the scheme viable for networks with\nlimited resources.\n","authors":["Indrajeet Kumar Sinha","Shekhar Verma","Krishna Pratap Singh"],"pdf_url":"https://arxiv.org/pdf/2308.13970v2.pdf","comment":"13 Pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.13570v3","updated":"2023-09-01T06:23:18Z","published":"2023-08-25T05:52:41Z","title":"Stochastic Configuration Machines for Industrial Artificial Intelligence","summary":" Real-time predictive modelling with desired accuracy is highly expected in\nindustrial artificial intelligence (IAI), where neural networks play a key\nrole. Neural networks in IAI require powerful, high-performance computing\ndevices to operate on large amounts of floating-point data. Based on stochastic\nconfiguration networks (SCNs), this paper proposes a new randomized learner\nmodel, termed stochastic configuration machines (SCMs), to stress effective\nmodelling and data size saving that are useful and valuable for industrial\napplications. Compared to SCNs and random vector functional-link (RVFL) nets\nwith binarized implementation, the model storage of SCMs can be significantly\ncompressed while retaining favourable prediction performance. Besides the\narchitecture of the SCM learner model and its learning algorithm, as an\nimportant part of this contribution, we also provide a theoretical basis on the\nlearning capacity of SCMs by analysing the model's complexity. 
Experimental\nstudies are carried out over some benchmark datasets and three industrial\napplications. The results demonstrate that SCM has great potential for dealing\nwith industrial data analytics.\n","authors":["Dianhui Wang","Matthew J. Felicetti"],"pdf_url":"https://arxiv.org/pdf/2308.13570v3.pdf","comment":"23 pages, 7 figures, 12 tables"},{"id":"http://arxiv.org/abs/2307.13484v2","updated":"2023-09-01T05:59:00Z","published":"2023-07-25T13:21:07Z","title":"Rational kernel-based interpolation for complex-valued frequency\n response functions","summary":" This work is concerned with the kernel-based approximation of a\ncomplex-valued function from data, where the frequency response function of a\npartial differential equation in the frequency domain is of particular\ninterest. In this setting, kernel methods are employed more and more\nfrequently, however, standard kernels do not perform well. Moreover, the role\nand mathematical implications of the underlying pair of kernels, which arises\nnaturally in the complex-valued case, remain to be addressed. We introduce new\nreproducing kernel Hilbert spaces of complex-valued functions, and formulate\nthe problem of complex-valued interpolation with a kernel pair as minimum norm\ninterpolation in these spaces. Moreover, we combine the interpolant with a\nlow-order rational function, where the order is adaptively selected based on a\nnew model selection criterion. Numerical results on examples from different\nfields, including electromagnetics and acoustic examples, illustrate the\nperformance of the method, also in comparison to available rational\napproximation methods.\n","authors":["Julien Bect","Niklas Georg","Ulrich Römer","Sebastian Schöps"],"pdf_url":"https://arxiv.org/pdf/2307.13484v2.pdf","comment":"26 pages main paper, 6 pages supplement"},{"id":"http://arxiv.org/abs/2309.00267v1","updated":"2023-09-01T05:53:33Z","published":"2023-09-01T05:53:33Z","title":"RLAIF: Scaling Reinforcement Learning from Human Feedback with AI\n Feedback","summary":" Reinforcement learning from human feedback (RLHF) is effective at aligning\nlarge language models (LLMs) to human preferences, but gathering high quality\nhuman preference labels is a key bottleneck. We conduct a head-to-head\ncomparison of RLHF vs. RL from AI Feedback (RLAIF) - a technique where\npreferences are labeled by an off-the-shelf LLM in lieu of humans, and we find\nthat they result in similar improvements. On the task of summarization, human\nevaluators prefer generations from both RLAIF and RLHF over a baseline\nsupervised fine-tuned model in ~70% of cases. Furthermore, when asked to rate\nRLAIF vs. RLHF summaries, humans prefer both at equal rates. These results\nsuggest that RLAIF can yield human-level performance, offering a potential\nsolution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Kellie Lu","Thomas Mesnard","Colton Bishop","Victor Carbune","Abhinav Rastogi"],"pdf_url":"https://arxiv.org/pdf/2309.00267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00257v1","updated":"2023-09-01T05:25:05Z","published":"2023-09-01T05:25:05Z","title":"Leveraging Learning Metrics for Improved Federated Learning","summary":" Currently in the federated setting, no learning schemes leverage the emerging\nresearch of explainable artificial intelligence (XAI) in particular the novel\nlearning metrics that help determine how well a model is learning. 
One of these\nnovel learning metrics is termed `Effective Rank' (ER) which measures the\nShannon Entropy of the singular values of a matrix, thus enabling a metric\ndetermining how well a layer is mapping. By joining federated learning and the\nlearning metric, effective rank, this work will \\textbf{(1)} give the first\nfederated learning metric aggregation method \\textbf{(2)} show that effective\nrank is well-suited to federated problems by out-performing baseline Federated\nAveraging \\cite{konevcny2016federated} and \\textbf{(3)} develop a novel\nweight-aggregation scheme relying on effective rank.\n","authors":["Andre Fu"],"pdf_url":"https://arxiv.org/pdf/2309.00257v1.pdf","comment":"Bachelor's thesis"},{"id":"http://arxiv.org/abs/2308.16245v2","updated":"2023-09-01T05:16:01Z","published":"2023-08-30T18:06:57Z","title":"Calibrated Explanations for Regression","summary":" Artificial Intelligence (AI) is often an integral part of modern decision\nsupport systems (DSSs). The best-performing predictive models used in AI-based\nDSSs lack transparency. Explainable Artificial Intelligence (XAI) aims to\ncreate AI systems that can explain their rationale to human users. Local\nexplanations in XAI can provide information about the causes of individual\npredictions in terms of feature importance. However, a critical drawback of\nexisting local explanation methods is their inability to quantify the\nuncertainty associated with a feature's importance. This paper introduces an\nextension of a feature importance explanation method, Calibrated Explanations\n(CE), previously only supporting classification, with support for standard\nregression and probabilistic regression, i.e., the probability that the target\nis above an arbitrary threshold. The extension for regression keeps all the\nbenefits of CE, such as calibration of the prediction from the underlying model\nwith confidence intervals, uncertainty quantification of feature importance,\nand allows both factual and counterfactual explanations. CE for standard\nregression provides fast, reliable, stable, and robust explanations. CE for\nprobabilistic regression provides an entirely new way of creating probabilistic\nexplanations from any ordinary regression model and with a dynamic selection of\nthresholds. The performance of CE for probabilistic regression regarding\nstability and speed is comparable to LIME. The method is model agnostic with\neasily understood conditional rules. An implementation in Python is freely\navailable on GitHub and for installation using pip making the results in this\npaper easily replicable.\n","authors":["Tuwe Löfström","Helena Löfström","Ulf Johansson","Cecilia Sönströd","Rudy Matela"],"pdf_url":"https://arxiv.org/pdf/2308.16245v2.pdf","comment":"30 pages, 11 figures (replaced due to omitted author, which is the\n only change made)"},{"id":"http://arxiv.org/abs/2309.00255v1","updated":"2023-09-01T05:12:25Z","published":"2023-09-01T05:12:25Z","title":"SortedNet, a Place for Every Network and Every Network in its Place:\n Towards a Generalized Solution for Training Many-in-One Neural Networks","summary":" As the size of deep learning models continues to grow, finding optimal models\nunder memory and computation constraints becomes increasingly more important.\nAlthough usually the architecture and constituent building blocks of neural\nnetworks allow them to be used in a modular way, their training process is not\naware of this modularity. 
Consequently, conventional neural network training\nlacks the flexibility to adapt the computational load of the model during\ninference. This paper proposes SortedNet, a generalized and scalable solution\nto harness the inherent modularity of deep neural networks across various\ndimensions for efficient dynamic inference. Our training considers a nested\narchitecture for the sub-models with shared parameters and trains them together\nwith the main model in a sorted and probabilistic manner. This sorted training\nof sub-networks enables us to scale the number of sub-networks to hundreds\nusing a single round of training. We utilize a novel updating scheme during\ntraining that combines random sampling of sub-networks with gradient\naccumulation to improve training efficiency. Furthermore, the sorted nature of\nour training leads to a search-free sub-network selection at inference time;\nand the nested architecture of the resulting sub-networks leads to minimal\nstorage requirement and efficient switching between sub-networks at inference.\nOur general dynamic training approach is demonstrated across various\narchitectures and tasks, including large language models and pre-trained vision\nmodels. Experimental results show the efficacy of the proposed approach in\nachieving efficient sub-networks while outperforming state-of-the-art dynamic\ntraining approaches. Our findings demonstrate the feasibility of training up to\n160 different sub-models simultaneously, showcasing the extensive scalability\nof our proposed method while maintaining 96% of the model performance.\n","authors":["Mojtaba Valipour","Mehdi Rezagholizadeh","Hossein Rajabzadeh","Marzieh Tahaei","Boxing Chen","Ali Ghodsi"],"pdf_url":"https://arxiv.org/pdf/2309.00255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00254v1","updated":"2023-09-01T05:09:49Z","published":"2023-09-01T05:09:49Z","title":"Why do universal adversarial attacks work on large language models?:\n Geometry might be the answer","summary":" Transformer based large language models with emergent capabilities are\nbecoming increasingly ubiquitous in society. However, the task of understanding\nand interpreting their internal workings, in the context of adversarial\nattacks, remains largely unsolved. Gradient-based universal adversarial attacks\nhave been shown to be highly effective on large language models and potentially\ndangerous due to their input-agnostic nature. This work presents a novel\ngeometric perspective explaining universal adversarial attacks on large\nlanguage models. By attacking the 117M parameter GPT-2 model, we find evidence\nindicating that universal adversarial triggers could be embedding vectors which\nmerely approximate the semantic information in their adversarial training\nregion. 
This hypothesis is supported by white-box model analysis comprising\ndimensionality reduction and similarity measurement of hidden representations.\nWe believe this new geometric perspective on the underlying mechanism driving\nuniversal attacks could help us gain deeper insight into the internal workings\nand failure modes of LLMs, thus enabling their mitigation.\n","authors":["Varshini Subhash","Anna Bialas","Weiwei Pan","Finale Doshi-Velez"],"pdf_url":"https://arxiv.org/pdf/2309.00254v1.pdf","comment":"2nd AdvML Frontiers Workshop at 40th International Conference on\n Machine Learning, Honolulu, Hawaii, USA, 2023"},{"id":"http://arxiv.org/abs/2306.09995v2","updated":"2023-09-01T05:04:28Z","published":"2023-06-16T17:47:36Z","title":"Fairness in Preference-based Reinforcement Learning","summary":" In this paper, we address the issue of fairness in preference-based\nreinforcement learning (PbRL) in the presence of multiple objectives. The main\nobjective is to design control policies that can optimize multiple objectives\nwhile treating each objective fairly. Toward this objective, we design a new\nfairness-induced preference-based reinforcement learning or FPbRL. The main\nidea of FPbRL is to learn vector reward functions associated with multiple\nobjectives via new welfare-based preferences rather than reward-based\npreference in PbRL, coupled with policy learning via maximizing a generalized\nGini welfare function. Finally, we provide experiment studies on three\ndifferent environments to show that the proposed FPbRL approach can achieve\nboth efficiency and equity for learning effective and fair policies.\n","authors":["Umer Siddique","Abhinav Sinha","Yongcan Cao"],"pdf_url":"https://arxiv.org/pdf/2306.09995v2.pdf","comment":"Accepted to The Many Facets of Preference Learning Workshop at the\n International Conference on Machine Learning (ICML)"},{"id":"http://arxiv.org/abs/2309.00252v1","updated":"2023-09-01T05:01:52Z","published":"2023-09-01T05:01:52Z","title":"Interpretable Medical Imagery Diagnosis with Self-Attentive\n Transformers: A Review of Explainable AI for Health Care","summary":" Recent advancements in artificial intelligence (AI) have facilitated its\nwidespread adoption in primary medical services, addressing the demand-supply\nimbalance in healthcare. Vision Transformers (ViT) have emerged as\nstate-of-the-art computer vision models, benefiting from self-attention\nmodules. However, compared to traditional machine-learning approaches,\ndeep-learning models are complex and are often treated as a \"black box\" that\ncan cause uncertainty regarding how they operate. Explainable Artificial\nIntelligence (XAI) refers to methods that explain and interpret machine\nlearning models' inner workings and how they come to decisions, which is\nespecially important in the medical domain to guide the healthcare\ndecision-making process. 
This review summarises recent ViT advancements and\ninterpretative approaches to understanding the decision-making process of ViT,\nenabling transparency in medical diagnosis applications.\n","authors":["Tin Lai"],"pdf_url":"https://arxiv.org/pdf/2309.00252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05092v2","updated":"2023-09-01T04:58:12Z","published":"2023-03-09T08:04:16Z","title":"Task Aware Dreamer for Task Generalization in Reinforcement Learning","summary":" A long-standing goal of reinforcement learning is to acquire agents that can\nlearn on training tasks and generalize well on unseen tasks that may share a\nsimilar dynamic but with different reward functions. A general challenge is to\nquantitatively measure the similarities between these different tasks, which is\nvital for analyzing the task distribution and further designing algorithms with\nstronger generalization. To address this, we present a novel metric named Task\nDistribution Relevance (TDR) via optimal Q functions of different tasks to\ncapture the relevance of the task distribution quantitatively. In the case of\ntasks with a high TDR, i.e., the tasks differ significantly, we show that the\nMarkovian policies cannot differentiate them, leading to poor performance.\nBased on this insight, we encode all historical information into policies for\ndistinguishing different tasks and propose Task Aware Dreamer (TAD), which\nextends world models into our reward-informed world models to capture invariant\nlatent features over different tasks. In TAD, we calculate the corresponding\nvariational lower bound of the data log-likelihood, including a novel term to\ndistinguish different tasks via states, to optimize reward-informed world\nmodels. Extensive experiments in both image-based control tasks and state-based\ncontrol tasks demonstrate that TAD can significantly improve the performance of\nhandling different tasks simultaneously, especially for those with high TDR,\nand demonstrate a strong generalization ability to unseen tasks.\n","authors":["Chengyang Ying","Zhongkai Hao","Xinning Zhou","Hang Su","Songming Liu","Dong Yan","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.05092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16835v2","updated":"2023-09-01T04:57:54Z","published":"2023-08-31T16:10:22Z","title":"FedDD: Toward Communication-efficient Federated Learning with\n Differential Parameter Dropout","summary":" Federated Learning (FL) requires frequent exchange of model parameters, which\nleads to long communication delay, especially when the network environments of\nclients vary greatly. Moreover, the parameter server needs to wait for the\nslowest client (i.e., straggler, which may have the largest model size, lowest\ncomputing capability or worst network condition) to upload parameters, which\nmay significantly degrade the communication efficiency. Commonly-used client\nselection methods such as partial client selection would lead to the waste of\ncomputing resources and weaken the generalization of the global model. To\ntackle this problem, along a different line, in this paper, we advocate the\napproach of model parameter dropout instead of client selection, and\naccordingly propose a novel framework of Federated learning scheme with\nDifferential parameter Dropout (FedDD). 
FedDD consists of two key modules:\ndropout rate allocation and uploaded parameter selection, which will optimize\nthe model parameter uploading ratios tailored to different clients'\nheterogeneous conditions and also select the proper set of important model\nparameters for uploading subject to clients' dropout rate constraints.\nSpecifically, the dropout rate allocation is formulated as a convex\noptimization problem, taking system heterogeneity, data heterogeneity, and\nmodel heterogeneity among clients into consideration. The uploaded parameter\nselection strategy prioritizes on eliciting important parameters for uploading\nto speedup convergence. Furthermore, we theoretically analyze the convergence\nof the proposed FedDD scheme. Extensive performance evaluations demonstrate\nthat the proposed FedDD scheme can achieve outstanding performances in both\ncommunication efficiency and model convergence, and also possesses a strong\ngeneralization capability to data of rare classes.\n","authors":["Zhiying Feng","Xu Chen","Qiong Wu","Wen Wu","Xiaoxi Zhang","Qianyi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.16835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07136v4","updated":"2023-09-01T04:27:44Z","published":"2022-11-14T06:28:07Z","title":"C3: Cross-instance guided Contrastive Clustering","summary":" Clustering is the task of gathering similar data samples into clusters\nwithout using any predefined labels. It has been widely studied in machine\nlearning literature, and recent advancements in deep learning have revived\ninterest in this field. Contrastive clustering (CC) models are a staple of deep\nclustering in which positive and negative pairs of each data instance are\ngenerated through data augmentation. CC models aim to learn a feature space\nwhere instance-level and cluster-level representations of positive pairs are\ngrouped together. Despite improving the SOTA, these algorithms ignore the\ncross-instance patterns, which carry essential information for improving\nclustering performance. This increases the false-negative-pair rate of the\nmodel while decreasing its true-positive-pair rate. In this paper, we propose a\nnovel contrastive clustering method, Cross-instance guided Contrastive\nClustering (C3), that considers the cross-sample relationships to increase the\nnumber of positive pairs and mitigate the impact of false negative, noise, and\nanomaly sample on the learned representation of data. In particular, we define\na new loss function that identifies similar instances using the instance-level\nrepresentation and encourages them to aggregate together. Moreover, we propose\na novel weighting method to select negative samples in a more efficient way.\nExtensive experimental evaluations show that our proposed method can outperform\nstate-of-the-art algorithms on benchmark computer vision datasets: we improve\nthe clustering accuracy by 6.6%, 3.3%, 5.0%, 1.3% and 0.3% on CIFAR-10,\nCIFAR-100, ImageNet-10, ImageNet-Dogs, and Tiny-ImageNet.\n","authors":["Mohammadreza Sadeghi","Hadi Hojjati","Narges Armanfard"],"pdf_url":"https://arxiv.org/pdf/2211.07136v4.pdf","comment":"Accepted for publication at the 34th British Machine Vision\n Conference (BMVC-23)"},{"id":"http://arxiv.org/abs/2309.00244v1","updated":"2023-09-01T04:26:55Z","published":"2023-09-01T04:26:55Z","title":"NeuroSurgeon: A Toolkit for Subnetwork Analysis","summary":" Despite recent advances in the field of explainability, much remains unknown\nabout the algorithms that neural networks learn to represent. 
Recent work has\nattempted to understand trained models by decomposing them into functional\ncircuits (Csord\\'as et al., 2020; Lepori et al., 2023). To advance this\nresearch, we developed NeuroSurgeon, a python library that can be used to\ndiscover and manipulate subnetworks within models in the Huggingface\nTransformers library (Wolf et al., 2019). NeuroSurgeon is freely available at\nhttps://github.com/mlepori1/NeuroSurgeon.\n","authors":["Michael A. Lepori","Ellie Pavlick","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2309.00244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00236v1","updated":"2023-09-01T03:53:40Z","published":"2023-09-01T03:53:40Z","title":"Image Hijacking: Adversarial Images can Control Generative Models at\n Runtime","summary":" Are foundation models secure from malicious actors? In this work, we focus on\nthe image input to a vision-language model (VLM). We discover image hijacks,\nadversarial images that control generative models at runtime. We introduce\nBehavior Matching, a general method for creating image hijacks, and we use it\nto explore three types of attacks. Specific string attacks generate arbitrary\noutput of the adversary's choosing. Leak context attacks leak information from\nthe context window into the output. Jailbreak attacks circumvent a model's\nsafety training. We study these attacks against LLaVA-2, a state-of-the-art VLM\nbased on CLIP and LLaMA-2, and find that all our attack types have above a 90\\%\nsuccess rate. Moreover, our attacks are automated and require only small image\nperturbations. These findings raise serious concerns about the security of\nfoundation models. If image hijacks are as difficult to defend against as\nadversarial examples in CIFAR-10, then it might be many years before a solution\nis found -- if it even exists.\n","authors":["Luke Bailey","Euan Ong","Stuart Russell","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2309.00236v1.pdf","comment":"Code is available at https://github.com/euanong/image-hijacks"},{"id":"http://arxiv.org/abs/2308.00890v2","updated":"2023-09-01T03:30:05Z","published":"2023-08-02T00:51:37Z","title":"Tango: rethinking quantization for graph neural network training on GPUs","summary":" Graph Neural Networks (GNNs) are becoming increasingly popular due to their\nsuperior performance in critical graph-related tasks. While quantization is\nwidely used to accelerate GNN computation, quantized training faces\nunprecedented challenges. Current quantized GNN training systems often have\nlonger training times than their full-precision counterparts for two reasons:\n(i) addressing the accuracy challenge leads to excessive overhead, and (ii) the\noptimization potential exposed by quantization is not adequately leveraged.\nThis paper introduces Tango which re-thinks quantization challenges and\nopportunities for graph neural network training on GPUs with three\ncontributions: Firstly, we introduce efficient rules to maintain accuracy\nduring quantized GNN training. Secondly, we design and implement\nquantization-aware primitives and inter-primitive optimizations that can speed\nup GNN training. 
Finally, we integrate Tango with the popular Deep Graph\nLibrary (DGL) system and demonstrate its superior performance over\nstate-of-the-art approaches on various GNN models and datasets.\n","authors":["Shiyang Chen","Da Zheng","Caiwen Ding","Chengying Huan","Yuede Ji","Hang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.00890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12865v3","updated":"2023-09-01T01:56:36Z","published":"2023-01-30T13:19:16Z","title":"SMDP-Based Dynamic Batching for Efficient Inference on GPU-Based\n Platforms","summary":" In up-to-date machine learning (ML) applications on cloud or edge computing\nplatforms, batching is an important technique for providing efficient and\neconomical services at scale. In particular, parallel computing resources on\nthe platforms, such as graphics processing units (GPUs), have higher\ncomputational and energy efficiency with larger batch sizes. However, larger\nbatch sizes may also result in longer response time, and thus it requires a\njudicious design. This paper aims to provide a dynamic batching policy that\nstrikes a balance between efficiency and latency. The GPU-based inference\nservice is modeled as a batch service queue with batch-size dependent\nprocessing time. Then, the design of dynamic batching is a continuous-time\naverage-cost problem, and is formulated as a semi-Markov decision process\n(SMDP) with the objective of minimizing the weighted sum of average response\ntime and average power consumption. The optimal policy is acquired by solving\nan associated discrete-time Markov decision process (MDP) problem with finite\nstate approximation and \"discretization\". By introducing an abstract cost to\nreflect the impact of \"tail\" states, the space complexity and the time\ncomplexity of the procedure can decrease by 63.5% and 98%, respectively. Our\nresults show that the optimal policies potentially possess a control limit\nstructure. Numerical results also show that SMDP-based batching policies can\nadapt to different traffic intensities and outperform other benchmark policies.\nFurthermore, the proposed solution has notable flexibility in balancing power\nconsumption and latency.\n","authors":["Yaodan Xu","Jingzhou Sun","Sheng Zhou","Zhisheng Niu"],"pdf_url":"https://arxiv.org/pdf/2301.12865v3.pdf","comment":"Accepted by 2023 IEEE International Conference on Communications\n (ICC)"},{"id":"http://arxiv.org/abs/2309.00203v1","updated":"2023-09-01T01:44:57Z","published":"2023-09-01T01:44:57Z","title":"Data-Driven Projection for Reducing Dimensionality of Linear Programs:\n Generalization Bound and Learning Methods","summary":" This paper studies a simple data-driven approach to high-dimensional linear\nprograms (LPs). Given data of past $n$-dimensional LPs, we learn an $n\\times k$\n\\textit{projection matrix} ($n > k$), which reduces the dimensionality from $n$\nto $k$. Then, we address future LP instances by solving $k$-dimensional LPs and\nrecovering $n$-dimensional solutions by multiplying the projection matrix. This\nidea is compatible with any user-preferred LP solvers, hence a versatile\napproach to faster LP solving. One natural question is: how much data is\nsufficient to ensure the recovered solutions' quality? We address this question\nbased on the idea of \\textit{data-driven algorithm design}, which relates the\namount of data sufficient for generalization guarantees to the\n\\textit{pseudo-dimension} of performance metrics. 
We present an\n$\\tilde{\\mathrm{O}}(nk^2)$ upper bound on the pseudo-dimension\n($\\tilde{\\mathrm{O}}$ compresses logarithmic factors) and complement it by an\n$\\Omega(nk)$ lower bound, hence tight up to an $\\tilde{\\mathrm{O}}(k)$ factor.\nOn the practical side, we study two natural methods for learning projection\nmatrices: PCA- and gradient-based methods. While the former is simple and\nefficient, the latter sometimes leads to better solution quality. Experiments\nconfirm that learned projection matrices are beneficial for reducing the time\nfor solving LPs while maintaining high solution quality.\n","authors":["Shinsaku Sakaue","Taihei Oki"],"pdf_url":"https://arxiv.org/pdf/2309.00203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00201v1","updated":"2023-09-01T01:40:58Z","published":"2023-09-01T01:40:58Z","title":"Subjectivity in Unsupervised Machine Learning Model Selection","summary":" Model selection is a necessary step in unsupervised machine learning. Despite\nnumerous criteria and metrics, model selection remains subjective. A high\ndegree of subjectivity may lead to questions about repeatability and\nreproducibility of various machine learning studies and doubts about the\nrobustness of models deployed in the real world. Yet, the impact of modelers'\npreferences on model selection outcomes remains largely unexplored. This study\nuses the Hidden Markov Model as an example to investigate the subjectivity\ninvolved in model selection. We asked 33 participants and three Large Language\nModels (LLMs) to make model selections in three scenarios. Results revealed\nvariability and inconsistencies in both the participants' and the LLMs'\nchoices, especially when different criteria and metrics disagree. Sources of\nsubjectivity include varying opinions on the importance of different criteria\nand metrics, differing views on how parsimonious a model should be, and how the\nsize of a dataset should influence model selection. The results underscore the\nimportance of developing a more standardized way to document subjective choices\nmade in model selection processes.\n","authors":["Wanyi Chen","Mary L. Cummings"],"pdf_url":"https://arxiv.org/pdf/2309.00201v1.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2309.00199v1","updated":"2023-09-01T01:40:39Z","published":"2023-09-01T01:40:39Z","title":"Diffusion Model with Clustering-based Conditioning for Food Image\n Generation","summary":" Image-based dietary assessment serves as an efficient and accurate solution\nfor recording and analyzing nutrition intake using eating occasion images as\ninput. Deep learning-based techniques are commonly used to perform image\nanalysis such as food classification, segmentation, and portion size\nestimation, which rely on large amounts of food images with annotations for\ntraining. However, such data dependency poses significant barriers to\nreal-world applications, because acquiring a substantial, diverse, and balanced\nset of food images can be challenging. One potential solution is to use\nsynthetic food images for data augmentation. Although existing work has\nexplored the use of generative adversarial networks (GAN) based structures for\ngeneration, the quality of synthetic food images still remains subpar. In\naddition, while diffusion-based generative models have shown promising results\nfor general image generation tasks, the generation of food images can be\nchallenging due to the substantial intra-class variance. 
In this paper, we\ninvestigate the generation of synthetic food images based on the conditional\ndiffusion model and propose an effective clustering-based training framework,\nnamed ClusDiff, for generating high-quality and representative food images. The\nproposed method is evaluated on the Food-101 dataset and shows improved\nperformance when compared with existing image generation works. We also\ndemonstrate that the synthetic food images generated by ClusDiff can help\naddress the severe class imbalance issue in long-tailed food classification\nusing the VFN-LT dataset.\n","authors":["Yue Han","Jiangpeng He","Mridul Gupta","Edward J. Delp","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.00199v1.pdf","comment":"Accepted for 31st ACM International Conference on Multimedia: 8th\n International Workshop on Multimedia Assisted Dietary Management (MADiMa\n 2023)"},{"id":"http://arxiv.org/abs/2309.00197v1","updated":"2023-09-01T01:23:28Z","published":"2023-09-01T01:23:28Z","title":"Deep-learning-based Early Fixing for Gas-lifted Oil Production\n Optimization: Supervised and Weakly-supervised Approaches","summary":" Maximizing oil production from gas-lifted oil wells entails solving\nMixed-Integer Linear Programs (MILPs). As the parameters of the wells, such as\nthe basic-sediment-to-water ratio and the gas-oil ratio, are updated, the\nproblems must be repeatedly solved. Instead of relying on costly exact methods\nor the accuracy of general approximate methods, in this paper, we propose a\ntailor-made heuristic solution based on deep learning models trained to provide\nvalues to all integer variables given varying well parameters, early-fixing the\ninteger variables and, thus, reducing the original problem to a linear program\n(LP). We propose two approaches for developing the learning-based heuristic: a\nsupervised learning approach, which requires the optimal integer values for\nseveral instances of the original problem in the training set, and a\nweakly-supervised learning approach, which requires only solutions for the\nearly-fixed linear problems with random assignments for the integer variables.\nOur results show a runtime reduction of 71.11%. Furthermore, the\nweakly-supervised learning model provided significant values for early fixing,\ndespite never seeing the optimal values during training.\n","authors":["Bruno Machado Pacheco","Laio Oriel Seman","Eduardo Camponogara"],"pdf_url":"https://arxiv.org/pdf/2309.00197v1.pdf","comment":"Paper accepted at SBAI 2023"},{"id":"http://arxiv.org/abs/2207.02547v3","updated":"2023-09-01T01:23:17Z","published":"2022-07-06T10:01:46Z","title":"Simple and Efficient Heterogeneous Graph Neural Network","summary":" Heterogeneous graph neural networks (HGNNs) have powerful capability to embed\nrich structural and semantic information of a heterogeneous graph into node\nrepresentations. Existing HGNNs inherit many mechanisms from graph neural\nnetworks (GNNs) over homogeneous graphs, especially the attention mechanism and\nthe multi-layer structure. These mechanisms bring excessive complexity, but\nfew works study whether they are really effective on heterogeneous graphs.\nThis paper conducts an in-depth and detailed study of these mechanisms and\nproposes Simple and Efficient Heterogeneous Graph Neural Network (SeHGNN). 
To\neasily capture structural information, SeHGNN pre-computes the neighbor\naggregation using a light-weight mean aggregator, which reduces complexity by\nremoving overused neighbor attention and avoiding repeated neighbor aggregation\nin every training epoch. To better utilize semantic information, SeHGNN adopts\nthe single-layer structure with long metapaths to extend the receptive field,\nas well as a transformer-based semantic fusion module to fuse features from\ndifferent metapaths. As a result, SeHGNN exhibits the characteristics of simple\nnetwork structure, high prediction accuracy, and fast training speed. Extensive\nexperiments on five real-world heterogeneous graphs demonstrate the superiority\nof SeHGNN over the state-of-the-arts on both accuracy and training speed.\n","authors":["Xiaocheng Yang","Mingyu Yan","Shirui Pan","Xiaochun Ye","Dongrui Fan"],"pdf_url":"https://arxiv.org/pdf/2207.02547v3.pdf","comment":"Accepted by AAAI 2023"},{"id":"http://arxiv.org/abs/2308.15734v2","updated":"2023-09-01T01:10:06Z","published":"2023-08-30T03:21:45Z","title":"Efficient and Explainable Graph Neural Architecture Search via\n Monte-Carlo Tree Search","summary":" Graph neural networks (GNNs) are powerful tools for performing data science\ntasks in various domains. Although we use GNNs in wide application scenarios,\nit is a laborious task for researchers and practitioners to design/select\noptimal GNN architectures in diverse graphs. To save human efforts and\ncomputational costs, graph neural architecture search (Graph NAS) has been used\nto search for a sub-optimal GNN architecture that combines existing components.\nHowever, there are no existing Graph NAS methods that satisfy explainability,\nefficiency, and adaptability to various graphs. Therefore, we propose an\nefficient and explainable Graph NAS method, called ExGNAS, which consists of\n(i) a simple search space that can adapt to various graphs and (ii) a search\nalgorithm that makes the decision process explainable. The search space\nincludes only fundamental functions that can handle homophilic and heterophilic\ngraphs. The search algorithm efficiently searches for the best GNN architecture\nvia Monte-Carlo tree search without neural models. The combination of our\nsearch space and algorithm achieves finding accurate GNN models and the\nimportant functions within the search space. We comprehensively evaluate our\nmethod compared with twelve hand-crafted GNN architectures and three Graph NAS\nmethods in four graphs. Our experimental results show that ExGNAS increases AUC\nup to 3.6 and reduces run time up to 78\\% compared with the state-of-the-art\nGraph NAS methods. Furthermore, we show ExGNAS is effective in analyzing the\ndifference between GNN architectures in homophilic and heterophilic graphs.\n","authors":["Yuya Sasaki"],"pdf_url":"https://arxiv.org/pdf/2308.15734v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.13861v2","updated":"2023-09-01T00:05:04Z","published":"2021-11-27T10:22:29Z","title":"A New Multifractal-based Deep Learning Model for Text Mining","summary":" In this world full of uncertainty, where the fabric of existence weaves\npatterns of complexity, multifractal emerges as beacons of insight,\nilluminating them. 
As we delve into the realm of text mining that underpins\nvarious natural language processing applications and powers a range of\nintelligent services, we recognize that behind the veil of text lies a\nmanifestation of human thought and cognition, intricately intertwined with the\ncomplexities. Building upon the foundation of perceiving text as a complex\nsystem, this study embarks on a journey to unravel the hidden treasures within,\narmed with the proposed multifractal method that deciphers the multifractal\nattributes embedded within the text landscape. This endeavor culminates in the\nbirth of our novel model, which also harnesses the power of the proposed\nactivation function to facilitate nonlinear information transmission within its\nneural network architecture. The success on experiments anchored in real-world\ntechnical reports covering the extraction of technical term and classification\nof hazard events, stands as a testament to our endeavors. This research venture\nnot only expands our understanding of text mining but also opens new horizons\nfor knowledge discovery across various domains.\n","authors":["Zhenhua Wang","Ming Ren","Dong Gao"],"pdf_url":"https://arxiv.org/pdf/2111.13861v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.00615v1","updated":"2023-09-01T17:59:47Z","published":"2023-09-01T17:59:47Z","title":"Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D\n Understanding, Generation, and Instruction Following","summary":" We introduce Point-Bind, a 3D multi-modality model aligning point clouds with\n2D image, language, audio, and video. Guided by ImageBind, we construct a joint\nembedding space between 3D and multi-modalities, enabling many promising\napplications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D\nopen-world understanding. On top of this, we further present Point-LLM, the\nfirst 3D large language model (LLM) following 3D multi-modal instructions. By\nparameter-efficient fine-tuning techniques, Point-LLM injects the semantics of\nPoint-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction\ndata, but exhibits superior 3D and multi-modal question-answering capacity. We\nhope our work may cast a light on the community for extending 3D point clouds\nto multi-modality applications. Code is available at\nhttps://github.com/ZiyuGuo99/Point-Bind_Point-LLM.\n","authors":["Ziyu Guo","Renrui Zhang","Xiangyang Zhu","Yiwen Tang","Xianzheng Ma","Jiaming Han","Kexin Chen","Peng Gao","Xianzhi Li","Hongsheng Li","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2309.00615v1.pdf","comment":"Work in progress. Code is available at\n https://github.com/ZiyuGuo99/Point-Bind_Point-LLM"},{"id":"http://arxiv.org/abs/2309.00398v1","updated":"2023-09-01T11:14:43Z","published":"2023-09-01T11:14:43Z","title":"VideoGen: A Reference-Guided Latent Diffusion Approach for High\n Definition Text-to-Video Generation","summary":" In this paper, we present VideoGen, a text-to-video generation approach,\nwhich can generate a high-definition video with high frame fidelity and strong\ntemporal consistency using reference-guided latent diffusion. We leverage an\noff-the-shelf text-to-image generation model, e.g., Stable Diffusion, to\ngenerate an image with high content quality from the text prompt, as a\nreference image to guide video generation. 
Then, we introduce an efficient\ncascaded latent diffusion module conditioned on both the reference image and\nthe text prompt, for generating latent video representations, followed by a\nflow-based temporal upsampling step to improve the temporal resolution.\nFinally, we map latent video representations into a high-definition video\nthrough an enhanced video decoder. During training, we use the first frame of a\nground-truth video as the reference image for training the cascaded latent\ndiffusion module. The main characteristics of our approach include: the reference\nimage generated by the text-to-image model improves the visual fidelity; using\nit as the condition makes the diffusion model focus more on learning the video\ndynamics; and the video decoder is trained over unlabeled video data, thus\nbenefiting from high-quality easily-available videos. VideoGen sets a new\nstate-of-the-art in text-to-video generation in terms of both qualitative and\nquantitative evaluation.\n","authors":["Xin Li","Wenqing Chu","Ye Wu","Weihang Yuan","Fanglong Liu","Qi Zhang","Fu Li","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00398v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.00347v1","updated":"2023-09-01T09:08:21Z","published":"2023-09-01T09:08:21Z","title":"Towards Contrastive Learning in Music Video Domain","summary":" Contrastive learning is a powerful way of learning multimodal representations\nacross various domains such as image-caption retrieval and audio-visual\nrepresentation learning. In this work, we investigate if these findings\ngeneralize to the domain of music videos. Specifically, we create a dual\nencoder for the audio and video modalities and train it using a bidirectional\ncontrastive loss. For the experiments, we use an industry dataset containing\n550 000 music videos as well as the public Million Song Dataset, and evaluate\nthe quality of learned representations on the downstream tasks of music tagging\nand genre classification. Our results indicate that pre-trained networks\nwithout contrastive fine-tuning outperform our contrastive learning approach\nwhen evaluated on both tasks. To gain a better understanding of the reasons\ncontrastive learning was not successful for music videos, we perform a\nqualitative analysis of the learned representations, revealing why contrastive\nlearning might have difficulties uniting embeddings from two modalities. Based\non these findings, we outline possible directions for future work. To\nfacilitate the reproducibility of our results, we share our code and the\npre-trained model.\n","authors":["Karel Veldkamp","Mariya Hendriksen","Zoltán Szlávik","Alexander Keijser"],"pdf_url":"https://arxiv.org/pdf/2309.00347v1.pdf","comment":"6 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2309.00216v1","updated":"2023-09-01T02:27:05Z","published":"2023-09-01T02:27:05Z","title":"Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation","summary":" Facial sketch synthesis (FSS) aims to generate a vivid sketch portrait from a\ngiven facial photo. Existing FSS methods merely rely on 2D representations of\nfacial semantic or appearance. However, professional human artists usually use\noutlines or shadings to convey 3D geometry. Thus facial 3D geometry (e.g. depth\nmap) is extremely important for FSS. Besides, different artists may use diverse\ndrawing techniques and create multiple styles of sketches; but the style is\nglobally consistent in a sketch. 
Inspired by such observations, in this paper,\nwe propose a novel Human-Inspired Dynamic Adaptation (HIDA) method. Specially,\nwe propose to dynamically modulate neuron activations based on a joint\nconsideration of both facial 3D geometry and 2D appearance, as well as globally\nconsistent style control. Besides, we use deformable convolutions at\ncoarse-scales to align deep features, for generating abstract and distinct\noutlines. Experiments show that HIDA can generate high-quality sketches in\nmultiple styles, and significantly outperforms previous methods, over a large\nrange of challenging faces. Besides, HIDA allows precise style control of the\nsynthesized sketch, and generalizes well to natural scenes and other artistic\nstyles. Our code and results have been released online at:\nhttps://github.com/AiArt-HDU/HIDA.\n","authors":["Fei Gao","Yifan Zhu","Chang Jiang","Nannan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00216v1.pdf","comment":"To appear on ICCV'23"}]},"2023-09-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.02427v1","updated":"2023-09-05T17:56:20Z","published":"2023-09-05T17:56:20Z","title":"Cognitive Architectures for Language Agents","summary":" Recent efforts have incorporated large language models (LLMs) with external\nresources (e.g., the Internet) or internal control flows (e.g., prompt\nchaining) for tasks requiring grounding or reasoning. However, these efforts\nhave largely been piecemeal, lacking a systematic framework for constructing a\nfully-fledged language agent. To address this challenge, we draw on the rich\nhistory of agent design in symbolic artificial intelligence to develop a\nblueprint for a new wave of cognitive language agents. We first show that LLMs\nhave many of the same properties as production systems, and recent efforts to\nimprove their grounding or reasoning mirror the development of cognitive\narchitectures built around production systems. We then propose Cognitive\nArchitectures for Language Agents (CoALA), a conceptual framework to\nsystematize diverse methods for LLM-based reasoning, grounding, learning, and\ndecision making as instantiations of language agents in the framework. Finally,\nwe use the CoALA framework to highlight gaps and propose actionable directions\ntoward more capable language agents in the future.\n","authors":["Theodore Sumers","Shunyu Yao","Karthik Narasimhan","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2309.02427v1.pdf","comment":"16 pages of main content, 10 pages of references, 5 figures. Equal\n contribution among the first two authors, order decided by coin flip. A\n CoALA-based repo of recent work on language agents:\n https://github.com/ysymyth/awesome-language-agents"},{"id":"http://arxiv.org/abs/2308.16458v2","updated":"2023-09-05T17:51:16Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. 
In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02403v1","updated":"2023-09-05T17:33:59Z","published":"2023-09-05T17:33:59Z","title":"Substitution-based Semantic Change Detection using Contextual Embeddings","summary":" Measuring semantic change has thus far remained a task where methods using\ncontextual embeddings have struggled to improve upon simpler techniques relying\nonly on static word vectors. Moreover, many of the previously proposed\napproaches suffer from downsides related to scalability and ease of\ninterpretation. We present a simplified approach to measuring semantic change\nusing contextual embeddings, relying only on the most probable substitutes for\nmasked terms. Not only is this approach directly interpretable, it is also far\nmore efficient in terms of storage, achieves superior average performance\nacross the most frequently cited datasets for this task, and allows for more\nnuanced investigation of change than is possible with static word vectors.\n","authors":["Dallas Card"],"pdf_url":"https://arxiv.org/pdf/2309.02403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02373v1","updated":"2023-09-05T16:35:41Z","published":"2023-09-05T16:35:41Z","title":"nanoT5: A PyTorch Framework for Pre-training and Fine-tuning T5-style\n Models with Limited Resources","summary":" State-of-the-art language models like T5 have revolutionized the NLP\nlandscape, but their computational demands hinder a large portion of the\nresearch community. To address this challenge, we present nanoT5, a\nspecially-optimized PyTorch framework for efficient pre-training and\nfine-tuning of T5 models. Drawing on insights from optimizer differences and\nprioritizing efficiency, nanoT5 allows a T5-Base model to be pre-trained on a\nsingle GPU in just 16 hours, without any loss in performance. With the\nintroduction of this open-source framework, we hope to widen the accessibility\nto language modelling research and cater to the community's demand for more\nuser-friendly T5 (Encoder-Decoder) implementations. 
Our contributions,\nincluding configurations, codebase, software/hardware insights, and pre-trained\nmodels, are available to the public, aiming to strike a balance between\nresearch accessibility and resource constraints in NLP.\n","authors":["Piotr Nawrot"],"pdf_url":"https://arxiv.org/pdf/2309.02373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15049v3","updated":"2023-09-05T15:33:49Z","published":"2023-03-27T09:46:56Z","title":"InterviewBot: Real-Time End-to-End Dialogue System to Interview Students\n for College Admission","summary":" We present the InterviewBot that dynamically integrates conversation history\nand customized topics into a coherent embedding space to conduct 10 mins\nhybrid-domain (open and closed) conversations with foreign students applying to\nU.S. colleges for assessing their academic and cultural readiness. To build a\nneural-based end-to-end dialogue model, 7,361 audio recordings of\nhuman-to-human interviews are automatically transcribed, where 440 are manually\ncorrected for finetuning and evaluation. To overcome the input/output size\nlimit of a transformer-based encoder-decoder model, two new methods are\nproposed, context attention and topic storing, allowing the model to make\nrelevant and consistent interactions. Our final model is tested both\nstatistically by comparing its responses to the interview data and dynamically\nby inviting professional interviewers and various students to interact with it\nin real-time, finding it highly satisfactory in fluency and context awareness.\n","authors":["Zihao Wang","Nathan Keyes","Terry Crawford","Jinho D. Choi"],"pdf_url":"https://arxiv.org/pdf/2303.15049v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02311v1","updated":"2023-09-05T15:27:22Z","published":"2023-09-05T15:27:22Z","title":"Weigh Your Own Words: Improving Hate Speech Counter Narrative Generation\n via Attention Regularization","summary":" Recent computational approaches for combating online hate speech involve the\nautomatic generation of counter narratives by adapting Pretrained\nTransformer-based Language Models (PLMs) with human-curated data. This process,\nhowever, can produce in-domain overfitting, resulting in models generating\nacceptable narratives only for hatred similar to training data, with little\nportability to other targets or to real-world toxic language. This paper\nintroduces novel attention regularization methodologies to improve the\ngeneralization capabilities of PLMs for counter narratives generation.\nOverfitting to training-specific terms is then discouraged, resulting in more\ndiverse and richer narratives. We experiment with two attention-based\nregularization techniques on a benchmark English dataset. Regularized models\nproduce better counter narratives than state-of-the-art approaches in most\ncases, both in terms of automatic metrics and human evaluation, especially when\nhateful targets are not present in the training data. 
This work paves the way\nfor better and more flexible counter-speech generation models, a task for which\ndatasets are highly challenging to produce.\n","authors":["Helena Bonaldi","Giuseppe Attanasio","Debora Nozza","Marco Guerini"],"pdf_url":"https://arxiv.org/pdf/2309.02311v1.pdf","comment":"To appear at CS4OA workshop (INLG-SIGDial)"},{"id":"http://arxiv.org/abs/2308.16137v2","updated":"2023-09-05T15:09:11Z","published":"2023-08-30T16:47:51Z","title":"LM-Infinite: Simple On-the-Fly Length Generalization for Large Language\n Models","summary":" In recent years, there have been remarkable advancements in the performance\nof Transformer-based Large Language Models (LLMs) across various domains. As\nthese LLMs are deployed for increasingly complex tasks, they often face the\nneed to conduct longer reasoning processes or understand larger contexts. In\nthese situations, the length generalization failure of LLMs on long sequences\nbecomes more prominent. Most pre-training schemes truncate training sequences\nto a fixed length. LLMs often struggle to generate fluent and coherent texts,\nlet alone carry out downstream tasks, after longer contexts, even with relative\npositional encoding designed to cope with this problem. Common solutions such\nas finetuning on longer corpora often involve daunting hardware and time costs\nand require careful training process design. To more efficiently leverage the\ngeneration capacity of existing LLMs, we theoretically and empirically\ninvestigate the main out-of-distribution (OOD) factors contributing to this\nproblem. Inspired by this diagnosis, we propose a simple yet effective solution\nfor on-the-fly length generalization, LM-Infinite. It involves only a\n$\\Lambda$-shaped attention mask (to avoid excessive attended tokens) and a\ndistance limit (to avoid unseen distances) while requiring no parameter updates\nor learning. We find it applicable to a variety of LLMs using relative-position\nencoding methods. LM-Infinite is computationally efficient with $O(n)$ time and\nspace, and demonstrates consistent text generation fluency and quality to as\nlong as 32k tokens on ArXiv and OpenWebText2 datasets, with 2.72x decoding\nspeedup. On downstream tasks such as passkey retrieval, it continues to work on\ninputs much longer than training lengths where vanilla models fail immediately.\n","authors":["Chi Han","Qifan Wang","Wenhan Xiong","Yu Chen","Heng Ji","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16137v2.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.02285v1","updated":"2023-09-05T14:45:27Z","published":"2023-09-05T14:45:27Z","title":"PromptTTS 2: Describing and Generating Voices with Text Prompt","summary":" Speech conveys more information than just text, as the same word can be\nuttered in various voices to convey diverse information. Compared to\ntraditional text-to-speech (TTS) methods relying on speech prompts (reference\nspeech) for voice variability, using text prompts (descriptions) is more\nuser-friendly since speech prompts can be hard to find or may not exist at all.\nTTS approaches based on the text prompt face two challenges: 1) the one-to-many\nproblem, where not all details about voice variability can be described in the\ntext prompt, and 2) the limited availability of text prompt datasets, where\nvendors and large cost of data labeling are required to write text prompt for\nspeech. 
In this work, we introduce PromptTTS 2 to address these challenges with\na variation network to provide variability information of voice not captured by\ntext prompts, and a prompt generation pipeline to utilize the large language\nmodels (LLM) to compose high quality text prompts. Specifically, the variation\nnetwork predicts the representation extracted from the reference speech (which\ncontains full information about voice) based on the text prompt representation.\nFor the prompt generation pipeline, it generates text prompts for speech with a\nspeech understanding model to recognize voice attributes (e.g., gender, speed)\nfrom speech and a large language model to formulate text prompt based on the\nrecognition results. Experiments on a large-scale (44K hours) speech dataset\ndemonstrate that compared to the previous works, PromptTTS 2 generates voices\nmore consistent with text prompts and supports the sampling of diverse voice\nvariability, thereby offering users more choices on voice generation.\nAdditionally, the prompt generation pipeline produces high-quality prompts,\neliminating the large labeling cost. The demo page of PromptTTS 2 is available\nonline\\footnote{https://speechresearch.github.io/prompttts2}.\n","authors":["Yichong Leng","Zhifang Guo","Kai Shen","Xu Tan","Zeqian Ju","Yanqing Liu","Yufei Liu","Dongchao Yang","Leying Zhang","Kaitao Song","Lei He","Xiang-Yang Li","Sheng Zhao","Tao Qin","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2309.02285v1.pdf","comment":"Demo page: https://speechresearch.github.io/prompttts2"},{"id":"http://arxiv.org/abs/2309.02240v1","updated":"2023-09-05T13:47:25Z","published":"2023-09-05T13:47:25Z","title":"Dialog Action-Aware Transformer for Dialog Policy Learning","summary":" Recent works usually address Dialog policy learning DPL by training a\nreinforcement learning (RL) agent to determine the best dialog action. However,\nexisting works on deep RL require a large volume of agent-user interactions to\nachieve acceptable performance. In this paper, we propose to make full use of\nthe plain text knowledge from the pre-trained language model to accelerate the\nRL agent's learning speed. Specifically, we design a dialog action-aware\ntransformer encoder (DaTrans), which integrates a new fine-tuning procedure\nnamed masked last action task to encourage DaTrans to be dialog-aware and\ndistils action-specific features. Then, DaTrans is further optimized in an RL\nsetting with ongoing interactions and evolves through exploration in the dialog\naction space toward maximizing long-term accumulated rewards. The effectiveness\nand efficiency of the proposed model are demonstrated with both simulator\nevaluation and human evaluation.\n","authors":["Huimin Wang","Wai-Chung Kwan","Kam-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2309.02240v1.pdf","comment":"To be appeared in SIGdial 2023"},{"id":"http://arxiv.org/abs/2309.02233v1","updated":"2023-09-05T13:39:38Z","published":"2023-09-05T13:39:38Z","title":"Augmenting Black-box LLMs with Medical Textbooks for Clinical Question\n Answering","summary":" Large-scale language models (LLMs), such as ChatGPT, are capable of\ngenerating human-like responses for various downstream tasks, such as\ntask-oriented dialogues and question answering. However, applying LLMs to\nmedical domains remains challenging due to their inability to leverage\ndomain-specific knowledge. 
In this study, we present the Large-scale Language\nModels Augmented with Medical Textbooks (LLM-AMT), which integrates\nauthoritative medical textbooks as the cornerstone of its design, enhancing its\nproficiency in the specialized domain through plug-and-play modules, comprised\nof a Hybrid Textbook Retriever, supplemented by the Query Augmenter and the LLM\nReader. Experimental evaluation on three open-domain medical question-answering\ntasks reveals a substantial enhancement in both the professionalism and\naccuracy of the LLM responses when utilizing LLM-AMT, exhibiting an improvement\nranging from 11.4% to 13.2%. Despite being 100 times smaller, we found that\nmedical textbooks as the retrieval corpus serves as a more valuable external\nknowledge source than Wikipedia in the medical domain. Our experiments show\nthat textbook augmentation results in a performance improvement ranging from\n9.7% to 12.2% over Wikipedia augmentation.\n","authors":["Yubo Wang","Xueguang Ma","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00936v2","updated":"2023-09-05T13:36:27Z","published":"2023-06-01T17:39:40Z","title":"AMR4NLI: Interpretable and robust NLI measures from semantic graphs","summary":" The task of natural language inference (NLI) asks whether a given premise\n(expressed in NL) entails a given NL hypothesis. NLI benchmarks contain human\nratings of entailment, but the meaning relationships driving these ratings are\nnot formalized. Can the underlying sentence pair relationships be made more\nexplicit in an interpretable yet robust fashion? We compare semantic structures\nto represent premise and hypothesis, including sets of contextualized\nembeddings and semantic graphs (Abstract Meaning Representations), and measure\nwhether the hypothesis is a semantic substructure of the premise, utilizing\ninterpretable metrics. Our evaluation on three English benchmarks finds value\nin both contextualized embeddings and semantic graphs; moreover, they provide\ncomplementary signals, and can be leveraged together in a hybrid model.\n","authors":["Juri Opitz","Shira Wein","Julius Steen","Anette Frank","Nathan Schneider"],"pdf_url":"https://arxiv.org/pdf/2306.00936v2.pdf","comment":"International Conference on Computational Semantics (IWCS 2023); v2\n fixes an imprecise sentence below Eq. 5"},{"id":"http://arxiv.org/abs/2309.02189v1","updated":"2023-09-05T12:48:21Z","published":"2023-09-05T12:48:21Z","title":"Leveraging BERT Language Models for Multi-Lingual ESG Issue\n Identification","summary":" Environmental, Social, and Governance (ESG) has been used as a metric to\nmeasure the negative impacts and enhance positive outcomes of companies in\nareas such as the environment, society, and governance. Recently, investors\nhave increasingly recognized the significance of ESG criteria in their\ninvestment choices, leading businesses to integrate ESG principles into their\noperations and strategies. The Multi-Lingual ESG Issue Identification (ML-ESG)\nshared task encompasses the classification of news documents into 35 distinct\nESG issue labels. In this study, we explored multiple strategies harnessing\nBERT language models to achieve accurate classification of news documents\nacross these labels. Our analysis revealed that the RoBERTa classifier emerged\nas one of the most successful approaches, securing the second-place position\nfor the English test dataset, and sharing the fifth-place position for the\nFrench test dataset. 
Furthermore, our SVM-based binary model tailored for the\nChinese language exhibited exceptional performance, earning the second-place\nrank on the test dataset.\n","authors":["Elvys Linhares Pontes","Mohamed Benjannet","Lam Kim Ming"],"pdf_url":"https://arxiv.org/pdf/2309.02189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02188v1","updated":"2023-09-05T12:47:44Z","published":"2023-09-05T12:47:44Z","title":"Incorporating Dictionaries into a Neural Network Architecture to Extract\n COVID-19 Medical Concepts From Social Media","summary":" We investigate the potential benefit of incorporating dictionary information\ninto a neural network architecture for natural language processing. In\nparticular, we make use of this architecture to extract several concepts\nrelated to COVID-19 from an on-line medical forum. We use a sample from the\nforum to manually curate one dictionary for each concept. In addition, we use\nMetaMap, which is a tool for extracting biomedical concepts, to identify a\nsmall number of semantic concepts. For a supervised concept extraction task on\nthe forum data, our best model achieved a macro $F_1$ score of 90\\%. A major\ndifficulty in medical concept extraction is obtaining labelled data from which\nto build supervised models. We investigate the utility of our models to\ntransfer to data derived from a different source in two ways. First for\nproducing labels via weak learning and second to perform concept extraction.\nThe dataset we use in this case comprises COVID-19 related tweets and we\nachieve an $F_1$ score 81\\% for symptom concept extraction trained on weakly\nlabelled data. The utility of our dictionaries is compared with a COVID-19\nsymptom dictionary that was constructed directly from Twitter. Further\nexperiments that incorporate BERT and a COVID-19 version of BERTweet\ndemonstrate that the dictionaries provide a commensurate result. Our results\nshow that incorporating small domain dictionaries to deep learning models can\nimprove concept extraction tasks. Moreover, models built using dictionaries\ngeneralize well and are transferable to different datasets on a similar task.\n","authors":["Abul Hasan","Mark Levene","David Weston"],"pdf_url":"https://arxiv.org/pdf/2309.02188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02162v1","updated":"2023-09-05T11:59:31Z","published":"2023-09-05T11:59:31Z","title":"Advancing Text-to-GLOSS Neural Translation Using a Novel Hyper-parameter\n Optimization Technique","summary":" In this paper, we investigate the use of transformers for Neural Machine\nTranslation of text-to-GLOSS for Deaf and Hard-of-Hearing communication. Due to\nthe scarcity of available data and limited resources for text-to-GLOSS\ntranslation, we treat the problem as a low-resource language task. We use our\nnovel hyper-parameter exploration technique to explore a variety of\narchitectural parameters and build an optimal transformer-based architecture\nspecifically tailored for text-to-GLOSS translation. The study aims to improve\nthe accuracy and fluency of Neural Machine Translation generated GLOSS. This is\nachieved by examining various architectural parameters including layer count,\nattention heads, embedding dimension, dropout, and label smoothing to identify\nthe optimal architecture for improving text-to-GLOSS translation performance.\nThe experiments conducted on the PHOENIX14T dataset reveal that the optimal\ntransformer architecture outperforms previous work on the same dataset. 
The\nbest model reaches a ROUGE (Recall-Oriented Understudy for Gisting Evaluation)\nscore of 55.18% and a BLEU-1 (BiLingual Evaluation Understudy 1) score of\n63.6%, outperforming state-of-the-art results on the BLEU1 and ROUGE score by\n8.42 and 0.63 respectively.\n","authors":["Younes Ouargani","Noussaima El Khattabi"],"pdf_url":"https://arxiv.org/pdf/2309.02162v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.11773v2","updated":"2023-09-05T11:59:25Z","published":"2023-08-22T20:30:59Z","title":"Identifying depression-related topics in smartphone-collected\n free-response speech recordings using an automatic speech recognition system\n and a deep learning topic model","summary":" Language use has been shown to correlate with depression, but large-scale\nvalidation is needed. Traditional methods like clinic studies are expensive.\nSo, natural language processing has been employed on social media to predict\ndepression, but limitations remain-lack of validated labels, biased user\nsamples, and no context. Our study identified 29 topics in 3919\nsmartphone-collected speech recordings from 265 participants using the Whisper\ntool and BERTopic model. Six topics with a median PHQ-8 greater than or equal\nto 10 were regarded as risk topics for depression: No Expectations, Sleep,\nMental Therapy, Haircut, Studying, and Coursework. To elucidate the topic\nemergence and associations with depression, we compared behavioral (from\nwearables) and linguistic characteristics across identified topics. The\ncorrelation between topic shifts and changes in depression severity over time\nwas also investigated, indicating the importance of longitudinally monitoring\nlanguage use. We also tested the BERTopic model on a similar smaller dataset\n(356 speech recordings from 57 participants), obtaining some consistent\nresults. In summary, our findings demonstrate specific speech topics may\nindicate depression severity. The presented data-driven workflow provides a\npractical approach to collecting and analyzing large-scale speech data from\nreal-world settings for digital health research.\n","authors":["Yuezhou Zhang","Amos A Folarin","Judith Dineley","Pauline Conde","Valeria de Angel","Shaoxiong Sun","Yatharth Ranjan","Zulqarnain Rashid","Callum Stewart","Petroula Laiou","Heet Sankesara","Linglong Qian","Faith Matcham","Katie M White","Carolin Oetzmann","Femke Lamers","Sara Siddi","Sara Simblett","Björn W. Schuller","Srinivasan Vairavan","Til Wykes","Josep Maria Haro","Brenda WJH Penninx","Vaibhav A Narayan","Matthew Hotopf","Richard JB Dobson","Nicholas Cummins","RADAR-CNS consortium"],"pdf_url":"https://arxiv.org/pdf/2308.11773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02145v1","updated":"2023-09-05T11:34:21Z","published":"2023-09-05T11:34:21Z","title":"Bring the Noise: Introducing Noise Robustness to Pretrained Automatic\n Speech Recognition","summary":" In recent research, in the domain of speech processing, large End-to-End\n(E2E) systems for Automatic Speech Recognition (ASR) have reported\nstate-of-the-art performance on various benchmarks. These systems intrinsically\nlearn how to handle and remove noise conditions from speech. Previous research\nhas shown, that it is possible to extract the denoising capabilities of these\nmodels into a preprocessor network, which can be used as a frontend for\ndownstream ASR models. However, the proposed methods were limited to specific\nfully convolutional architectures. 
In this work, we propose a novel method to\nextract the denoising capabilities, that can be applied to any encoder-decoder\narchitecture. We propose the Cleancoder preprocessor architecture that extracts\nhidden activations from the Conformer ASR model and feeds them to a decoder to\npredict denoised spectrograms. We train our pre-processor on the Noisy Speech\nDatabase (NSD) to reconstruct denoised spectrograms from noisy inputs. Then, we\nevaluate our model as a frontend to a pretrained Conformer ASR model as well as\na frontend to train smaller Conformer ASR models from scratch. We show that the\nCleancoder is able to filter noise from speech and that it improves the total\nWord Error Rate (WER) of the downstream model in noisy conditions for both\napplications.\n","authors":["Patrick Eickhoff","Matthias Möller","Theresa Pekarek Rosin","Johannes Twiefel","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2309.02145v1.pdf","comment":"Submitted and accepted for ICANN 2023 (32nd International Conference\n on Artificial Neural Networks)"},{"id":"http://arxiv.org/abs/2309.02144v1","updated":"2023-09-05T11:32:48Z","published":"2023-09-05T11:32:48Z","title":"Making Large Language Models Better Reasoners with Alignment","summary":" Reasoning is a cognitive process of using evidence to reach a sound\nconclusion. The reasoning capability is essential for large language models\n(LLMs) to serve as the brain of the artificial general intelligence agent.\nRecent studies reveal that fine-tuning LLMs on data with the chain of thought\n(COT) reasoning process can significantly enhance their reasoning capabilities.\nHowever, we find that the fine-tuned LLMs suffer from an \\textit{Assessment\nMisalignment} problem, i.e., they frequently assign higher scores to subpar\nCOTs, leading to potential limitations in their reasoning abilities. To address\nthis problem, we introduce an \\textit{Alignment Fine-Tuning (AFT)} paradigm,\nwhich involves three steps: 1) fine-tuning LLMs with COT training data; 2)\ngenerating multiple COT responses for each question, and categorizing them into\npositive and negative ones based on whether they achieve the correct answer; 3)\ncalibrating the scores of positive and negative responses given by LLMs with a\nnovel constraint alignment loss. Specifically, the constraint alignment loss\nhas two objectives: a) Alignment, which guarantees that positive scores surpass\nnegative scores to encourage answers with high-quality COTs; b) Constraint,\nwhich keeps the negative scores confined to a reasonable range to prevent the\nmodel degradation. Beyond just the binary positive and negative feedback, the\nconstraint alignment loss can be seamlessly adapted to the ranking situations\nwhen ranking feedback is accessible. Furthermore, we also delve deeply into\nrecent ranking-based alignment methods, such as DPO, RRHF, and PRO, and\ndiscover that the constraint, which has been overlooked by these approaches, is\nalso crucial for their performance. 
Extensive experiments on four reasoning\nbenchmarks with both binary and ranking feedback demonstrate the effectiveness\nof AFT.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Feifan Song","Binghuai Lin","Yunbo Cao","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2309.02144v1.pdf","comment":"Large Language Models; Reasoning; Alignment"},{"id":"http://arxiv.org/abs/2309.02133v1","updated":"2023-09-05T11:22:08Z","published":"2023-09-05T11:22:08Z","title":"Evaluating Methods for Ground-Truth-Free Foreign Accent Conversion","summary":" Foreign accent conversion (FAC) is a special application of voice conversion\n(VC) which aims to convert the accented speech of a non-native speaker to a\nnative-sounding speech with the same speaker identity. FAC is difficult since\nthe native speech from the desired non-native speaker to be used as the\ntraining target is impossible to collect. In this work, we evaluate three\nrecently proposed methods for ground-truth-free FAC, where all of them aim to\nharness the power of sequence-to-sequence (seq2seq) and non-parallel VC models\nto properly convert the accent and control the speaker identity. Our\nexperimental evaluation results show that no single method was significantly\nbetter than the others in all evaluation axes, which is in contrast to\nconclusions drawn in previous studies. We also explain the effectiveness of\nthese methods with the training input and output of the seq2seq model and\nexamine the design choice of the non-parallel VC model, and show that\nintelligibility measures such as word error rates do not correlate well with\nsubjective accentedness. Finally, our implementation is open-sourced to promote\nreproducible research and help future researchers improve upon the compared\nsystems.\n","authors":["Wen-Chin Huang","Tomoki Toda"],"pdf_url":"https://arxiv.org/pdf/2309.02133v1.pdf","comment":"Accepted to the 2023 Asia Pacific Signal and Information Processing\n Association Annual Summit and Conference (APSIPA ASC). Demo page:\n https://unilight.github.io/Publication-Demos/publications/fac-evaluate. Code:\n https://github.com/unilight/seq2seq-vc"},{"id":"http://arxiv.org/abs/2309.02110v1","updated":"2023-09-05T10:38:53Z","published":"2023-09-05T10:38:53Z","title":"Wordle: A Microcosm of Life. Luck, Skill, Cheating, Loyalty, and\n Influence!","summary":" Wordle is a popular, online word game offered by the New York Times\n(nytimes.com). Currently there are some 2 million players of the English\nversion worldwide. Players have 6 attempts to guess the daily word (target\nword) and after each attempt, the player receives color-coded information about\nthe correctness and position of each letter in the guess. After either a\nsuccessful completion of the puzzle or the final unsuccessful attempt, software\ncan assess the player's luck and skill using Information Theory and can display\ndata for the first, second, ..., sixth guesses of a random sample of all\nplayers. Recently, I discovered that the latter data is presented in a format\nthat can easily be copied and pasted into a spreadsheet. I compiled data on\nWordle players' first guesses from May 2023 - August 2023 and inferred some\ninteresting information about Wordle players. A) Every day, about 0.2-0.5% of\nplayers solve the puzzle in one attempt. 
Because the odds of guessing the one\nof 2,315 possible target words at random is 0.043%, this implies that 4,000 -\n10,000 players cheat by obtaining the target word outside of playing the game!\nB) At least 1/3 of the players have a favorite starting word, or cycle through\nseveral. And even though players should be aware that target words are never\nrepeated, most players appear to remain loyal to their starting word even after\nits appearance as a target word. C) On August 15, 2023, about 30,000 players\nabruptly changed their starting word, presumably based on a crossword puzzle\nclue! Wordle players can be influenced! This study goes beyond social media\npostings, surveys, and Google Trends to provide solid, quantitative evidence\nabout cheating in Wordle.\n","authors":["James P. Dilger"],"pdf_url":"https://arxiv.org/pdf/2309.02110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02106v1","updated":"2023-09-05T10:26:32Z","published":"2023-09-05T10:26:32Z","title":"Leveraging Label Information for Multimodal Emotion Recognition","summary":" Multimodal emotion recognition (MER) aims to detect the emotional status of a\ngiven expression by combining the speech and text information. Intuitively,\nlabel information should be capable of helping the model locate the salient\ntokens/frames relevant to the specific emotion, which finally facilitates the\nMER task. Inspired by this, we propose a novel approach for MER by leveraging\nlabel information. Specifically, we first obtain the representative label\nembeddings for both text and speech modalities, then learn the label-enhanced\ntext/speech representations for each utterance via label-token and label-frame\ninteractions. Finally, we devise a novel label-guided attentive fusion module\nto fuse the label-aware text and speech representations for emotion\nclassification. Extensive experiments were conducted on the public IEMOCAP\ndataset, and experimental results demonstrate that our proposed approach\noutperforms existing baselines and achieves new state-of-the-art performance.\n","authors":["Peiying Wang","Sunlu Zeng","Junqing Chen","Lu Fan","Meng Chen","Youzheng Wu","Xiaodong He"],"pdf_url":"https://arxiv.org/pdf/2309.02106v1.pdf","comment":"Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2309.02105v1","updated":"2023-09-05T10:26:02Z","published":"2023-09-05T10:26:02Z","title":"Improving Query-Focused Meeting Summarization with Query-Relevant\n Knowledge","summary":" Query-Focused Meeting Summarization (QFMS) aims to generate a summary of a\ngiven meeting transcript conditioned upon a query. The main challenges for QFMS\nare the long input text length and sparse query-relevant information in the\nmeeting transcript. In this paper, we propose a knowledge-enhanced two-stage\nframework called Knowledge-Aware Summarizer (KAS) to tackle the challenges. In\nthe first stage, we introduce knowledge-aware scores to improve the\nquery-relevant segment extraction. In the second stage, we incorporate\nquery-relevant knowledge in the summary generation. 
Experimental results on the\nQMSum dataset show that our approach achieves state-of-the-art performance.\nFurther analysis proves the competency of our methods in generating relevant\nand faithful summaries.\n","authors":["Tiezheng Yu","Ziwei Ji","Pascale Fung"],"pdf_url":"https://arxiv.org/pdf/2309.02105v1.pdf","comment":"AACL 2023 Findings"},{"id":"http://arxiv.org/abs/2309.02092v1","updated":"2023-09-05T09:56:29Z","published":"2023-09-05T09:56:29Z","title":"Bridging Emotion Role Labeling and Appraisal-based Emotion Analysis","summary":" The term emotion analysis in text subsumes various natural language\nprocessing tasks which have in common the goal to enable computers to\nunderstand emotions. Most popular is emotion classification in which one or\nmultiple emotions are assigned to a predefined textual unit. While such setting\nis appropriate to identify the reader's or author's emotion, emotion role\nlabeling adds the perspective of mentioned entities and extracts text spans\nthat correspond to the emotion cause. The underlying emotion theories agree on\none important point; that an emotion is caused by some internal or external\nevent and comprises several subcomponents, including the subjective feeling and\na cognitive evaluation. We therefore argue that emotions and events are related\nin two ways. (1) Emotions are events; and this perspective is the fundament in\nNLP for emotion role labeling. (2) Emotions are caused by events; a perspective\nthat is made explicit with research how to incorporate psychological appraisal\ntheories in NLP models to interpret events. These two research directions, role\nlabeling and (event-focused) emotion classification, have by and large been\ntackled separately. We contributed to both directions with the projects SEAT\n(Structured Multi-Domain Emotion Analysis from Text) and CEAT (Computational\nEvent Evaluation based on Appraisal Theories for Emotion Analysis), both funded\nby the German Research Foundation. In this paper, we consolidate the findings\nand point out open research questions.\n","authors":["Roman Klinger"],"pdf_url":"https://arxiv.org/pdf/2309.02092v1.pdf","comment":"under review for https://bigpictureworkshop.com/"},{"id":"http://arxiv.org/abs/2308.16469v2","updated":"2023-09-05T09:34:55Z","published":"2023-08-31T05:25:04Z","title":"Link Prediction for Wikipedia Articles as a Natural Language Inference\n Task","summary":" Link prediction task is vital to automatically understanding the structure of\nlarge knowledge bases. In this paper, we present our system to solve this task\nat the Data Science and Advanced Analytics 2023 Competition \"Efficient and\nEffective Link Prediction\" (DSAA-2023 Competition) with a corpus containing\n948,233 training and 238,265 for public testing. This paper introduces an\napproach to link prediction in Wikipedia articles by formulating it as a\nnatural language inference (NLI) task. Drawing inspiration from recent\nadvancements in natural language processing and understanding, we cast link\nprediction as an NLI task, wherein the presence of a link between two articles\nis treated as a premise, and the task is to determine whether this premise\nholds based on the information presented in the articles. We implemented our\nsystem based on the Sentence Pair Classification for Link Prediction for the\nWikipedia Articles task. Our system achieved 0.99996 Macro F1-score and 1.00000\nMacro F1-score for the public and private test sets, respectively. 
Our team\nUIT-NLP ranked 3rd in performance on the private test set, equal to the scores\nof the first and second places. Our code is publicly for research purposes.\n","authors":["Chau-Thang Phan","Quoc-Nam Nguyen","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.16469v2.pdf","comment":"Accepted at the 10th IEEE International Conference On Data Science\n And Advanced Analytics (DSAA 2023)"},{"id":"http://arxiv.org/abs/2309.02077v1","updated":"2023-09-05T09:24:48Z","published":"2023-09-05T09:24:48Z","title":"An Automatic Evaluation Framework for Multi-turn Medical Consultations\n Capabilities of Large Language Models","summary":" Large language models (LLMs) have achieved significant success in interacting\nwith human. However, recent studies have revealed that these models often\nsuffer from hallucinations, leading to overly confident but incorrect\njudgments. This limits their application in the medical domain, where tasks\nrequire the utmost accuracy. This paper introduces an automated evaluation\nframework that assesses the practical capabilities of LLMs as virtual doctors\nduring multi-turn consultations. Consultation tasks are designed to require\nLLMs to be aware of what they do not know, to inquire about missing medical\ninformation from patients, and to ultimately make diagnoses. To evaluate the\nperformance of LLMs for these tasks, a benchmark is proposed by reformulating\nmedical multiple-choice questions from the United States Medical Licensing\nExaminations (USMLE), and comprehensive evaluation metrics are developed and\nevaluated on three constructed test sets. A medical consultation training set\nis further constructed to improve the consultation ability of LLMs. The results\nof the experiments show that fine-tuning with the training set can alleviate\nhallucinations and improve LLMs' performance on the proposed benchmark.\nExtensive experiments and ablation studies are conducted to validate the\neffectiveness and robustness of the proposed framework.\n","authors":["Yusheng Liao","Yutong Meng","Hongcheng Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02077v1.pdf","comment":"10 pages, 9figures"},{"id":"http://arxiv.org/abs/2309.02045v1","updated":"2023-09-05T08:44:23Z","published":"2023-09-05T08:44:23Z","title":"Enhance Multi-domain Sentiment Analysis of Review Texts through\n Prompting Strategies","summary":" Large Language Models (LLMs) have made significant strides in both scientific\nresearch and practical applications. Existing studies have demonstrated the\nstate-of-the-art (SOTA) performance of LLMs in various natural language\nprocessing tasks. However, the question of how to further enhance LLMs'\nperformance in specific task using prompting strategies remains a pivotal\nconcern. This paper explores the enhancement of LLMs' performance in sentiment\nanalysis through the application of prompting strategies. We formulate the\nprocess of prompting for sentiment analysis tasks and introduce two novel\nstrategies tailored for sentiment analysis: RolePlaying (RP) prompting and\nChain-of-thought (CoT) prompting. Specifically, we also propose the RP-CoT\nprompting strategy which is a combination of RP prompting and CoT prompting. We\nconduct comparative experiments on three distinct domain datasets to evaluate\nthe effectiveness of the proposed sentiment analysis strategies. The results\ndemonstrate that the adoption of the proposed prompting strategies leads to a\nincreasing enhancement in sentiment analysis accuracy. 
Further, the CoT\nprompting strategy exhibits a notable impact on implicit sentiment analysis,\nwith the RP-CoT prompting strategy delivering the most superior performance\namong all strategies.\n","authors":["Yajing Wang","Zongwei Luo"],"pdf_url":"https://arxiv.org/pdf/2309.02045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00085v2","updated":"2023-09-05T05:45:30Z","published":"2023-07-28T01:52:16Z","title":"Reasoning before Responding: Integrating Commonsense-based Causality\n Explanation for Empathetic Response Generation","summary":" Recent approaches to empathetic response generation try to incorporate\ncommonsense knowledge or reasoning about the causes of emotions to better\nunderstand the user's experiences and feelings. However, these approaches\nmainly focus on understanding the causalities of context from the user's\nperspective, ignoring the system's perspective. In this paper, we propose a\ncommonsense-based causality explanation approach for diverse empathetic\nresponse generation that considers both the user's perspective (user's desires\nand reactions) and the system's perspective (system's intentions and\nreactions). We enhance ChatGPT's ability to reason for the system's perspective\nby integrating in-context learning with commonsense knowledge. Then, we\nintegrate the commonsense-based causality explanation with both ChatGPT and a\nT5-based model. Experimental evaluations demonstrate that our method\noutperforms other comparable methods on both automatic and human evaluations.\n","authors":["Yahui Fu","Koji Inoue","Chenhui Chu","Tatsuya Kawahara"],"pdf_url":"https://arxiv.org/pdf/2308.00085v2.pdf","comment":"Accepted by the 24th Meeting of the Special Interest Group on\n Discourse and Dialogue (SIGDIAL 2023)"},{"id":"http://arxiv.org/abs/2309.01953v1","updated":"2023-09-05T05:05:06Z","published":"2023-09-05T05:05:06Z","title":"Bilevel Scheduled Sampling for Dialogue Generation","summary":" Exposure bias poses a common challenge in numerous natural language\nprocessing tasks, particularly in the dialog generation. In response to this\nissue, researchers have devised various techniques, among which scheduled\nsampling has proven to be an effective method for mitigating exposure bias.\nHowever, the existing state-of-the-art scheduled sampling methods solely\nconsider the current sampling words' quality for threshold truncation sampling,\nwhich overlooks the importance of sentence-level information and the method of\nthreshold truncation warrants further discussion. In this paper, we propose a\nbilevel scheduled sampling model that takes the sentence-level information into\naccount and incorporates it with word-level quality. To enhance sampling\ndiversity and improve the model's adaptability, we propose a smooth function\nthat maps the combined result of sentence-level and word-level information to\nan appropriate range, and employ probabilistic sampling based on the mapped\nvalues instead of threshold truncation. 
Experiments conducted on the\nDailyDialog and PersonaChat datasets demonstrate the effectiveness of our\nproposed methods, which significantly alleviate the exposure bias problem and\noutperform state-of-the-art scheduled sampling methods.\n","authors":["Jiawen Liu","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2309.01953v1.pdf","comment":"13 pages, 4 figures, Natural Language Processing and Chinese\n Computing(NLPCC 2023) accepted"},{"id":"http://arxiv.org/abs/2309.01947v1","updated":"2023-09-05T04:47:55Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v1.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.01940v1","updated":"2023-09-05T04:12:01Z","published":"2023-09-05T04:12:01Z","title":"CodeApex: A Bilingual Programming Evaluation Benchmark for Large\n Language Models","summary":" With the emergence of Large Language Models (LLMs), there has been a\nsignificant improvement in the programming capabilities of models, attracting\ngrowing attention from researchers. We propose CodeApex, a bilingual benchmark\ndataset focusing on the programming comprehension and code generation abilities\nof LLMs. CodeApex comprises three types of multiple-choice questions:\nconceptual understanding, commonsense reasoning, and multi-hop reasoning,\ndesigned to evaluate LLMs on programming comprehension tasks. Additionally,\nCodeApex utilizes algorithmic questions and corresponding test cases to assess\nthe code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs,\nincluding both general-purpose and specialized models. GPT exhibits the best\nprogramming capabilities, achieving approximate accuracies of 50% and 56% on\nthe two tasks, respectively. 
There is still significant room for improvement in\nprogramming tasks. We hope that CodeApex can serve as a reference for\nevaluating the coding capabilities of LLMs, further promoting their development\nand growth. Datasets are released at\n\\url{https://github.com/APEXLAB/CodeApex.git}. CodeApex submission website is\n\\url{https://apex.sjtu.edu.cn/codeapex/}.\n","authors":["Lingyue Fu","Huacan Chai","Shuang Luo","Kounianhua Du","Weiming Zhang","Longteng Fan","Jiayi Lei","Renting Rui","Jianghao Lin","Yuchen Fang","Yifan Liu","Jingkuan Wang","Siyuan Qi","Kangning Zhang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.01940v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2308.02463v3","updated":"2023-09-05T03:11:11Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of Radiology Foundation\nModel, termed as RadFM.We consider the construction of foundational models from\nthe perspectives of dataset construction, model design, and thorough\nevaluation. Our contribution can be concluded as follows: (i), we construct a\nlarge-scale Medical Multi-modal Dataset, MedMD, which consists of 16M 2D and 3D\nmedical scans with high-quality text descriptions or reports across various\ndata formats, modalities, and tasks, covering over 5000 distinct diseases. To\nthe best of our knowledge, this is the first large-scale, high-quality, medical\nvisual-language dataset, with both 2D and 3D scans; (ii ), we propose an\narchitecture that enables visually conditioned generative pre-training, i.e.,\nallowing for integration of text input with 2D or 3D medical scans, and\ngenerate responses for diverse radiologic tasks. The model was initially\npre-trained on MedMD and subsequently fine-tuned on the domain-specific\ndataset, which is a radiologic cleaned version of MedMD, containing 3M\nradiologic visual-language pairs, termed as RadMD; (iii), we propose a new\nevaluation benchmark, RadBench, that comprises five tasks, including modality\nrecognition, disease diagnosis, visual question answering, report generation\nand rationale diagnosis, aiming to comprehensively assess the capability of\nfoundation models in handling practical clinical problems. We conduct both\nautomatic and human evaluation on RadBench, in both cases, RadFM significantly\noutperforms existing multi-modal foundation models. The codes, data, and model\ncheckpoint will all be made publicly available to promote further research and\ndevelopment in the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11148v2","updated":"2023-09-05T02:28:49Z","published":"2023-08-22T03:10:40Z","title":"LLaMA-Reviewer: Advancing Code Review Automation with Large Language\n Models through Parameter-Efficient Fine-Tuning","summary":" The automation of code review activities, a long-standing pursuit in software\nengineering, has been primarily addressed by numerous domain-specific\npre-trained models. Despite their success, these models frequently demand\nextensive resources for pre-training from scratch. In contrast, Large Language\nModels (LLMs) provide an intriguing alternative, given their remarkable\ncapabilities when supplemented with domain-specific knowledge. 
However, their\npotential for automating code review tasks remains largely unexplored.\n In response to this research gap, we present LLaMA-Reviewer, an innovative\nframework that leverages the capabilities of LLaMA, a popular LLM, in the realm\nof code review. Mindful of resource constraints, this framework employs\nparameter-efficient fine-tuning (PEFT) methods, delivering high performance\nwhile using less than 1% of trainable parameters.\n An extensive evaluation of LLaMA-Reviewer is conducted on two diverse,\npublicly available datasets. Notably, even with the smallest LLaMA base model\nconsisting of 6.7B parameters and a limited number of tuning epochs,\nLLaMA-Reviewer equals the performance of existing code-review-focused models.\n The ablation experiments provide insights into the influence of various\nfine-tuning process components, including input representation, instruction\ntuning, and different PEFT methods. To foster continuous progress in this\nfield, the code and all PEFT-weight plugins have been made open-source.\n","authors":["Junyi Lu","Lei Yu","Xiaojia Li","Li Yang","Chun Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.11148v2.pdf","comment":"Accepted to the 34th IEEE International Symposium on Software\n Reliability Engineering (ISSRE 2023)"},{"id":"http://arxiv.org/abs/2309.01885v1","updated":"2023-09-05T01:39:09Z","published":"2023-09-05T01:39:09Z","title":"QuantEase: Optimization-based Quantization for Language Models -- An\n Efficient and Intuitive Algorithm","summary":" With the rising popularity of Large Language Models (LLMs), there has been an\nincreasing interest in compression techniques that enable their efficient\ndeployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs.\nDrawing from recent advances, our work introduces QuantEase, a layer-wise\nquantization framework where individual layers undergo separate quantization.\nThe problem is framed as a discrete-structured non-convex optimization,\nprompting the development of algorithms rooted in Coordinate Descent (CD)\ntechniques. These CD-based methods provide high-quality solutions to the\ncomplex non-convex layer-wise quantization problems. Notably, our CD-based\napproach features straightforward updates, relying solely on matrix and vector\noperations, circumventing the need for matrix inversion or decomposition. We\nalso explore an outlier-aware variant of our approach, allowing for retaining\nsignificant weights (outliers) with complete precision. Our proposal attains\nstate-of-the-art performance in terms of perplexity and zero-shot accuracy in\nempirical evaluations across various LLMs and datasets, with relative\nimprovements up to 15% over methods such as GPTQ. Particularly noteworthy is\nour outlier-aware algorithm's capability to achieve near or sub-3-bit\nquantization of LLMs with an acceptable drop in accuracy, obviating the need\nfor non-uniform quantization or grouping techniques, improving upon methods\nsuch as SpQR by up to two times in terms of perplexity.\n","authors":["Kayhan Behdin","Ayan Acharya","Aman Gupta","Sathiya Keerthi","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2309.01885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15906v3","updated":"2023-09-05T01:01:58Z","published":"2023-08-30T09:19:06Z","title":"Is the U.S. Legal System Ready for AI's Challenges to Human Values?","summary":" Our interdisciplinary study investigates how effectively U.S. laws confront\nthe challenges posed by Generative AI to human values. 
Through an analysis of\ndiverse hypothetical scenarios crafted during an expert workshop, we have\nidentified notable gaps and uncertainties within the existing legal framework\nregarding the protection of fundamental values, such as privacy, autonomy,\ndignity, diversity, equity, and physical/mental well-being. Constitutional and\ncivil rights, it appears, may not provide sufficient protection against\nAI-generated discriminatory outputs. Furthermore, even if we exclude the\nliability shield provided by Section 230, proving causation for defamation and\nproduct liability claims is a challenging endeavor due to the intricate and\nopaque nature of AI systems. To address the unique and unforeseeable threats\nposed by Generative AI, we advocate for legal frameworks that evolve to\nrecognize new threats and provide proactive, auditable guidelines to industry\nstakeholders. Addressing these issues requires deep interdisciplinary\ncollaborations to identify harms, values, and mitigation strategies.\n","authors":["Inyoung Cheong","Aylin Caliskan","Tadayoshi Kohno"],"pdf_url":"https://arxiv.org/pdf/2308.15906v3.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.01868v1","updated":"2023-09-05T00:19:31Z","published":"2023-09-05T00:19:31Z","title":"On the Planning, Search, and Memorization Capabilities of Large Language\n Models","summary":" The rapid advancement of large language models, such as the Generative\nPre-trained Transformer (GPT) series, has had significant implications across\nvarious disciplines. In this study, we investigate the potential of the\nstate-of-the-art large language model (GPT-4) for planning tasks. We explore\nits effectiveness in multiple planning subfields, highlighting both its\nstrengths and limitations. Through a comprehensive examination, we identify\nareas where large language models excel in solving planning problems and reveal\nthe constraints that limit their applicability. Our empirical analysis focuses\non GPT-4's performance in planning domain extraction, graph search path\nplanning, and adversarial planning. We then propose a way of fine-tuning a\ndomain-specific large language model to improve its Chain of Thought (CoT)\ncapabilities for the above-mentioned tasks. The results provide valuable\ninsights into the potential applications of large language models in the\nplanning domain and pave the way for future research to overcome their\nlimitations and expand their capabilities.\n","authors":["Yunhao Yang","Anshul Tomar"],"pdf_url":"https://arxiv.org/pdf/2309.01868v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.00384v2","updated":"2023-09-05T23:03:12Z","published":"2023-09-01T10:44:36Z","title":"BatchPrompt: Accomplish more with less","summary":" As the ever-increasing token limits of large language models (LLMs) have\nenabled long context as input, prompting with single data samples might no\nlonger an efficient way. A straightforward strategy improving efficiency is to\nbatch data within the token limit (e.g., 8k for gpt-3.5-turbo; 32k for GPT-4),\nwhich we call BatchPrompt. We have two initial observations for prompting with\nbatched data. First, we find that prompting with batched data in longer\ncontexts will inevitably lead to worse performance, compared to single-data\nprompting. Second, the performance of the language model is significantly\ncorrelated with the positions and order of the batched data, due to the\ncorresponding change in decoder context. 
To retain efficiency and overcome\nperformance loss, we propose Batch Permutation and Ensembling (BPE), and a\nnovel Self-reflection-guided EArly Stopping (SEAS) technique. Our comprehensive\nexperimental evaluation demonstrates that BPE can boost the performance of\nBatchPrompt with a striking margin on a range of popular NLP tasks, including\nquestion answering (Boolq), textual entailment (RTE), and duplicate questions\nidentification (QQP). These performances are even competitive with/higher than\nsingle-data prompting(SinglePrompt), while BatchPrompt requires much fewer LLM\ncalls and input tokens (For SinglePrompt v.s. BatchPrompt with batch size 32,\nusing just 9%-16% the number of LLM calls, Boolq accuracy 90.6% to 90.9% with\n27.4% tokens, QQP accuracy 87.2% to 88.4% with 18.6% tokens, RTE accuracy 91.5%\nto 91.1% with 30.8% tokens). To the best of our knowledge, this is the first\nwork to technically improve prompting efficiency of large language models. We\nhope our simple yet effective approach will shed light on the future research\nof large language models. The code will be released.\n","authors":["Jianzhe Lin","Maurice Diesendruck","Liang Du","Robin Abraham"],"pdf_url":"https://arxiv.org/pdf/2309.00384v2.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.02591v1","updated":"2023-09-05T21:27:27Z","published":"2023-09-05T21:27:27Z","title":"Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction\n Tuning","summary":" We present CM3Leon (pronounced \"Chameleon\"), a retrieval-augmented,\ntoken-based, decoder-only multi-modal language model capable of generating and\ninfilling both text and images. CM3Leon uses the CM3 multi-modal architecture\nbut additionally shows the extreme benefits of scaling up and tuning on more\ndiverse instruction-style data. It is the first multi-modal model trained with\na recipe adapted from text-only language models, including a large-scale\nretrieval-augmented pre-training stage and a second multi-task supervised\nfine-tuning (SFT) stage. It is also a general-purpose model that can do both\ntext-to-image and image-to-text generation, allowing us to introduce\nself-contained contrastive decoding methods that produce high-quality outputs.\nExtensive experiments demonstrate that this recipe is highly effective for\nmulti-modal models. CM3Leon achieves state-of-the-art performance in\ntext-to-image generation with 5x less training compute than comparable methods\n(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate\nunprecedented levels of controllability in tasks ranging from language-guided\nimage editing to image-controlled generation and segmentation.\n","authors":["Lili Yu","Bowen Shi","Ramakanth Pasunuru","Benjamin Muller","Olga Golovneva","Tianlu Wang","Arun Babu","Binh Tang","Brian Karrer","Shelly Sheynin","Candace Ross","Adam Polyak","Russell Howes","Vasu Sharma","Puxin Xu","Hovhannes Tamoyan","Oron Ashual","Uriel Singer","Shang-Wen Li","Susan Zhang","Richard James","Gargi Ghosh","Yaniv Taigman","Maryam Fazel-Zarandi","Asli Celikyilmaz","Luke Zettlemoyer","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2309.02591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00301v2","updated":"2023-09-05T20:55:09Z","published":"2022-12-01T06:14:57Z","title":"Learning to Select from Multiple Options","summary":" Many NLP tasks can be regarded as a selection problem from a set of options,\nsuch as classification tasks, multi-choice question answering, etc. 
Textual\nentailment (TE) has been shown to be the state-of-the-art (SOTA) approach to\ndealing with those selection problems. TE treats input texts as premises (P),\noptions as hypotheses (H), then handles the selection problem by modeling (P,\nH) pairwise. This approach has two limitations: first, the pairwise modeling is unaware of other\noptions, which is less intuitive since humans often determine the best options\nby comparing competing candidates; second, the inference process of pairwise TE\nis time-consuming, especially when the option space is large. To deal with the\ntwo issues, this work first proposes a contextualized TE model (Context-TE) by\nappending other k options as the context of the current (P, H) modeling.\nContext-TE is able to learn a more reliable decision for the H since it considers\nvarious contexts. Second, we speed up Context-TE by coming up with Parallel-TE,\nwhich learns the decisions of multiple options simultaneously. Parallel-TE\nsignificantly improves the inference speed while keeping comparable performance\nwith Context-TE. Our methods are evaluated on three tasks (ultra-fine entity\ntyping, intent detection and multi-choice QA) that are typical selection\nproblems with different sizes of options. Experiments show our models set new\nSOTA performance; particularly, Parallel-TE is faster than the pairwise TE by k\ntimes in inference. Our code is publicly available at\nhttps://github.com/jiangshdd/LearningToSelect.\n","authors":["Jiangshu Du","Wenpeng Yin","Congying Xia","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2212.00301v2.pdf","comment":"Accepted by AAAI 2023"},{"id":"http://arxiv.org/abs/2306.08133v2","updated":"2023-09-05T20:50:24Z","published":"2023-06-13T20:54:12Z","title":"Large-scale Language Model Rescoring on Long-form Data","summary":" In this work, we study the impact of Large-scale Language Models (LLM) on\nAutomated Speech Recognition (ASR) of YouTube videos, which we use as a source\nfor long-form ASR. We demonstrate up to 8\\% relative reduction in Word Error\nRate (WER) on US English (en-us) and code-switched Indian English (en-in)\nlong-form ASR test sets and a reduction of up to 30\\% relative on Salient Term\nError Rate (STER) over a strong first-pass baseline that uses a maximum-entropy\nbased language model. Improved lattice processing that results in a lattice\nwith a proper (non-tree) digraph topology and carrying context from the 1-best\nhypothesis of the previous segment(s) results in significant wins in rescoring\nwith LLMs. We also find that the gains in performance from the combination of\nLLMs trained on vast quantities of available data (such as C4) and conventional\nneural LMs are additive and significantly outperform a strong first-pass\nbaseline with a maximum entropy LM.\n Copyright 2023 IEEE. Personal use of this material is permitted. Permission\nfrom IEEE must be obtained for all other uses, in any current or future media,\nincluding reprinting/republishing this material for advertising or promotional\npurposes, creating new collective works, for resale or redistribution to\nservers or lists, or reuse of any copyrighted component of this work in other\nworks.\n","authors":["Tongzhou Chen","Cyril Allauzen","Yinghui Huang","Daniel Park","David Rybach","W. Ronny Huang","Rodrigo Cabrera","Kartik Audhkhasi","Bhuvana Ramabhadran","Pedro J. 
Moreno","Michael Riley"],"pdf_url":"https://arxiv.org/pdf/2306.08133v2.pdf","comment":"5 pages, accepted in ICASSP 2023"},{"id":"http://arxiv.org/abs/2309.02553v1","updated":"2023-09-05T19:40:45Z","published":"2023-09-05T19:40:45Z","title":"Automating Behavioral Testing in Machine Translation","summary":" Behavioral testing in NLP allows fine-grained evaluation of systems by\nexamining their linguistic capabilities through the analysis of input-output\nbehavior. Unfortunately, existing work on behavioral testing in Machine\nTranslation (MT) is currently restricted to largely handcrafted tests covering\na limited range of capabilities and languages. To address this limitation, we\npropose to use Large Language Models (LLMs) to generate a diverse set of source\nsentences tailored to test the behavior of MT models in a range of situations.\nWe can then verify whether the MT model exhibits the expected behavior through\nmatching candidate sets that are also generated using LLMs. Our approach aims\nto make behavioral testing of MT systems practical while requiring only minimal\nhuman effort. In our experiments, we apply our proposed evaluation framework to\nassess multiple available MT systems, revealing that while in general\npass-rates follow the trends observable from traditional accuracy-based\nmetrics, our method was able to uncover several important differences and\npotential bugs that go unnoticed when relying only on accuracy.\n","authors":["Javier Ferrando","Matthias Sperber","Hendra Setiawan","Dominic Telaar","Saša Hasan"],"pdf_url":"https://arxiv.org/pdf/2309.02553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05382v3","updated":"2023-09-05T18:13:24Z","published":"2023-03-06T16:36:17Z","title":"ChatGPT is on the Horizon: Could a Large Language Model be Suitable for\n Intelligent Traffic Safety Research and Applications?","summary":" ChatGPT embarks on a new era of artificial intelligence and will\nrevolutionize the way we approach intelligent traffic safety systems. This\npaper begins with a brief introduction about the development of large language\nmodels (LLMs). Next, we exemplify using ChatGPT to address key traffic safety\nissues. Furthermore, we discuss the controversies surrounding LLMs, raise\ncritical questions for their deployment, and provide our solutions. Moreover,\nwe propose an idea of multi-modality representation learning for smarter\ntraffic safety decision-making and open more questions for application\nimprovement. We believe that LLM will both shape and potentially facilitate\ncomponents of traffic safety research.\n","authors":["Ou Zheng","Mohamed Abdel-Aty","Dongdong Wang","Zijin Wang","Shengxuan Ding"],"pdf_url":"https://arxiv.org/pdf/2303.05382v3.pdf","comment":"Submitted to Nature - Machine Intelligence (Revised and Extended)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.02436v1","updated":"2023-09-05T17:59:58Z","published":"2023-09-05T17:59:58Z","title":"GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction","summary":" Neural implicit representations have recently demonstrated compelling results\non dense Simultaneous Localization And Mapping (SLAM) but suffer from the\naccumulation of errors in camera tracking and distortion in the reconstruction.\nPurposely, we present GO-SLAM, a deep-learning-based dense visual SLAM\nframework globally optimizing poses and 3D reconstruction in real-time. 
Robust\npose estimation is at its core, supported by efficient loop closing and online\nfull bundle adjustment, which optimize per frame by utilizing the learned\nglobal geometry of the complete history of input frames. Simultaneously, we\nupdate the implicit and continuous surface representation on-the-fly to ensure\nglobal consistency of 3D reconstruction. Results on various synthetic and\nreal-world datasets demonstrate that GO-SLAM outperforms state-of-the-art\napproaches at tracking robustness and reconstruction accuracy. Furthermore,\nGO-SLAM is versatile and can run with monocular, stereo, and RGB-D input.\n","authors":["Youmin Zhang","Fabio Tosi","Stefano Mattoccia","Matteo Poggi"],"pdf_url":"https://arxiv.org/pdf/2309.02436v1.pdf","comment":"ICCV 2023. Code: https://github.com/youmi-zym/GO-SLAM - Project Page:\n https://youmi-zym.github.io/projects/GO-SLAM/"},{"id":"http://arxiv.org/abs/2309.02435v1","updated":"2023-09-05T17:59:45Z","published":"2023-09-05T17:59:45Z","title":"Efficient RL via Disentangled Environment and Agent Representations","summary":" Agents that are aware of the separation between themselves and their\nenvironments can leverage this understanding to form effective representations\nof visual input. We propose an approach for learning such structured\nrepresentations for RL algorithms, using visual knowledge of the agent, such as\nits shape or mask, which is often inexpensive to obtain. This is incorporated\ninto the RL objective using a simple auxiliary loss. We show that our method,\nStructured Environment-Agent Representations, outperforms state-of-the-art\nmodel-free approaches over 18 different challenging visual simulation\nenvironments spanning 5 different robots. Website at https://sear-rl.github.io/\n","authors":["Kevin Gmelin","Shikhar Bahl","Russell Mendonca","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2309.02435v1.pdf","comment":"ICML 2023. Website at https://sear-rl.github.io/"},{"id":"http://arxiv.org/abs/2309.02434v1","updated":"2023-09-05T17:59:42Z","published":"2023-09-05T17:59:42Z","title":"ReliTalk: Relightable Talking Portrait Generation from a Single Video","summary":" Recent years have witnessed great progress in creating vivid audio-driven\nportraits from monocular videos. However, how to seamlessly adapt the created\nvideo avatars to other scenarios with different backgrounds and lighting\nconditions remains unsolved. On the other hand, existing relighting studies\nmostly rely on dynamically lighted or multi-view data, which are too expensive\nfor creating video portraits. To bridge this gap, we propose ReliTalk, a novel\nframework for relightable audio-driven talking portrait generation from\nmonocular videos. Our key insight is to decompose the portrait's reflectance\nfrom implicitly learned audio-driven facial normals and images. Specifically,\nwe involve 3D facial priors derived from audio features to predict delicate\nnormal maps through implicit functions. These initially predicted normals then\ntake a crucial part in reflectance decomposition by dynamically estimating the\nlighting condition of the given video. Moreover, the stereoscopic face\nrepresentation is refined using the identity-consistent loss under simulated\nmultiple lighting conditions, addressing the ill-posed problem caused by\nlimited views available from a single monocular video. Extensive experiments\nvalidate the superiority of our proposed framework on both real and synthetic\ndatasets. 
Our code is released in https://github.com/arthur-qiu/ReliTalk.\n","authors":["Haonan Qiu","Zhaoxi Chen","Yuming Jiang","Hang Zhou","Xiangyu Fan","Lei Yang","Wayne Wu","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02429v1","updated":"2023-09-05T17:57:31Z","published":"2023-09-05T17:57:31Z","title":"Building a Winning Team: Selecting Source Model Ensembles using a\n Submodular Transferability Estimation Approach","summary":" Estimating the transferability of publicly available pretrained models to a\ntarget task has assumed an important place for transfer learning tasks in\nrecent years. Existing efforts propose metrics that allow a user to choose one\nmodel from a pool of pre-trained models without having to fine-tune each model\nindividually and identify one explicitly. With the growth in the number of\navailable pre-trained models and the popularity of model ensembles, it also\nbecomes essential to study the transferability of multiple-source models for a\ngiven target task. The few existing efforts study transferability in such\nmulti-source ensemble settings using just the outputs of the classification\nlayer and neglect possible domain or task mismatch. Moreover, they overlook the\nmost important factor while selecting the source models, viz., the cohesiveness\nfactor between them, which can impact the performance and confidence in the\nprediction of the ensemble. To address these gaps, we propose a novel Optimal\ntranSport-based suBmOdular tRaNsferability metric (OSBORN) to estimate the\ntransferability of an ensemble of models to a downstream task. OSBORN\ncollectively accounts for image domain difference, task difference, and\ncohesiveness of models in the ensemble to provide reliable estimates of\ntransferability. We gauge the performance of OSBORN on both image\nclassification and semantic segmentation tasks. Our setup includes 28 source\ndatasets, 11 target datasets, 5 model architectures, and 2 pre-training\nmethods. We benchmark our method against current state-of-the-art metrics\nMS-LEEP and E-LEEP, and outperform them consistently using the proposed\napproach.\n","authors":["Vimal K B","Saketh Bachu","Tanmay Garg","Niveditha Lakshmi Narasimhan","Raghavan Konuru","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2309.02429v1.pdf","comment":"To appear at ICCV 2023"},{"id":"http://arxiv.org/abs/2302.03023v4","updated":"2023-09-05T17:56:42Z","published":"2023-02-06T18:58:38Z","title":"V1T: large-scale mouse V1 response prediction using a Vision Transformer","summary":" Accurate predictive models of the visual cortex neural response to natural\nvisual stimuli remain a challenge in computational neuroscience. In this work,\nwe introduce V1T, a novel Vision Transformer based architecture that learns a\nshared visual and behavioral representation across animals. We evaluate our\nmodel on two large datasets recorded from mouse primary visual cortex and\noutperform previous convolution-based models by more than 12.7% in prediction\nperformance. Moreover, we show that the self-attention weights learned by the\nTransformer correlate with the population receptive fields. Our model thus sets\na new benchmark for neural response prediction and can be used jointly with\nbehavioral and neural recordings to reveal meaningful characteristic features\nof the visual cortex.\n","authors":["Bryan M. Li","Isabel M. Cornacchia","Nathalie L. 
Rochefort","Arno Onken"],"pdf_url":"https://arxiv.org/pdf/2302.03023v4.pdf","comment":"updated references and added link to code repository; add analysis on\n generalization and visualize aRFs; updated with TMLR publication"},{"id":"http://arxiv.org/abs/2206.08903v3","updated":"2023-09-05T17:51:32Z","published":"2022-06-17T17:23:50Z","title":"Colonoscopy 3D Video Dataset with Paired Depth from 2D-3D Registration","summary":" Screening colonoscopy is an important clinical application for several 3D\ncomputer vision techniques, including depth estimation, surface reconstruction,\nand missing region detection. However, the development, evaluation, and\ncomparison of these techniques in real colonoscopy videos remain largely\nqualitative due to the difficulty of acquiring ground truth data. In this work,\nwe present a Colonoscopy 3D Video Dataset (C3VD) acquired with a high\ndefinition clinical colonoscope and high-fidelity colon models for benchmarking\ncomputer vision methods in colonoscopy. We introduce a novel multimodal 2D-3D\nregistration technique to register optical video sequences with ground truth\nrendered views of a known 3D model. The different modalities are registered by\ntransforming optical images to depth maps with a Generative Adversarial Network\nand aligning edge features with an evolutionary optimizer. This registration\nmethod achieves an average translation error of 0.321 millimeters and an\naverage rotation error of 0.159 degrees in simulation experiments where\nerror-free ground truth is available. The method also leverages video\ninformation, improving registration accuracy by 55.6% for translation and 60.4%\nfor rotation compared to single frame registration. 22 short video sequences\nwere registered to generate 10,015 total frames with paired ground truth depth,\nsurface normals, optical flow, occlusion, six degree-of-freedom pose, coverage\nmaps, and 3D models. The dataset also includes screening videos acquired by a\ngastroenterologist with paired ground truth pose and 3D surface models. The\ndataset and registration source code are available at durr.jhu.edu/C3VD.\n","authors":["Taylor L. Bobrow","Mayank Golhar","Rohan Vijayan","Venkata S. Akshintala","Juan R. Garcia","Nicholas J. Durr"],"pdf_url":"https://arxiv.org/pdf/2206.08903v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02423v1","updated":"2023-09-05T17:51:16Z","published":"2023-09-05T17:51:16Z","title":"EgoPCA: A New Framework for Egocentric Hand-Object Interaction\n Understanding","summary":" With the surge in attention to Egocentric Hand-Object Interaction (Ego-HOI),\nlarge-scale datasets such as Ego4D and EPIC-KITCHENS have been proposed.\nHowever, most current research is built on resources derived from third-person\nvideo action recognition. This inherent domain gap between first- and\nthird-person action videos, which have not been adequately addressed before,\nmakes current Ego-HOI suboptimal. This paper rethinks and proposes a new\nframework as an infrastructure to advance Ego-HOI recognition by Probing,\nCuration and Adaption (EgoPCA). We contribute comprehensive pre-train sets,\nbalanced test sets and a new baseline, which are complete with a\ntraining-finetuning strategy. With our new framework, we not only achieve\nstate-of-the-art performance on Ego-HOI benchmarks but also build several new\nand effective mechanisms and settings to advance further research. We believe\nour data and the findings will pave a new way for Ego-HOI understanding. 
Code\nand data are available at https://mvig-rhos.com/ego_pca\n","authors":["Yue Xu","Yong-Lu Li","Zhemin Huang","Michael Xu Liu","Cewu Lu","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2309.02423v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02420v1","updated":"2023-09-05T17:50:36Z","published":"2023-09-05T17:50:36Z","title":"Doppelgangers: Learning to Disambiguate Images of Similar Structures","summary":" We consider the visual disambiguation task of determining whether a pair of\nvisually similar images depict the same or distinct 3D surfaces (e.g., the same\nor opposite sides of a symmetric building). Illusory image matches, where two\nimages observe distinct but visually similar 3D surfaces, can be challenging\nfor humans to differentiate, and can also lead 3D reconstruction algorithms to\nproduce erroneous results. We propose a learning-based approach to visual\ndisambiguation, formulating it as a binary classification task on image pairs.\nTo that end, we introduce a new dataset for this problem, Doppelgangers, which\nincludes image pairs of similar structures with ground truth labels. We also\ndesign a network architecture that takes the spatial distribution of local\nkeypoints and matches as input, allowing for better reasoning about both local\nand global cues. Our evaluation shows that our method can distinguish illusory\nmatches in difficult cases, and can be integrated into SfM pipelines to produce\ncorrect, disambiguated 3D reconstructions. See our project page for our code,\ndatasets, and more results: http://doppelgangers-3d.github.io/.\n","authors":["Ruojin Cai","Joseph Tung","Qianqian Wang","Hadar Averbuch-Elor","Bharath Hariharan","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2309.02420v1.pdf","comment":"Published in ICCV 2023 (Oral); Project page:\n http://doppelgangers-3d.github.io/"},{"id":"http://arxiv.org/abs/2309.02405v1","updated":"2023-09-05T17:36:40Z","published":"2023-09-05T17:36:40Z","title":"Generating Realistic Images from In-the-wild Sounds","summary":" Representing wild sounds as images is an important but challenging task due\nto the lack of paired datasets between sound and images and the significant\ndifferences in the characteristics of these two modalities. Previous studies\nhave focused on generating images from sound in limited categories or music. In\nthis paper, we propose a novel approach to generate images from in-the-wild\nsounds. First, we convert sound into text using audio captioning. Second, we\npropose audio attention and sentence attention to represent the rich\ncharacteristics of sound and visualize the sound. Lastly, we propose a direct\nsound optimization with CLIPscore and AudioCLIP and generate images with a\ndiffusion-based model. In experiments, it shows that our model is able to\ngenerate high quality images from wild sounds and outperforms baselines in both\nquantitative and qualitative evaluations on wild audio datasets.\n","authors":["Taegyeong Lee","Jeonghun Kang","Hyeonyu Kim","Taehwan Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02405v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02404v1","updated":"2023-09-05T17:36:34Z","published":"2023-09-05T17:36:34Z","title":"Voice Morphing: Two Identities in One Voice","summary":" In a biometric system, each biometric sample or template is typically\nassociated with a single identity. 
However, recent research has demonstrated\nthe possibility of generating \"morph\" biometric samples that can successfully\nmatch more than a single identity. Morph attacks are now recognized as a\npotential security threat to biometric systems. However, most morph attacks\nhave been studied on biometric modalities operating in the image domain, such\nas face, fingerprint, and iris. In this preliminary work, we introduce Voice\nIdentity Morphing (VIM) - a voice-based morph attack that can synthesize speech\nsamples that impersonate the voice characteristics of a pair of individuals.\nOur experiments evaluate the vulnerabilities of two popular speaker recognition\nsystems, ECAPA-TDNN and x-vector, to VIM, with a success rate (MMPMR) of over\n80% at a false match rate of 1% on the Librispeech dataset.\n","authors":["Sushanta K. Pani","Anurag Chowdhury","Morgan Sandler","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2309.02404v1.pdf","comment":"Accepted oral paper at BIOSIG 2023"},{"id":"http://arxiv.org/abs/2309.02401v1","updated":"2023-09-05T17:27:16Z","published":"2023-09-05T17:27:16Z","title":"Prototype-based Dataset Comparison","summary":" Dataset summarisation is a fruitful approach to dataset inspection. However,\nwhen applied to a single dataset the discovery of visual concepts is restricted\nto those most prominent. We argue that a comparative approach can expand upon\nthis paradigm to enable richer forms of dataset inspection that go beyond the\nmost prominent concepts. To enable dataset comparison we present a module that\nlearns concept-level prototypes across datasets. We leverage self-supervised\nlearning to discover these prototypes without supervision, and we demonstrate\nthe benefits of our approach in two case-studies. Our findings show that\ndataset comparison extends dataset inspection and we hope to encourage more\nworks in this direction. Code and usage instructions available at\nhttps://github.com/Nanne/ProtoSim\n","authors":["Nanne van Noord"],"pdf_url":"https://arxiv.org/pdf/2309.02401v1.pdf","comment":"To be presented at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02356v1","updated":"2023-09-05T16:11:54Z","published":"2023-09-05T16:11:54Z","title":"STEP -- Towards Structured Scene-Text Spotting","summary":" We introduce the structured scene-text spotting task, which requires a\nscene-text OCR system to spot text in the wild according to a query regular\nexpression. Contrary to generic scene text OCR, structured scene-text spotting\nseeks to dynamically condition both scene text detection and recognition on\nuser-provided regular expressions. To tackle this task, we propose the\nStructured TExt sPotter (STEP), a model that exploits the provided text\nstructure to guide the OCR process. STEP is able to deal with regular\nexpressions that contain spaces and it is not bound to detection at the\nword-level granularity. Our approach enables accurate zero-shot structured text\nspotting in a wide variety of real-world reading scenarios and is solely\ntrained on publicly available data. To demonstrate the effectiveness of our\napproach, we introduce a new challenging test dataset that contains several\ntypes of out-of-vocabulary structured text, reflecting important reading\napplications of fields such as prices, dates, serial numbers, license plates\netc. 
We demonstrate that STEP can provide specialised OCR performance on demand\nin all tested scenarios.\n","authors":["Sergi Garcia-Bordils","Dimosthenis Karatzas","Marçal Rusiñol"],"pdf_url":"https://arxiv.org/pdf/2309.02356v1.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.02340v1","updated":"2023-09-05T15:57:23Z","published":"2023-09-05T15:57:23Z","title":"Generating Infinite-Resolution Texture using GANs with Patch-by-Patch\n Paradigm","summary":" In this paper, we introduce a novel approach for generating texture images of\ninfinite resolutions using Generative Adversarial Networks (GANs) based on a\npatch-by-patch paradigm. Existing texture synthesis techniques often rely on\ngenerating a large-scale texture using a one-forward pass to the generating\nmodel, this limits the scalability and flexibility of the generated images. In\ncontrast, the proposed approach trains GANs models on a single texture image to\ngenerate relatively small patches that are locally correlated and can be\nseamlessly concatenated to form a larger image while using a constant GPU\nmemory footprint. Our method learns the local texture structure and is able to\ngenerate arbitrary-size textures, while also maintaining coherence and\ndiversity. The proposed method relies on local padding in the generator to\nensure consistency between patches and utilizes spatial stochastic modulation\nto allow for local variations and diversity within the large-scale image.\nExperimental results demonstrate superior scalability compared to existing\napproaches while maintaining visual coherence of generated textures.\n","authors":["Alhasan Abdellatif","Ahmed H. Elsheikh"],"pdf_url":"https://arxiv.org/pdf/2309.02340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02335v1","updated":"2023-09-05T15:54:35Z","published":"2023-09-05T15:54:35Z","title":"DEEPBEAS3D: Deep Learning and B-Spline Explicit Active Surfaces","summary":" Deep learning-based automatic segmentation methods have become\nstate-of-the-art. However, they are often not robust enough for direct clinical\napplication, as domain shifts between training and testing data affect their\nperformance. Failure in automatic segmentation can cause sub-optimal results\nthat require correction. To address these problems, we propose a novel 3D\nextension of an interactive segmentation framework that represents a\nsegmentation from a convolutional neural network (CNN) as a B-spline explicit\nactive surface (BEAS). BEAS ensures segmentations are smooth in 3D space,\nincreasing anatomical plausibility, while allowing the user to precisely edit\nthe 3D surface. We apply this framework to the task of 3D segmentation of the\nanal sphincter complex (AS) from transperineal ultrasound (TPUS) images, and\ncompare it to the clinical tool used in the pelvic floor disorder clinic (4D\nView VOCAL, GE Healthcare; Zipf, Austria). 
Experimental results show that: 1)\nthe proposed framework gives the user explicit control of the surface contour;\n2) the perceived workload calculated via the NASA-TLX index was reduced by 30%\ncompared to VOCAL; and 3) it required 70% (170 seconds) less user time than\nVOCAL (p < 0.00001).\n","authors":["Helena Williams","João Pedrosa","Muhammad Asad","Laura Cattani","Tom Vercauteren","Jan Deprest","Jan D'hooge"],"pdf_url":"https://arxiv.org/pdf/2309.02335v1.pdf","comment":"4 pages, 3 figures, 1 table, conference"},{"id":"http://arxiv.org/abs/2304.04278v2","updated":"2023-09-05T15:37:32Z","published":"2023-04-09T16:48:26Z","title":"Point-SLAM: Dense Neural Point Cloud-based SLAM","summary":" We propose a dense neural simultaneous localization and mapping (SLAM)\napproach for monocular RGBD input which anchors the features of a neural scene\nrepresentation in a point cloud that is iteratively generated in an\ninput-dependent data-driven manner. We demonstrate that both tracking and\nmapping can be performed with the same point-based neural scene representation\nby minimizing an RGBD-based re-rendering loss. In contrast to recent dense\nneural SLAM methods which anchor the scene features in a sparse grid, our\npoint-based approach allows dynamically adapting the anchor point density to\nthe information density of the input. This strategy reduces runtime and memory\nusage in regions with fewer details and dedicates higher point density to\nresolve fine details. Our approach performs better than or competitively with\nexisting dense neural RGBD SLAM methods in tracking, mapping and rendering\naccuracy on the Replica, TUM-RGBD and ScanNet datasets. The source code is\navailable at https://github.com/eriksandstroem/Point-SLAM.\n","authors":["Erik Sandström","Yue Li","Luc Van Gool","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2304.04278v2.pdf","comment":"ICCV 2023. 18 Pages, 12 Figures"},{"id":"http://arxiv.org/abs/2309.02318v1","updated":"2023-09-05T15:34:37Z","published":"2023-09-05T15:34:37Z","title":"TiAVox: Time-aware Attenuation Voxels for Sparse-view 4D DSA\n Reconstruction","summary":" Four-dimensional Digital Subtraction Angiography (4D DSA) plays a critical\nrole in the diagnosis of many medical diseases, such as Arteriovenous\nMalformations (AVM) and Arteriovenous Fistulas (AVF). Despite its significant\napplication value, the reconstruction of 4D DSA demands numerous views to\neffectively model the intricate vessels and radiocontrast flow, thereby\nimplying a significant radiation dose. To address this high radiation issue, we\npropose a Time-aware Attenuation Voxel (TiAVox) approach for sparse-view 4D DSA\nreconstruction, which paves the way for high-quality 4D imaging. Additionally,\n2D and 3D DSA imaging results can be generated from the reconstructed 4D DSA\nimages. TiAVox introduces 4D attenuation voxel grids, which reflect attenuation\nproperties from both spatial and temporal dimensions. It is optimized by\nminimizing discrepancies between the rendered images and sparse 2D DSA images.\nWithout any neural network involved, TiAVox enjoys specific physical\ninterpretability. The parameters of each learnable voxel represent the\nattenuation coefficients. We validated the TiAVox approach on both clinical and\nsimulated datasets, achieving a 31.23 Peak Signal-to-Noise Ratio (PSNR) for\nnovel view synthesis using only 30 views on the clinically sourced dataset,\nwhereas traditional Feldkamp-Davis-Kress methods required 133 views. 
Similarly,\nwith merely 10 views from the synthetic dataset, TiAVox yielded a PSNR of 34.32\nfor novel view synthesis and 41.40 for 3D reconstruction. We also executed\nablation studies to corroborate the essential components of TiAVox. The code\nwill be publicly available.\n","authors":["Zhenghong Zhou","Huangxuan Zhao","Jiemin Fang","Dongqiao Xiang","Lei Chen","Lingxia Wu","Feihong Wu","Wenyu Liu","Chuansheng Zheng","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02318v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.02301v1","updated":"2023-09-05T15:06:37Z","published":"2023-09-05T15:06:37Z","title":"CIEM: Contrastive Instruction Evaluation Method for Better Instruction\n Tuning","summary":" Nowadays, the research on Large Vision-Language Models (LVLMs) has been\nsignificantly promoted thanks to the success of Large Language Models (LLM).\nNevertheless, these Vision-Language Models (VLMs) suffer from the\ndrawback of hallucination -- due to insufficient understanding of vision and\nlanguage modalities, VLMs may generate incorrect perception information when\nperforming downstream applications, for example, captioning a non-existent entity.\nTo address the hallucination phenomenon, on the one hand, we introduce a\nContrastive Instruction Evaluation Method (CIEM), which is an automatic\npipeline that leverages an annotated image-text dataset coupled with an LLM to\ngenerate factual/contrastive question-answer pairs for the evaluation of the\nhallucination of VLMs. On the other hand, based on CIEM, we further propose a\nnew instruction tuning method called CIT (the abbreviation of Contrastive\nInstruction Tuning) to alleviate the hallucination of VLMs by automatically\nproducing high-quality factual/contrastive question-answer pairs and\ncorresponding justifications for model tuning. Through extensive experiments on\nCIEM and CIT, we pinpoint the hallucination issues commonly present in existing\nVLMs, the inability of the current instruction-tuning dataset to handle the\nhallucination phenomenon and the superiority of CIT-tuned VLMs over both CIEM\nand public datasets.\n","authors":["Hongyu Hu","Jiyuan Zhang","Minyi Zhao","Zhenbang Sun"],"pdf_url":"https://arxiv.org/pdf/2309.02301v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02290v1","updated":"2023-09-05T14:52:38Z","published":"2023-09-05T14:52:38Z","title":"ATM: Action Temporality Modeling for Video Question Answering","summary":" Despite significant progress in video question answering (VideoQA), existing\nmethods fall short of questions that require causal/temporal reasoning across\nframes. This can be attributed to imprecise motion representations. We\nintroduce Action Temporality Modeling (ATM) for temporality reasoning via\nthree-fold uniqueness: (1) rethinking the optical flow and realizing that\noptical flow is effective in capturing the long horizon temporality reasoning;\n(2) training the visual-text embedding by contrastive learning in an\naction-centric manner, leading to better action representations in both vision\nand text modalities; and (3) preventing the model from answering the question\ngiven the shuffled video in the fine-tuning stage, to avoid spurious\ncorrelation between appearance and motion and hence ensure faithful temporality\nreasoning. 
In the experiments, we show that ATM outperforms previous approaches\nin terms of the accuracy on multiple VideoQAs and exhibits better true\ntemporality reasoning ability.\n","authors":["Junwen Chen","Jie Zhu","Yu Kong"],"pdf_url":"https://arxiv.org/pdf/2309.02290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02286v1","updated":"2023-09-05T14:45:54Z","published":"2023-09-05T14:45:54Z","title":"Haystack: A Panoptic Scene Graph Dataset to Evaluate Rare Predicate\n Classes","summary":" Current scene graph datasets suffer from strong long-tail distributions of\ntheir predicate classes. Due to a very low number of some predicate classes in\nthe test sets, no reliable metrics can be retrieved for the rarest classes. We\nconstruct a new panoptic scene graph dataset and a set of metrics that are\ndesigned as a benchmark for the predictive performance especially on rare\npredicate classes. To construct the new dataset, we propose a model-assisted\nannotation pipeline that efficiently finds rare predicate classes that are\nhidden in a large set of images like needles in a haystack.\n Contrary to prior scene graph datasets, Haystack contains explicit negative\nannotations, i.e. annotations that a given relation does not have a certain\npredicate class. Negative annotations are helpful especially in the field of\nscene graph generation and open up a whole new set of possibilities to improve\ncurrent scene graph generation models.\n Haystack is 100% compatible with existing panoptic scene graph datasets and\ncan easily be integrated with existing evaluation pipelines. Our dataset and\ncode can be found here: https://lorjul.github.io/haystack/. It includes\nannotation files and simple to use scripts and utilities, to help with\nintegrating our dataset in existing work.\n","authors":["Julian Lorenz","Florian Barthel","Daniel Kienzle","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2309.02286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09067v2","updated":"2023-09-05T14:44:04Z","published":"2023-06-15T11:49:44Z","title":"2nd Place Winning Solution for the CVPR2023 Visual Anomaly and Novelty\n Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection","summary":" This technical report introduces the winning solution of the team Segment Any\nAnomaly for the CVPR2023 Visual Anomaly and Novelty Detection (VAND) challenge.\nGoing beyond uni-modal prompt, e.g., language prompt, we present a novel\nframework, i.e., Segment Any Anomaly + (SAA$+$), for zero-shot anomaly\nsegmentation with multi-modal prompts for the regularization of cascaded modern\nfoundation models. Inspired by the great zero-shot generalization ability of\nfoundation models like Segment Anything, we first explore their assembly (SAA)\nto leverage diverse multi-modal prior knowledge for anomaly localization.\nSubsequently, we further introduce multimodal prompts (SAA$+$) derived from\ndomain expert knowledge and target image context to enable the non-parameter\nadaptation of foundation models to anomaly segmentation. The proposed SAA$+$\nmodel achieves state-of-the-art performance on several anomaly segmentation\nbenchmarks, including VisA and MVTec-AD, in the zero-shot setting. We will\nrelease the code of our winning solution for the CVPR2023 VAN.\n","authors":["Yunkang Cao","Xiaohao Xu","Chen Sun","Yuqi Cheng","Liang Gao","Weiming Shen"],"pdf_url":"https://arxiv.org/pdf/2306.09067v2.pdf","comment":"The first two author contribute equally. CVPR workshop challenge\n report. 
arXiv admin note: substantial text overlap with arXiv:2305.10724"},{"id":"http://arxiv.org/abs/2309.02270v1","updated":"2023-09-05T14:33:56Z","published":"2023-09-05T14:33:56Z","title":"SAM-Deblur: Let Segment Anything Boost Image Deblurring","summary":" Image deblurring is a critical task in the field of image restoration, aiming\nto eliminate blurring artifacts. However, the challenge of addressing\nnon-uniform blurring leads to an ill-posed problem, which limits the\ngeneralization performance of existing deblurring models. To solve the problem,\nwe propose a framework SAM-Deblur, integrating prior knowledge from the Segment\nAnything Model (SAM) into the deblurring task for the first time. In\nparticular, SAM-Deblur is divided into three stages. First, We preprocess the\nblurred images, obtain image masks via SAM, and propose a mask dropout method\nfor training to enhance model robustness. Then, to fully leverage the\nstructural priors generated by SAM, we propose a Mask Average Pooling (MAP)\nunit specifically designed to average SAM-generated segmented areas, serving as\na plug-and-play component which can be seamlessly integrated into existing\ndeblurring networks. Finally, we feed the fused features generated by the MAP\nUnit into the deblurring model to obtain a sharp image. Experimental results on\nthe RealBlurJ, ReloBlur, and REDS datasets reveal that incorporating our\nmethods improves NAFNet's PSNR by 0.05, 0.96, and 7.03, respectively. Code will\nbe available at \\href{https://github.com/HPLQAQ/SAM-Deblur}{SAM-Deblur}.\n","authors":["Siwei Li","Mingxuan Liu","Yating Zhang","Shu Chen","Haoxiang Li","Hong Chen","Zifei Dou"],"pdf_url":"https://arxiv.org/pdf/2309.02270v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.01188v3","updated":"2023-09-05T14:30:32Z","published":"2023-06-01T22:57:32Z","title":"Event-based Stereo Visual Odometry with Native Temporal Resolution via\n Continuous-time Gaussian Process Regression","summary":" Event-based cameras asynchronously capture individual visual changes in a\nscene. This makes them more robust than traditional frame-based cameras to\nhighly dynamic motions and poor illumination. It also means that every\nmeasurement in a scene can occur at a unique time.\n Handling these different measurement times is a major challenge of using\nevent-based cameras. It is often addressed in visual odometry (VO) pipelines by\napproximating temporally close measurements as occurring at one common time.\nThis grouping simplifies the estimation problem but, absent additional sensors,\nsacrifices the inherent temporal resolution of event-based cameras.\n This paper instead presents a complete stereo VO pipeline that estimates\ndirectly with individual event-measurement times without requiring any grouping\nor approximation in the estimation state. It uses continuous-time trajectory\nestimation to maintain the temporal fidelity and asynchronous nature of\nevent-based cameras through Gaussian process regression with a physically\nmotivated prior. Its performance is evaluated on the MVSEC dataset, where it\nachieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences,\noutperforming the existing publicly available event-based stereo VO pipeline by\ntwo and four times, respectively.\n","authors":["Jianeng Wang","Jonathan D. Gammell"],"pdf_url":"https://arxiv.org/pdf/2306.01188v3.pdf","comment":"To appear in IEEE Robotics and Automation Letters (RA-L). 8 pages, 4\n figures. 
DOI: 10.1109/LRA.2023.3311374"},{"id":"http://arxiv.org/abs/2211.12506v2","updated":"2023-09-05T14:30:14Z","published":"2022-11-22T01:48:25Z","title":"Dynamic Loss For Robust Learning","summary":" Label noise and class imbalance commonly coexist in real-world data. Previous\nworks for robust learning, however, usually address either one type of the data\nbiases and underperform when facing them both. To mitigate this gap, this work\npresents a novel meta-learning based dynamic loss that automatically adjusts\nthe objective functions with the training process to robustly learn a\nclassifier from long-tailed noisy data. Concretely, our dynamic loss comprises\na label corrector and a margin generator, which respectively correct noisy\nlabels and generate additive per-class classification margins by perceiving the\nunderlying data distribution as well as the learning state of the classifier.\nEquipped with a new hierarchical sampling strategy that enriches a small amount\nof unbiased metadata with diverse and hard samples, the two components in the\ndynamic loss are optimized jointly through meta-learning and cultivate the\nclassifier to well adapt to clean and balanced test data. Extensive experiments\nshow our method achieves state-of-the-art accuracy on multiple real-world and\nsynthetic datasets with various types of data biases, including CIFAR-10/100,\nAnimal-10N, ImageNet-LT, and Webvision. Code will soon be publicly available.\n","authors":["Shenwang Jiang","Jianan Li","Jizhou Zhang","Ying Wang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2211.12506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00007v2","updated":"2023-09-05T14:04:14Z","published":"2023-07-27T13:18:47Z","title":"When Measures are Unreliable: Imperceptible Adversarial Perturbations\n toward Top-$k$ Multi-Label Learning","summary":" With the great success of deep neural networks, adversarial learning has\nreceived widespread attention in various studies, ranging from multi-class\nlearning to multi-label learning. However, existing adversarial attacks toward\nmulti-label learning only pursue the traditional visual imperceptibility but\nignore the new perceptible problem coming from measures such as Precision@$k$\nand mAP@$k$. Specifically, when a well-trained multi-label classifier performs\nfar below the expectation on some samples, the victim can easily realize that\nthis performance degeneration stems from attack, rather than the model itself.\nTherefore, an ideal multi-labeling adversarial attack should manage to not only\ndeceive visual perception but also evade monitoring of measures. To this end,\nthis paper first proposes the concept of measure imperceptibility. Then, a\nnovel loss function is devised to generate such adversarial perturbations that\ncould achieve both visual and measure imperceptibility. Furthermore, an\nefficient algorithm, which enjoys a convex objective, is established to\noptimize this objective. 
Finally, extensive experiments on large-scale\nbenchmark datasets, such as PASCAL VOC 2012, MS COCO, and NUS WIDE, demonstrate\nthe superiority of our proposed method in attacking the top-$k$ multi-label\nsystems.\n","authors":["Yuchen Sun","Qianqian Xu","Zitai Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2309.00007v2.pdf","comment":"22 pages, 7 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.02244v1","updated":"2023-09-05T13:52:43Z","published":"2023-09-05T13:52:43Z","title":"Augmenting Chest X-ray Datasets with Non-Expert Annotations","summary":" The advancement of machine learning algorithms in medical image analysis\nrequires the expansion of training datasets. A popular and cost-effective\napproach is automated annotation extraction from free-text medical reports,\nprimarily due to the high costs associated with expert clinicians annotating\nchest X-ray images. However, it has been shown that the resulting datasets are\nsusceptible to biases and shortcuts. Another strategy to increase the size of a\ndataset is crowdsourcing, a widely adopted practice in general computer vision\nwith some success in medical image analysis. In a similar vein to\ncrowdsourcing, we enhance two publicly available chest X-ray datasets by\nincorporating non-expert annotations. However, instead of using diagnostic\nlabels, we annotate shortcuts in the form of tubes. We collect 3.5k chest drain\nannotations for CXR14, and 1k annotations for 4 different tube types in\nPadChest. We train a chest drain detector with the non-expert annotations that\ngeneralizes well to expert labels. Moreover, we compare our annotations to\nthose provided by experts and show \"moderate\" to \"almost perfect\" agreement.\nFinally, we present a pathology agreement study to raise awareness about ground\ntruth annotations. We make our annotations and code available.\n","authors":["Cathrine Damgaard","Trine Naja Eriksen","Dovile Juodelyte","Veronika Cheplygina","Amelia Jiménez-Sánchez"],"pdf_url":"https://arxiv.org/pdf/2309.02244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02230v1","updated":"2023-09-05T13:36:40Z","published":"2023-09-05T13:36:40Z","title":"DCP-Net: A Distributed Collaborative Perception Network for Remote\n Sensing Semantic Segmentation","summary":" Onboard intelligent processing is widely applied in emergency tasks in the\nfield of remote sensing. However, it is predominantly confined to an individual\nplatform with a limited observation range as well as susceptibility to\ninterference, resulting in limited accuracy. Considering the current state of\nmulti-platform collaborative observation, this article innovatively presents a\ndistributed collaborative perception network called DCP-Net. Firstly, the\nproposed DCP-Net helps members to enhance perception performance by integrating\nfeatures from other platforms. Secondly, a self-mutual information match module\nis proposed to identify collaboration opportunities and select suitable\npartners, prioritizing critical collaborative features and reducing redundant\ntransmission cost. Thirdly, a related feature fusion module is designed to\naddress the misalignment between local and collaborative features, improving\nthe quality of fused features for the downstream task. We conduct extensive\nexperiments and visualization analyses using three semantic segmentation\ndatasets, including Potsdam, iSAID and DFC23. 
The results demonstrate that\nDCP-Net outperforms the existing methods comprehensively, improving mIoU by\n2.61%~16.89% at the highest collaboration efficiency, which promotes the\nperformance to a state-of-the-art level.\n","authors":["Zhechao Wang","Peirui Cheng","Shujing Duan","Kaiqiang Chen","Zhirui Wang","Xinming Li","Xian Sun"],"pdf_url":"https://arxiv.org/pdf/2309.02230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02224v1","updated":"2023-09-05T13:27:19Z","published":"2023-09-05T13:27:19Z","title":"Dense Object Grounding in 3D Scenes","summary":" Localizing objects in 3D scenes according to the semantics of a given natural\nlanguage is a fundamental yet important task in the field of multimedia\nunderstanding, which benefits various real-world applications such as robotics\nand autonomous driving. However, the majority of existing 3D object grounding\nmethods are restricted to a single-sentence input describing an individual\nobject, which cannot comprehend and reason more contextualized descriptions of\nmultiple objects in more practical 3D cases. To this end, we introduce a new\nchallenging task, called 3D Dense Object Grounding (3D DOG), to jointly\nlocalize multiple objects described in a more complicated paragraph rather than\na single sentence. Instead of naively localizing each sentence-guided object\nindependently, we found that dense objects described in the same paragraph are\noften semantically related and spatially located in a focused region of the 3D\nscene. To explore such semantic and spatial relationships of densely referred\nobjects for more accurate localization, we propose a novel Stacked Transformer\nbased framework for 3D DOG, named 3DOGSFormer. Specifically, we first devise a\ncontextual query-driven local transformer decoder to generate initial grounding\nproposals for each target object. Then, we employ a proposal-guided global\ntransformer decoder that exploits the local object features to learn their\ncorrelation for further refining initial grounding proposals. Extensive\nexperiments on three challenging benchmarks (Nr3D, Sr3D, and ScanRefer) show\nthat our proposed 3DOGSFormer outperforms state-of-the-art 3D single-object\ngrounding methods and their dense-object variants by significant margins.\n","authors":["Wencan Huang","Daizong Liu","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2309.02224v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.02218v1","updated":"2023-09-05T13:22:41Z","published":"2023-09-05T13:22:41Z","title":"Robustness and Generalizability of Deepfake Detection: A Study with\n Diffusion Models","summary":" The rise of deepfake images, especially of well-known personalities, poses a\nserious threat to the dissemination of authentic information. To tackle this,\nwe present a thorough investigation into how deepfakes are produced and how\nthey can be identified. The cornerstone of our research is a rich collection of\nartificial celebrity faces, titled DeepFakeFace (DFF). We crafted the DFF\ndataset using advanced diffusion models and have shared it with the community\nthrough online platforms. This data serves as a robust foundation to train and\ntest algorithms designed to spot deepfakes. We carried out a thorough review of\nthe DFF dataset and suggest two evaluation methods to gauge the strength and\nadaptability of deepfake recognition tools. The first method tests whether an\nalgorithm trained on one type of fake images can recognize those produced by\nother methods. 
The second evaluates the algorithm's performance with imperfect\nimages, like those that are blurry, of low quality, or compressed. Given varied\nresults across deepfake methods and image changes, our findings stress the need\nfor better deepfake detectors. Our DFF dataset and tests aim to boost the\ndevelopment of more effective tools against deepfakes.\n","authors":["Haixu Song","Shiyu Huang","Yinpeng Dong","Wei-Wei Tu"],"pdf_url":"https://arxiv.org/pdf/2309.02218v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.02217v1","updated":"2023-09-05T13:22:16Z","published":"2023-09-05T13:22:16Z","title":"Advanced Underwater Image Restoration in Complex Illumination Conditions","summary":" Underwater image restoration has been a challenging problem for decades since\nthe advent of underwater photography. Most solutions focus on shallow water\nscenarios, where the scene is uniformly illuminated by the sunlight. However,\nthe vast majority of uncharted underwater terrain is located beyond 200 meters\ndepth where natural light is scarce and artificial illumination is needed. In\nsuch cases, light sources co-moving with the camera, dynamically change the\nscene appearance, which make shallow water restoration methods inadequate. In\nparticular for multi-light source systems (composed of dozens of LEDs\nnowadays), calibrating each light is time-consuming, error-prone and tedious,\nand we observe that only the integrated illumination within the viewing volume\nof the camera is critical, rather than the individual light sources. The key\nidea of this paper is therefore to exploit the appearance changes of objects or\nthe seafloor, when traversing the viewing frustum of the camera. Through new\nconstraints assuming Lambertian surfaces, corresponding image pixels constrain\nthe light field in front of the camera, and for each voxel a signal factor and\na backscatter value are stored in a volumetric grid that can be used for very\nefficient image restoration of camera-light platforms, which facilitates\nconsistently texturing large 3D models and maps that would otherwise be\ndominated by lighting and medium artifacts. To validate the effectiveness of\nour approach, we conducted extensive experiments on simulated and real-world\ndatasets. The results of these experiments demonstrate the robustness of our\napproach in restoring the true albedo of objects, while mitigating the\ninfluence of lighting and medium effects. Furthermore, we demonstrate our\napproach can be readily extended to other scenarios, including in-air imaging\nwith artificial illumination or other similar cases.\n","authors":["Yifan Song","Mengkun She","Kevin Köser"],"pdf_url":"https://arxiv.org/pdf/2309.02217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02210v1","updated":"2023-09-05T13:18:52Z","published":"2023-09-05T13:18:52Z","title":"Continual Cross-Dataset Adaptation in Road Surface Classification","summary":" Accurate road surface classification is crucial for autonomous vehicles (AVs)\nto optimize driving conditions, enhance safety, and enable advanced road\nmapping. However, deep learning models for road surface classification suffer\nfrom poor generalization when tested on unseen datasets. To update these models\nwith new information, also the original training dataset must be taken into\naccount, in order to avoid catastrophic forgetting. This is, however,\ninefficient if not impossible, e.g., when the data is collected in streams or\nlarge amounts. 
To overcome this limitation and enable fast and efficient\ncross-dataset adaptation, we propose to employ continual learning finetuning\nmethods designed to retain past knowledge while adapting to new data, thus\neffectively avoiding forgetting. Experimental results demonstrate the\nsuperiority of this approach over naive finetuning, achieving performance close\nto fresh retraining. While solving this known problem, we also provide a\ngeneral description of how the same technique can be adopted in other AV\nscenarios. We highlight the potential computational and economic benefits that\na continual-based adaptation can bring to the AV industry, while also reducing\ngreenhouse emissions due to unnecessary joint retraining.\n","authors":["Paolo Cudrano","Matteo Bellusci","Giuseppe Macino","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2309.02210v1.pdf","comment":"To be published in Proceedings of 26th IEEE International Conference\n on Intelligent Transportation Systems (ITSC 2023)"},{"id":"http://arxiv.org/abs/2309.02197v1","updated":"2023-09-05T12:57:32Z","published":"2023-09-05T12:57:32Z","title":"Delving into Ipsilateral Mammogram Assessment under Multi-View Network","summary":" In many recent years, multi-view mammogram analysis has been focused widely\non AI-based cancer assessment. In this work, we aim to explore diverse fusion\nstrategies (average and concatenate) and examine the model's learning behavior\nwith varying individuals and fusion pathways, involving Coarse Layer and Fine\nLayer. The Ipsilateral Multi-View Network, comprising five fusion types (Pre,\nEarly, Middle, Last, and Post Fusion) in ResNet-18, is employed. Notably, the\nMiddle Fusion emerges as the most balanced and effective approach, enhancing\ndeep-learning models' generalization performance by +5.29\\% (concatenate) and\n+5.9\\% (average) in VinDr-Mammo dataset and +2.03\\% (concatenate) and +3\\%\n(average) in CMMD dataset on macro F1-Score. The paper emphasizes the crucial\nrole of layer assignment in multi-view network extraction with various\nstrategies.\n","authors":["Thai Ngoc Toan Truong","Thanh-Huy Nguyen","Ba Thinh Lam","Vu Minh Duy Nguyen","Hong Phuc Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.02197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02190v1","updated":"2023-09-05T12:48:25Z","published":"2023-09-05T12:48:25Z","title":"Exchanging-based Multimodal Fusion with Transformer","summary":" We study the problem of multimodal fusion in this paper. Recent\nexchanging-based methods have been proposed for vision-vision fusion, which aim\nto exchange embeddings learned from one modality to the other. However, most of\nthem project inputs of multimodalities into different low-dimensional spaces\nand cannot be applied to the sequential input data. To solve these issues, in\nthis paper, we propose a novel exchanging-based multimodal fusion model MuSE\nfor text-vision fusion based on Transformer. We first use two encoders to\nseparately map multimodal inputs into different low-dimensional spaces. Then we\nemploy two decoders to regularize the embeddings and pull them into the same\nspace. The two decoders capture the correlations between texts and images with\nthe image captioning task and the text-to-image generation task, respectively.\nFurther, based on the regularized embeddings, we present CrossTransformer,\nwhich uses two Transformer encoders with shared parameters as the backbone\nmodel to exchange knowledge between multimodalities. 
Specifically,\nCrossTransformer first learns the global contextual information of the inputs\nin the shallow layers. After that, it performs inter-modal exchange by\nselecting a proportion of tokens in one modality and replacing their embeddings\nwith the average of embeddings in the other modality. We conduct extensive\nexperiments to evaluate the performance of MuSE on the Multimodal Named Entity\nRecognition task and the Multimodal Sentiment Analysis task. Our results show\nthe superiority of MuSE against other competitors. Our code and data are\nprovided at https://github.com/RecklessRonan/MuSE.\n","authors":["Renyu Zhu","Chengcheng Han","Yong Qian","Qiushi Sun","Xiang Li","Ming Gao","Xuezhi Cao","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2309.02190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02186v1","updated":"2023-09-05T12:44:57Z","published":"2023-09-05T12:44:57Z","title":"AniPortraitGAN: Animatable 3D Portrait Generation from 2D Image\n Collections","summary":" Previous animatable 3D-aware GANs for human generation have primarily focused\non either the human head or full body. However, head-only videos are relatively\nuncommon in real life, and full body generation typically does not deal with\nfacial expression control and still has challenges in generating high-quality\nresults. Towards applicable video avatars, we present an animatable 3D-aware\nGAN that generates portrait images with controllable facial expression, head\npose, and shoulder movements. It is a generative model trained on unstructured\n2D image collections without using 3D or video data. For the new task, we base\nour method on the generative radiance manifold representation and equip it with\nlearnable facial and head-shoulder deformations. A dual-camera rendering and\nadversarial learning scheme is proposed to improve the quality of the generated\nfaces, which is critical for portrait images. A pose deformation processing\nnetwork is developed to generate plausible deformations for challenging regions\nsuch as long hair. Experiments show that our method, trained on unstructured 2D\nimages, can generate diverse and high-quality 3D portraits with desired control\nover different properties.\n","authors":["Yue Wu","Sicheng Xu","Jianfeng Xiang","Fangyun Wei","Qifeng Chen","Jiaolong Yang","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2309.02186v1.pdf","comment":"SIGGRAPH Asia 2023. Project Page:\n https://yuewuhkust.github.io/AniPortraitGAN/"},{"id":"http://arxiv.org/abs/2309.02185v1","updated":"2023-09-05T12:42:26Z","published":"2023-09-05T12:42:26Z","title":"BEVTrack: A Simple Baseline for Point Cloud Tracking in Bird's-Eye-View","summary":" 3D single object tracking (SOT) in point clouds is still a challenging\nproblem due to appearance variation, distractors, and high sparsity of point\nclouds. Notably, in autonomous driving scenarios, the target object typically\nmaintains spatial adjacency across consecutive frames, predominantly moving\nhorizontally. This spatial continuity offers valuable prior knowledge for\ntarget localization. However, existing trackers, which often employ point-wise\nrepresentations, struggle to efficiently utilize this knowledge owing to the\nirregular format of such representations. Consequently, they require elaborate\ndesigns and solving multiple subtasks to establish spatial correspondence. In\nthis paper, we introduce BEVTrack, a simple yet strong baseline framework for\n3D SOT. 
After converting consecutive point clouds into the common\nBird's-Eye-View representation, BEVTrack inherently encodes spatial proximity\nand adeptly captures motion cues for tracking via a simple element-wise\noperation and convolutional layers. Additionally, to better deal with objects\nhaving diverse sizes and moving patterns, BEVTrack directly learns the\nunderlying motion distribution rather than making a fixed Laplacian or Gaussian\nassumption as in previous works. Without bells and whistles, BEVTrack achieves\nstate-of-the-art performance on KITTI and NuScenes datasets while maintaining a\nhigh inference speed of 122 FPS. The code will be released at\nhttps://github.com/xmm-prio/BEVTrack.\n","authors":["Yuxiang Yang","Yingqi Deng","Jiahao Nie","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02185v1.pdf","comment":"Technical report. Work in progress. The code will be released at\n https://github.com/xmm-prio/BEVTrack"},{"id":"http://arxiv.org/abs/2306.00402v2","updated":"2023-09-05T12:37:40Z","published":"2023-06-01T07:06:43Z","title":"Discriminative Deep Feature Visualization for Explainable Face\n Recognition","summary":" Despite the huge success of deep convolutional neural networks in face\nrecognition (FR) tasks, current methods lack explainability for their\npredictions because of their \"black-box\" nature. In recent years, studies have\nbeen carried out to give an interpretation of the decision of a deep FR system.\nHowever, the affinity between the input facial image and the extracted deep\nfeatures has not been explored. This paper contributes to the problem of\nexplainable face recognition by first conceiving a face reconstruction-based\nexplanation module, which reveals the correspondence between the deep feature\nand the facial regions. To further interpret the decision of an FR model, a\nnovel visual saliency explanation algorithm has been proposed. It provides\ninsightful explanation by producing visual saliency maps that represent similar\nand dissimilar regions between input faces. A detailed analysis has been\npresented for the generated visual explanation to show the effectiveness of the\nproposed method.\n","authors":["Zewei Xu","Yuhang Lu","Touradj Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2306.00402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08665v2","updated":"2023-09-05T12:35:36Z","published":"2023-03-15T14:52:46Z","title":"Cross-resolution Face Recognition via Identity-Preserving Network and\n Knowledge Distillation","summary":" Cross-resolution face recognition has become a challenging problem for modern\ndeep face recognition systems. It aims at matching a low-resolution probe image\nwith high-resolution gallery images registered in a database. Existing methods\nmainly leverage prior information from high-resolution images by either\nreconstructing facial details with super-resolution techniques or learning a\nunified feature space. To address this challenge, this paper proposes a new\napproach that enforces the network to focus on the discriminative information\nstored in the low-frequency components of a low-resolution image. A\ncross-resolution knowledge distillation paradigm is first employed as the\nlearning framework. Then, an identity-preserving network, WaveResNet, and a\nwavelet similarity loss are designed to capture low-frequency details and boost\nperformance. Finally, an image degradation model is conceived to simulate more\nrealistic low-resolution training data. 
Consequently, extensive experimental\nresults show that the proposed method consistently outperforms the baseline\nmodel and other state-of-the-art methods across a variety of image resolutions.\n","authors":["Yuhang Lu","Touradj Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2303.08665v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02179v1","updated":"2023-09-05T12:33:05Z","published":"2023-09-05T12:33:05Z","title":"High-resolution 3D Maps of Left Atrial Displacements using an\n Unsupervised Image Registration Neural Network","summary":" Functional analysis of the left atrium (LA) plays an increasingly important\nrole in the prognosis and diagnosis of cardiovascular diseases.\nEchocardiography-based measurements of LA dimensions and strains are useful\nbiomarkers, but they provide an incomplete picture of atrial deformations.\nHigh-resolution dynamic magnetic resonance images (Cine MRI) offer the\nopportunity to examine LA motion and deformation in 3D, at higher spatial\nresolution and with full LA coverage. However, there are no dedicated tools to\nautomatically characterise LA motion in 3D. Thus, we propose a tool that\nautomatically segments the LA and extracts the displacement fields across the\ncardiac cycle. The pipeline is able to accurately track the LA wall across the\ncardiac cycle with an average Hausdorff distance of $2.51 \\pm 1.3~mm$ and Dice\nscore of $0.96 \\pm 0.02$.\n","authors":["Christoforos Galazis","Anil Anthony Bharath","Marta Varela"],"pdf_url":"https://arxiv.org/pdf/2309.02179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02169v1","updated":"2023-09-05T12:16:14Z","published":"2023-09-05T12:16:14Z","title":"Dual Relation Alignment for Composed Image Retrieval","summary":" Composed image retrieval, a task involving the search for a target image\nusing a reference image and a complementary text as the query, has witnessed\nsignificant advancements owing to the progress made in cross-modal modeling.\nUnlike the general image-text retrieval problem with only one alignment\nrelation, i.e., image-text, we argue for the existence of two types of\nrelations in composed image retrieval. The explicit relation pertains to the\nreference image & complementary text-target image, which is commonly exploited\nby existing methods. Besides this intuitive relation, the observations during\nour practice have uncovered another implicit yet crucial relation, i.e.,\nreference image & target image-complementary text, since we found that the\ncomplementary text can be inferred by studying the relation between the target\nimage and the reference image. Regrettably, existing methods largely focus on\nleveraging the explicit relation to learn their networks, while overlooking the\nimplicit relation. In response to this weakness, We propose a new framework for\ncomposed image retrieval, termed dual relation alignment, which integrates both\nexplicit and implicit relations to fully exploit the correlations among the\ntriplets. Specifically, we design a vision compositor to fuse reference image\nand target image at first, then the resulted representation will serve two\nroles: (1) counterpart for semantic alignment with the complementary text and\n(2) compensation for the complementary text to boost the explicit relation\nmodeling, thereby implant the implicit relation into the alignment learning.\nOur method is evaluated on two popular datasets, CIRR and FashionIQ, through\nextensive experiments. 
The results confirm the effectiveness of our\ndual-relation learning in substantially enhancing composed image retrieval\nperformance.\n","authors":["Xintong Jiang","Yaxiong Wang","Yujiao Wu","Meng Wang","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2309.02169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11048v5","updated":"2023-09-05T12:10:51Z","published":"2022-06-22T13:12:54Z","title":"Automated GI tract segmentation using deep learning","summary":" The job of Radiation oncologists is to deliver x-ray beams pointed toward the\ntumor and at the same time avoid the stomach and intestines. With MR-Linacs\n(magnetic resonance imaging and linear accelerator systems), oncologists can\nvisualize the position of the tumor and allow for precise dose according to\ntumor cell presence which can vary from day to day. The current job of\noutlining the position of the stomach and intestines to adjust the X-ray beams\ndirection for the dose delivery to the tumor while avoiding the organs. This is\na time-consuming and labor-intensive process that can easily prolong treatments\nfrom 15 minutes to an hour a day unless deep learning methods can automate the\nsegmentation process. This paper discusses an automated segmentation process\nusing deep learning to make this process faster and allow more patients to get\neffective treatment.\n","authors":["Manhar Sharma"],"pdf_url":"https://arxiv.org/pdf/2206.11048v5.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02165v1","updated":"2023-09-05T12:08:43Z","published":"2023-09-05T12:08:43Z","title":"PCFGaze: Physics-Consistent Feature for Appearance-based Gaze Estimation","summary":" Although recent deep learning based gaze estimation approaches have achieved\nmuch improvement, we still know little about how gaze features are connected to\nthe physics of gaze. In this paper, we try to answer this question by analyzing\nthe gaze feature manifold. Our analysis revealed the insight that the geodesic\ndistance between gaze features is consistent with the gaze differences between\nsamples. According to this finding, we construct the Physics- Consistent\nFeature (PCF) in an analytical way, which connects gaze feature to the physical\ndefinition of gaze. We further propose the PCFGaze framework that directly\noptimizes gaze feature space by the guidance of PCF. Experimental results\ndemonstrate that the proposed framework alleviates the overfitting problem and\nsignificantly improves cross-domain gaze estimation accuracy without extra\ntraining data. The insight of gaze feature has the potential to benefit other\nregression tasks with physical meanings.\n","authors":["Yiwei Bao","Feng Lu"],"pdf_url":"https://arxiv.org/pdf/2309.02165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02159v1","updated":"2023-09-05T11:53:17Z","published":"2023-09-05T11:53:17Z","title":"The Adversarial Implications of Variable-Time Inference","summary":" Machine learning (ML) models are known to be vulnerable to a number of\nattacks that target the integrity of their predictions or the privacy of their\ntraining data. To carry out these attacks, a black-box adversary must typically\npossess the ability to query the model and observe its outputs (e.g., labels).\nIn this work, we demonstrate, for the first time, the ability to enhance such\ndecision-based attacks. 
To accomplish this, we present an approach that\nexploits a novel side channel in which the adversary simply measures the\nexecution time of the algorithm used to post-process the predictions of the ML\nmodel under attack. The leakage of inference-state elements into algorithmic\ntiming side channels has never been studied before, and we have found that it\ncan contain rich information that facilitates superior timing attacks that\nsignificantly outperform attacks based solely on label outputs. In a case\nstudy, we investigate leakage from the non-maximum suppression (NMS) algorithm,\nwhich plays a crucial role in the operation of object detectors. In our\nexamination of the timing side-channel vulnerabilities associated with this\nalgorithm, we identified the potential to enhance decision-based attacks. We\ndemonstrate attacks against the YOLOv3 detector, leveraging the timing leakage\nto successfully evade object detection using adversarial examples, and perform\ndataset inference. Our experiments show that our adversarial examples exhibit\nsuperior perturbation quality compared to a decision-based attack. In addition,\nwe present a new threat model in which dataset inference based solely on timing\nleakage is performed. To address the timing leakage vulnerability inherent in\nthe NMS algorithm, we explore the potential and limitations of implementing\nconstant-time inference passes as a mitigation strategy.\n","authors":["Dudi Biton","Aditi Misra","Efrat Levy","Jaidip Kotak","Ron Bitton","Roei Schuster","Nicolas Papernot","Yuval Elovici","Ben Nassi"],"pdf_url":"https://arxiv.org/pdf/2309.02159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02158v1","updated":"2023-09-05T11:50:38Z","published":"2023-09-05T11:50:38Z","title":"Traffic Light Recognition using Convolutional Neural Networks: A Survey","summary":" Real-time traffic light recognition is essential for autonomous driving. Yet,\na cohesive overview of the underlying model architectures for this task is\ncurrently missing. In this work, we conduct a comprehensive survey and analysis\nof traffic light recognition methods that use convolutional neural networks\n(CNNs). We focus on two essential aspects: datasets and CNN architectures.\nBased on an underlying architecture, we cluster methods into three major\ngroups: (1) modifications of generic object detectors which compensate for\nspecific task characteristics, (2) multi-stage approaches involving both\nrule-based and CNN components, and (3) task-specific single-stage methods. We\ndescribe the most important works in each cluster, discuss the usage of the\ndatasets, and identify research gaps.\n","authors":["Svetlana Pavlitska","Nico Lambing","Ashok Kumar Bangaru","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2309.02158v1.pdf","comment":"Accepted for publication at ITSC2023"},{"id":"http://arxiv.org/abs/2309.02155v1","updated":"2023-09-05T11:47:51Z","published":"2023-09-05T11:47:51Z","title":"S3C: Semi-Supervised VQA Natural Language Explanation via Self-Critical\n Learning","summary":" VQA Natural Language Explanation (VQA-NLE) task aims to explain the\ndecision-making process of VQA models in natural language. Unlike traditional\nattention or gradient analysis, free-text rationales can be easier to\nunderstand and gain users' trust. Existing methods mostly use post-hoc or\nself-rationalization models to obtain a plausible explanation. 
However, these\nframeworks are bottlenecked by the following challenges: 1) the reasoning\nprocess cannot be faithfully responded to and suffer from the problem of\nlogical inconsistency. 2) Human-annotated explanations are expensive and\ntime-consuming to collect. In this paper, we propose a new Semi-Supervised\nVQA-NLE via Self-Critical Learning (S3C), which evaluates the candidate\nexplanations by answering rewards to improve the logical consistency between\nanswers and rationales. With a semi-supervised learning framework, the S3C can\nbenefit from a tremendous amount of samples without human-annotated\nexplanations. A large number of automatic measures and human evaluations all\nshow the effectiveness of our method. Meanwhile, the framework achieves a new\nstate-of-the-art performance on the two VQA-NLE datasets.\n","authors":["Wei Suo","Mengyang Sun","Weisong Liu","Yiqi Gao","Peng Wang","Yanning Zhang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.02155v1.pdf","comment":"CVPR2023"},{"id":"http://arxiv.org/abs/2309.02150v1","updated":"2023-09-05T11:43:18Z","published":"2023-09-05T11:43:18Z","title":"Domain Adaptation for Satellite-Borne Hyperspectral Cloud Detection","summary":" The advent of satellite-borne machine learning hardware accelerators has\nenabled the on-board processing of payload data using machine learning\ntechniques such as convolutional neural networks (CNN). A notable example is\nusing a CNN to detect the presence of clouds in hyperspectral data captured on\nEarth observation (EO) missions, whereby only clear sky data is downlinked to\nconserve bandwidth. However, prior to deployment, new missions that employ new\nsensors will not have enough representative datasets to train a CNN model,\nwhile a model trained solely on data from previous missions will underperform\nwhen deployed to process the data on the new missions. This underperformance\nstems from the domain gap, i.e., differences in the underlying distributions of\nthe data generated by the different sensors in previous and future missions. In\nthis paper, we address the domain gap problem in the context of on-board\nhyperspectral cloud detection. Our main contributions lie in formulating new\ndomain adaptation tasks that are motivated by a concrete EO mission, developing\na novel algorithm for bandwidth-efficient supervised domain adaptation, and\ndemonstrating test-time adaptation algorithms on space deployable neural\nnetwork accelerators. Our contributions enable minimal data transmission to be\ninvoked (e.g., only 1% of the weights in ResNet50) to achieve domain\nadaptation, thereby allowing more sophisticated CNN models to be deployed and\nupdated on satellites without being hampered by domain gap and bandwidth\nlimitations.\n","authors":["Andrew Du","Anh-Dzung Doan","Yee Wei Law","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2309.02150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.04449v2","updated":"2023-09-05T11:42:07Z","published":"2022-05-09T17:51:44Z","title":"Introspective Deep Metric Learning for Image Retrieval","summary":" This paper proposes an introspective deep metric learning (IDML) framework\nfor uncertainty-aware comparisons of images. Conventional deep metric learning\nmethods produce confident semantic distances between images regardless of the\nuncertainty level. However, we argue that a good similarity model should\nconsider the semantic discrepancies with caution to better deal with ambiguous\nimages for more robust training. 
To achieve this, we propose to represent an\nimage using not only a semantic embedding but also an accompanying uncertainty\nembedding, which describes the semantic characteristics and ambiguity of an\nimage, respectively. We further propose an introspective similarity metric to\nmake similarity judgments between images considering both their semantic\ndifferences and ambiguities. The proposed IDML framework improves the\nperformance of deep metric learning through uncertainty modeling and attains\nstate-of-the-art results on the widely used CUB-200-2011, Cars196, and Stanford\nOnline Products datasets for image retrieval and clustering. We further provide\nan in-depth analysis of our framework to demonstrate the effectiveness and\nreliability of IDML. Code is available at: https://github.com/wzzheng/IDML.\n","authors":["Wenzhao Zheng","Chengkun Wang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2205.04449v2.pdf","comment":"The extended version of this paper is accepted to T-PAMI. Source code\n available at https://github.com/wzzheng/IDML"},{"id":"http://arxiv.org/abs/2309.02147v1","updated":"2023-09-05T11:39:29Z","published":"2023-09-05T11:39:29Z","title":"INCEPTNET: Precise And Early Disease Detection Application For Medical\n Images Analyses","summary":" In view of the recent paradigm shift in deep AI based image processing\nmethods, medical image processing has advanced considerably. In this study, we\npropose a novel deep neural network (DNN), entitled InceptNet, in the scope of\nmedical image processing, for early disease detection and segmentation of\nmedical images in order to enhance precision and performance. We also\ninvestigate the interaction of users with the InceptNet application to present\na comprehensive application including the background processes, and foreground\ninteractions with users. Fast InceptNet is shaped by the prominent Unet\narchitecture, and it seizes the power of an Inception module to be fast and\ncost effective while aiming to approximate an optimal local sparse structure.\nAdding Inception modules with various parallel kernel sizes can improve the\nnetwork's ability to capture the variations in the scaled regions of interest.\nTo experiment, the model is tested on four benchmark datasets, including retina\nblood vessel segmentation, lung nodule segmentation, skin lesion segmentation,\nand breast cancer cell detection. The improvement was more significant on\nimages with small scale structures. The proposed method improved the accuracy\nfrom 0.9531, 0.8900, 0.9872, and 0.9881 to 0.9555, 0.9510, 0.9945, and 0.9945\non the mentioned datasets, respectively, which show outperforming of the\nproposed method over the previous works. Furthermore, by exploring the\nprocedure from start to end, individuals who have utilized a trial edition of\nInceptNet, in the form of a complete application, are presented with thirteen\nmultiple choice questions in order to assess the proposed method. The outcomes\nare evaluated through the means of Human Computer Interaction.\n","authors":["Amirhossein Sajedi","Mohammad Javad Fadaeieslam"],"pdf_url":"https://arxiv.org/pdf/2309.02147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09244v2","updated":"2023-09-05T11:38:10Z","published":"2023-08-18T02:11:01Z","title":"SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera\n Videos","summary":" Camera-based 3D object detection in BEV (Bird's Eye View) space has drawn\ngreat attention over the past few years. 
Dense detectors typically follow a\ntwo-stage pipeline by first constructing a dense BEV feature and then\nperforming object detection in BEV space, which suffers from complex view\ntransformations and high computation cost. On the other side, sparse detectors\nfollow a query-based paradigm without explicit dense BEV feature construction,\nbut achieve worse performance than the dense counterparts. In this paper, we\nfind that the key to mitigate this performance gap is the adaptability of the\ndetector in both BEV and image space. To achieve this goal, we propose\nSparseBEV, a fully sparse 3D object detector that outperforms the dense\ncounterparts. SparseBEV contains three key designs, which are (1)\nscale-adaptive self attention to aggregate features with adaptive receptive\nfield in BEV space, (2) adaptive spatio-temporal sampling to generate sampling\nlocations under the guidance of queries, and (3) adaptive mixing to decode the\nsampled features with dynamic weights from the queries. On the test split of\nnuScenes, SparseBEV achieves the state-of-the-art performance of 67.5 NDS. On\nthe val split, SparseBEV achieves 55.8 NDS while maintaining a real-time\ninference speed of 23.5 FPS. Code is available at\nhttps://github.com/MCG-NJU/SparseBEV.\n","authors":["Haisong Liu","Yao Teng","Tao Lu","Haiguang Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09244v2.pdf","comment":"Accepted to ICCV 2023. This version fixes some typos"},{"id":"http://arxiv.org/abs/2309.02140v1","updated":"2023-09-05T11:30:38Z","published":"2023-09-05T11:30:38Z","title":"A Lightweight, Rapid and Efficient Deep Convolutional Network for Chest\n X-Ray Tuberculosis Detection","summary":" Tuberculosis (TB) is still recognized as one of the leading causes of death\nworldwide. Recent advances in deep learning (DL) have shown to enhance\nradiologists' ability to interpret chest X-ray (CXR) images accurately and with\nfewer errors, leading to a better diagnosis of this disease. However, little\nwork has been done to develop models capable of diagnosing TB that offer good\nperformance while being efficient, fast and computationally inexpensive. In\nthis work, we propose LightTBNet, a novel lightweight, fast and efficient deep\nconvolutional network specially customized to detect TB from CXR images. Using\na total of 800 frontal CXR images from two publicly available datasets, our\nsolution yielded an accuracy, F1 and area under the ROC curve (AUC) of 0.906,\n0.907 and 0.961, respectively, on an independent test subset. The proposed\nmodel demonstrates outstanding performance while delivering a rapid prediction,\nwith minimal computational and memory requirements, making it highly suitable\nfor deployment in handheld devices that can be used in low-resource areas with\nhigh TB prevalence. Code publicly available at\nhttps://github.com/dani-capellan/LightTBNet.\n","authors":["Daniel Capellán-Martín","Juan J. Gómez-Valverde","David Bermejo-Peláez","María J. Ledesma-Carbayo"],"pdf_url":"https://arxiv.org/pdf/2309.02140v1.pdf","comment":"5 pages, 3 figures, 3 tables. This paper has been accepted at ISBI\n 2023"},{"id":"http://arxiv.org/abs/2309.02139v1","updated":"2023-09-05T11:29:30Z","published":"2023-09-05T11:29:30Z","title":"Self-Supervised Pre-Training Boosts Semantic Scene Segmentation on LiDAR\n data","summary":" Airborne LiDAR systems have the capability to capture the Earth's surface by\ngenerating extensive point cloud data comprised of points mainly defined by 3D\ncoordinates. 
However, labeling such points for supervised learning tasks is\ntime-consuming. As a result, there is a need to investigate techniques that can\nlearn from unlabeled data to significantly reduce the number of annotated\nsamples. In this work, we propose to train a self-supervised encoder with\nBarlow Twins and use it as a pre-trained network in the task of semantic scene\nsegmentation. The experimental results demonstrate that our unsupervised\npre-training boosts performance once fine-tuned on the supervised task,\nespecially for under-represented categories.\n","authors":["Mariona Carós","Ariadna Just","Santi Seguí","Jordi Vitrià"],"pdf_url":"https://arxiv.org/pdf/2309.02139v1.pdf","comment":"International conference Machine Vision Applications 2023"},{"id":"http://arxiv.org/abs/2309.02120v1","updated":"2023-09-05T10:56:23Z","published":"2023-09-05T10:56:23Z","title":"Multi-label affordance mapping from egocentric vision","summary":" Accurate affordance detection and segmentation with pixel precision is an\nimportant piece in many complex systems based on interactions, such as robots\nand assitive devices. We present a new approach to affordance perception which\nenables accurate multi-label segmentation. Our approach can be used to\nautomatically extract grounded affordances from first person videos of\ninteractions using a 3D map of the environment providing pixel level precision\nfor the affordance location. We use this method to build the largest and most\ncomplete dataset on affordances based on the EPIC-Kitchen dataset, EPIC-Aff,\nwhich provides interaction-grounded, multi-label, metric and spatial affordance\nannotations. Then, we propose a new approach to affordance segmentation based\non multi-label detection which enables multiple affordances to co-exists in the\nsame space, for example if they are associated with the same object. We present\nseveral strategies of multi-label detection using several segmentation\narchitectures. The experimental results highlight the importance of the\nmulti-label detection. Finally, we show how our metric representation can be\nexploited for build a map of interaction hotspots in spatial action-centric\nzones and use that representation to perform a task-oriented navigation.\n","authors":["Lorenzo Mur-Labadia","Jose J. Guerrero","Ruben Martinez-Cantin"],"pdf_url":"https://arxiv.org/pdf/2309.02120v1.pdf","comment":"International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2309.02119v1","updated":"2023-09-05T10:52:21Z","published":"2023-09-05T10:52:21Z","title":"Hierarchical Masked 3D Diffusion Model for Video Outpainting","summary":" Video outpainting aims to adequately complete missing areas at the edges of\nvideo frames. Compared to image outpainting, it presents an additional\nchallenge as the model should maintain the temporal consistency of the filled\narea. In this paper, we introduce a masked 3D diffusion model for video\noutpainting. We use the technique of mask modeling to train the 3D diffusion\nmodel. This allows us to use multiple guide frames to connect the results of\nmultiple video clip inferences, thus ensuring temporal consistency and reducing\njitter between adjacent frames. Meanwhile, we extract the global frames of the\nvideo as prompts and guide the model to obtain information other than the\ncurrent video clip using cross-attention. We also introduce a hybrid\ncoarse-to-fine inference pipeline to alleviate the artifact accumulation\nproblem. 
The existing coarse-to-fine pipeline only uses the infilling strategy,\nwhich brings degradation because the time interval of the sparse frames is too\nlarge. Our pipeline benefits from bidirectional learning of the mask modeling\nand thus can employ a hybrid strategy of infilling and interpolation when\ngenerating sparse frames. Experiments show that our method achieves\nstate-of-the-art results in video outpainting tasks. More results are provided\nat our https://fanfanda.github.io/M3DDM/.\n","authors":["Fanda Fan","Chaoxu Guo","Litong Gong","Biao Wang","Tiezheng Ge","Yuning Jiang","Chunjie Luo","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.02119v1.pdf","comment":"ACM MM 2023 accepted"},{"id":"http://arxiv.org/abs/2309.02102v1","updated":"2023-09-05T10:21:37Z","published":"2023-09-05T10:21:37Z","title":"Iterative Superquadric Recomposition of 3D Objects from Multiple Views","summary":" Humans are good at recomposing novel objects, i.e. they can identify\ncommonalities between unknown objects from general structure to finer detail,\nan ability difficult to replicate by machines. We propose a framework, ISCO, to\nrecompose an object using 3D superquadrics as semantic parts directly from 2D\nviews without training a model that uses 3D supervision. To achieve this, we\noptimize the superquadric parameters that compose a specific instance of the\nobject, comparing its rendered 3D view and 2D image silhouette. Our ISCO\nframework iteratively adds new superquadrics wherever the reconstruction error\nis high, abstracting first coarse regions and then finer details of the target\nobject. With this simple coarse-to-fine inductive bias, ISCO provides\nconsistent superquadrics for related object parts, despite not having any\nsemantic supervision. Since ISCO does not train any neural network, it is also\ninherently robust to out-of-distribution objects. Experiments show that,\ncompared to recent single instance superquadrics reconstruction approaches,\nISCO provides consistently more accurate 3D reconstructions, even from images\nin the wild. Code available at https://github.com/ExplainableML/ISCO .\n","authors":["Stephan Alaniz","Massimiliano Mancini","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2309.02102v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02099v1","updated":"2023-09-05T10:08:11Z","published":"2023-09-05T10:08:11Z","title":"Towards Diverse and Consistent Typography Generation","summary":" In this work, we consider the typography generation task that aims at\nproducing diverse typographic styling for the given graphic document. We\nformulate typography generation as a fine-grained attribute generation for\nmultiple text elements and build an autoregressive model to generate diverse\ntypography that matches the input design context. We further propose a simple\nyet effective sampling approach that respects the consistency and distinction\nprinciple of typography so that generated examples share consistent typographic\nstyling across text elements. 
Our empirical study shows that our model\nsuccessfully generates diverse typographic designs while preserving a\nconsistent typographic structure.\n","authors":["Wataru Shimoda","Daichi Haraguchi","Seiichi Uchida","Kota Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2309.02099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13148v2","updated":"2023-09-05T10:05:19Z","published":"2023-03-23T10:03:12Z","title":"Calibrated Out-of-Distribution Detection with a Generic Representation","summary":" Out-of-distribution detection is a common issue in deploying vision models in\npractice and solving it is an essential building block in safety critical\napplications. Most of the existing OOD detection solutions focus on improving\nthe OOD robustness of a classification model trained exclusively on\nin-distribution (ID) data. In this work, we take a different approach and\npropose to leverage generic pre-trained representation. We propose a novel OOD\nmethod, called GROOD, that formulates the OOD detection as a Neyman-Pearson\ntask with well calibrated scores and which achieves excellent performance,\npredicated by the use of a good generic representation. Only a trivial training\nprocess is required for adapting GROOD to a particular problem. The method is\nsimple, general, efficient, calibrated and with only a few hyper-parameters.\nThe method achieves state-of-the-art performance on a number of OOD benchmarks,\nreaching near perfect performance on several of them. The source code is\navailable at https://github.com/vojirt/GROOD.\n","authors":["Tomas Vojir","Jan Sochman","Rahaf Aljundi","Jiri Matas"],"pdf_url":"https://arxiv.org/pdf/2303.13148v2.pdf","comment":"10 pages, accepted to Workshop on Uncertainty Quantification for\n Computer Vision, ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11917v2","updated":"2023-09-05T09:56:23Z","published":"2023-08-23T05:03:06Z","title":"LFS-GAN: Lifelong Few-Shot Image Generation","summary":" We address a challenging lifelong few-shot image generation task for the\nfirst time. In this situation, a generative model learns a sequence of tasks\nusing only a few samples per task. Consequently, the learned model encounters\nboth catastrophic forgetting and overfitting problems at a time. Existing\nstudies on lifelong GANs have proposed modulation-based methods to prevent\ncatastrophic forgetting. However, they require considerable additional\nparameters and cannot generate high-fidelity and diverse images from limited\ndata. On the other hand, the existing few-shot GANs suffer from severe\ncatastrophic forgetting when learning multiple tasks. To alleviate these\nissues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can\ngenerate high-quality and diverse images in lifelong few-shot image generation\ntask. Our proposed framework learns each task using an efficient task-specific\nmodulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and\nhas a rich representation ability due to its unique reconstruction technique.\nFurthermore, we propose a novel mode seeking loss to improve the diversity of\nour model in low-data circumstances. Extensive experiments demonstrate that the\nproposed LFS-GAN can generate high-fidelity and diverse images without any\nforgetting and mode collapse in various domains, achieving state-of-the-art in\nlifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN\neven outperforms the existing few-shot GANs in the few-shot image generation\ntask. 
The code is available at Github.\n","authors":["Juwon Seo","Ji-Su Kang","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2308.11917v2.pdf","comment":"20 pages, 19 figures, 14 tables, ICCV 2023 Poster"},{"id":"http://arxiv.org/abs/2309.02091v1","updated":"2023-09-05T09:54:26Z","published":"2023-09-05T09:54:26Z","title":"DeNISE: Deep Networks for Improved Segmentation Edges","summary":" This paper presents Deep Networks for Improved Segmentation Edges (DeNISE), a\nnovel data enhancement technique using edge detection and segmentation models\nto improve the boundary quality of segmentation masks. DeNISE utilizes the\ninherent differences in two sequential deep neural architectures to improve the\naccuracy of the predicted segmentation edge. DeNISE applies to all types of\nneural networks and is not trained end-to-end, allowing rapid experiments to\ndiscover which models complement each other. We test and apply DeNISE for\nbuilding segmentation in aerial images. Aerial images are known for difficult\nconditions as they have a low resolution with optical noise, such as\nreflections, shadows, and visual obstructions. Overall the paper demonstrates\nthe potential for DeNISE. Using the technique, we improve the baseline results\nwith a building IoU of 78.9%.\n","authors":["Sander Riisøen Jyhne","Per-Arne Andersen","Morten Goodwin"],"pdf_url":"https://arxiv.org/pdf/2309.02091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02088v1","updated":"2023-09-05T09:50:31Z","published":"2023-09-05T09:50:31Z","title":"Dual Adversarial Alignment for Realistic Support-Query Shift Few-shot\n Learning","summary":" Support-query shift few-shot learning aims to classify unseen examples (query\nset) to labeled data (support set) based on the learned embedding in a\nlow-dimensional space under a distribution shift between the support set and\nthe query set. However, in real-world scenarios the shifts are usually unknown\nand varied, making it difficult to estimate in advance. Therefore, in this\npaper, we propose a novel but more difficult challenge, RSQS, focusing on\nRealistic Support-Query Shift few-shot learning. The key feature of RSQS is\nthat the individual samples in a meta-task are subjected to multiple\ndistribution shifts in each meta-task. In addition, we propose a unified\nadversarial feature alignment method called DUal adversarial ALignment\nframework (DuaL) to relieve RSQS from two aspects, i.e., inter-domain bias and\nintra-domain variance. On the one hand, for the inter-domain bias, we corrupt\nthe original data in advance and use the synthesized perturbed inputs to train\nthe repairer network by minimizing distance in the feature level. On the other\nhand, for intra-domain variance, we proposed a generator network to synthesize\nhard, i.e., less similar, examples from the support set in a self-supervised\nmanner and introduce regularized optimal transportation to derive a smooth\noptimal transportation plan. Lastly, a benchmark of RSQS is built with several\nstate-of-the-art baselines among three datasets (CIFAR100, mini-ImageNet, and\nTiered-Imagenet). 
Experiment results show that DuaL significantly outperforms\nthe state-of-the-art methods in our benchmark.\n","authors":["Siyang Jiang","Rui Fang","Hsi-Wen Chen","Wei Ding","Ming-Syan Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02088v1.pdf","comment":"Best student paper in PAKDD 2022"},{"id":"http://arxiv.org/abs/2307.16489v2","updated":"2023-09-05T09:43:40Z","published":"2023-07-31T08:34:24Z","title":"BAGM: A Backdoor Attack for Manipulating Text-to-Image Generative Models","summary":" The rise in popularity of text-to-image generative artificial intelligence\n(AI) has attracted widespread public interest. We demonstrate that this\ntechnology can be attacked to generate content that subtly manipulates its\nusers. We propose a Backdoor Attack on text-to-image Generative Models (BAGM),\nwhich upon triggering, infuses the generated images with manipulative details\nthat are naturally blended in the content. Our attack is the first to target\nthree popular text-to-image generative models across three stages of the\ngenerative process by modifying the behaviour of the embedded tokenizer, the\nlanguage model or the image generative model. Based on the penetration level,\nBAGM takes the form of a suite of attacks that are referred to as surface,\nshallow and deep attacks in this article. Given the existing gap within this\ndomain, we also contribute a comprehensive set of quantitative metrics designed\nspecifically for assessing the effectiveness of backdoor attacks on\ntext-to-image models. The efficacy of BAGM is established by attacking\nstate-of-the-art generative models, using a marketing scenario as the target\ndomain. To that end, we contribute a dataset of branded product images. Our\nembedded backdoors increase the bias towards the target outputs by more than\nfive times the usual, without compromising the model robustness or the\ngenerated content utility. By exposing generative AI's vulnerabilities, we\nencourage researchers to tackle these challenges and practitioners to exercise\ncaution when using pre-trained models. Relevant code, input prompts and\nsupplementary material can be found at https://github.com/JJ-Vice/BAGM, and the\ndataset is available at:\nhttps://ieee-dataport.org/documents/marketable-foods-mf-dataset.\n Keywords: Generative Artificial Intelligence, Generative Models,\nText-to-Image generation, Backdoor Attacks, Trojan, Stable Diffusion.\n","authors":["Jordan Vice","Naveed Akhtar","Richard Hartley","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2307.16489v2.pdf","comment":"This research was supported by National Intelligence and Security\n Discovery Research Grants (project# NS220100007), funded by the Department of\n Defence Australia"},{"id":"http://arxiv.org/abs/2306.05029v2","updated":"2023-09-05T09:43:02Z","published":"2023-06-08T08:29:10Z","title":"Multi-level Multiple Instance Learning with Transformer for Whole Slide\n Image Classification","summary":" Whole slide image (WSI) refers to a type of high-resolution scanned tissue\nimage, which is extensively employed in computer-assisted diagnosis (CAD). The\nextremely high resolution and limited availability of region-level annotations\nmake employing deep learning methods for WSI-based digital diagnosis\nchallenging. Recently integrating multiple instance learning (MIL) and\nTransformer for WSI analysis shows very promising results. However, designing\neffective Transformers for this weakly-supervised high-resolution image\nanalysis is an underexplored yet important problem. 
In this paper, we propose a\nMulti-level MIL (MMIL) scheme by introducing a hierarchical structure to MIL,\nwhich enables efficient handling of MIL tasks involving a large number of\ninstances. Based on MMIL, we instantiated MMIL-Transformer, an efficient\nTransformer model with windowed exact self-attention for large-scale MIL tasks.\nTo validate its effectiveness, we conducted a set of experiments on WSI\nclassification tasks, where MMIL-Transformer demonstrate superior performance\ncompared to existing state-of-the-art methods, i.e., 96.80% test AUC and 97.67%\ntest accuracy on the CAMELYON16 dataset, 99.04% test AUC and 94.37% test\naccuracy on the TCGA-NSCLC dataset, respectively. All code and pre-trained\nmodels are available at: https://github.com/hustvl/MMIL-Transformer\n","authors":["Ruijie Zhang","Qiaozhe Zhang","Yingzhuang Liu","Hao Xin","Yan Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.05029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02067v1","updated":"2023-09-05T09:11:18Z","published":"2023-09-05T09:11:18Z","title":"Histograms of Points, Orientations, and Dynamics of Orientations\n Features for Hindi Online Handwritten Character Recognition","summary":" A set of features independent of character stroke direction and order\nvariations is proposed for online handwritten character recognition. A method\nis developed that maps features like co-ordinates of points, orientations of\nstrokes at points, and dynamics of orientations of strokes at points spatially\nas a function of co-ordinate values of the points and computes histograms of\nthese features from different regions in the spatial map.\n Different features like spatio-temporal, discrete Fourier transform, discrete\ncosine transform, discrete wavelet transform, spatial, and histograms of\noriented gradients used in other studies for training classifiers for character\nrecognition are considered. The classifier chosen for classification\nperformance comparison, when trained with different features, is support vector\nmachines (SVM).\n The character datasets used for training and testing the classifiers consist\nof online handwritten samples of 96 different Hindi characters. There are 12832\nand 2821 samples in training and testing datasets, respectively.\n SVM classifiers trained with the proposed features has the highest\nclassification accuracy of 92.9\\% when compared to the performances of SVM\nclassifiers trained with the other features and tested on the same testing\ndataset. Therefore, the proposed features have better character discriminative\ncapability than the other features considered for comparison.\n","authors":["Anand Sharma","A. G. Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2309.02067v1.pdf","comment":"21 pages, 12 jpg figures"},{"id":"http://arxiv.org/abs/2308.16568v2","updated":"2023-09-05T09:09:51Z","published":"2023-08-31T09:02:53Z","title":"Shape of my heart: Cardiac models through learned signed distance\n functions","summary":" The efficient construction of an anatomical model is one of the major\nchallenges of patient-specific in-silico models of the human heart. Current\nmethods frequently rely on linear statistical models, allowing no advanced\ntopological changes, or requiring medical image segmentation followed by a\nmeshing pipeline, which strongly depends on image resolution, quality, and\nmodality. These approaches are therefore limited in their transferability to\nother imaging domains. 
In this work, the cardiac shape is reconstructed by\nmeans of three-dimensional deep signed distance functions with Lipschitz\nregularity. For this purpose, the shapes of cardiac MRI reconstructions are\nlearned from public databases to model the spatial relation of multiple\nchambers in Cartesian space. We demonstrate that this approach is also capable\nof reconstructing anatomical models from partial data, such as point clouds\nfrom a single ventricle, or modalities different from the trained MRI, such as\nelectroanatomical mapping, and in addition, allows us to generate new\nanatomical shapes by randomly sampling latent vectors.\n","authors":["Jan Verhülsdonk","Thomas Grandits","Francisco Sahli Costabal","Rolf Krause","Angelo Auricchio","Gundolf Haase","Simone Pezzuto","Alexander Effland"],"pdf_url":"https://arxiv.org/pdf/2308.16568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.08569v3","updated":"2023-09-05T09:05:15Z","published":"2022-07-18T12:53:53Z","title":"Multi-manifold Attention for Vision Transformers","summary":" Vision Transformers are very popular nowadays due to their state-of-the-art\nperformance in several computer vision tasks, such as image classification and\naction recognition. Although their performance has been greatly enhanced\nthrough highly descriptive patch embeddings and hierarchical structures, there\nis still limited research on utilizing additional data representations so as to\nrefine the selfattention map of a Transformer. To address this problem, a novel\nattention mechanism, called multi-manifold multihead attention, is proposed in\nthis work to substitute the vanilla self-attention of a Transformer. The\nproposed mechanism models the input space in three distinct manifolds, namely\nEuclidean, Symmetric Positive Definite and Grassmann, thus leveraging different\nstatistical and geometrical properties of the input for the computation of a\nhighly descriptive attention map. In this way, the proposed attention mechanism\ncan guide a Vision Transformer to become more attentive towards important\nappearance, color and texture features of an image, leading to improved\nclassification and segmentation results, as shown by the experimental results\non well-known datasets.\n","authors":["Dimitrios Konstantinidis","Ilias Papastratis","Kosmas Dimitropoulos","Petros Daras"],"pdf_url":"https://arxiv.org/pdf/2207.08569v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.02054v1","updated":"2023-09-05T08:56:20Z","published":"2023-09-05T08:56:20Z","title":"An Adaptive Spatial-Temporal Local Feature Difference Method for\n Infrared Small-moving Target Detection","summary":" Detecting small moving targets accurately in infrared (IR) image sequences is\na significant challenge. To address this problem, we propose a novel method\ncalled spatial-temporal local feature difference (STLFD) with adaptive\nbackground suppression (ABS). Our approach utilizes filters in the spatial and\ntemporal domains and performs pixel-level ABS on the output to enhance the\ncontrast between the target and the background. The proposed method comprises\nthree steps. First, we obtain three temporal frame images based on the current\nframe image and extract two feature maps using the designed spatial domain and\ntemporal domain filters. 
Next, we fuse the information of the spatial domain\nand temporal domain to produce the spatial-temporal feature maps and suppress\nnoise using our pixel-level ABS module. Finally, we obtain the segmented binary\nmap by applying a threshold. Our experimental results demonstrate that the\nproposed method outperforms existing state-of-the-art methods for infrared\nsmall-moving target detection.\n","authors":["Yongkang Zhao","Chuang Zhu","Yuan Li","Shuaishuai Wang","Zihan Lan","Yuanyuan Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.02054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02049v1","updated":"2023-09-05T08:49:53Z","published":"2023-09-05T08:49:53Z","title":"Diffusion-based 3D Object Detection with Random Boxes","summary":" 3D object detection is an essential task for achieving autonomous driving.\nExisting anchor-based detection methods rely on empirical heuristics setting of\nanchors, which makes the algorithms lack elegance. In recent years, we have\nwitnessed the rise of several generative models, among which diffusion models\nshow great potential for learning the transformation of two distributions. Our\nproposed Diff3Det migrates the diffusion model to proposal generation for 3D\nobject detection by considering the detection boxes as generative targets.\nDuring training, the object boxes diffuse from the ground truth boxes to the\nGaussian distribution, and the decoder learns to reverse this noise process. In\nthe inference stage, the model progressively refines a set of random boxes to\nthe prediction results. We provide detailed experiments on the KITTI benchmark\nand achieve promising performance compared to classical anchor-based 3D\ndetection methods.\n","authors":["Xin Zhou","Jinghua Hou","Tingting Yao","Dingkang Liang","Zhe Liu","Zhikang Zou","Xiaoqing Ye","Jianwei Cheng","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2309.02049v1.pdf","comment":"Accepted by PRCV 2023"},{"id":"http://arxiv.org/abs/2309.02043v1","updated":"2023-09-05T08:37:58Z","published":"2023-09-05T08:37:58Z","title":"Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth\n Completion","summary":" RGB-guided depth completion aims at predicting dense depth maps from sparse\ndepth measurements and corresponding RGB images, where how to effectively and\nefficiently exploit the multi-modal information is a key issue. Guided dynamic\nfilters, which generate spatially-variant depth-wise separable convolutional\nfilters from RGB features to guide depth features, have been proven to be\neffective in this task. However, the dynamically generated filters require\nmassive model parameters, computational costs and memory footprints when the\nnumber of feature channels is large. In this paper, we propose to decompose the\nguided dynamic filters into a spatially-shared component multiplied by\ncontent-adaptive adaptors at each spatial location. Based on the proposed idea,\nwe introduce two decomposition schemes A and B, which decompose the filters by\nsplitting the filter structure and using spatial-wise attention, respectively.\nThe decomposed filters not only maintain the favorable properties of guided\ndynamic filters as being content-dependent and spatially-variant, but also\nreduce model parameters and hardware costs, as the learned adaptors are\ndecoupled with the number of feature channels. 
Extensive experimental results\ndemonstrate that the methods using our schemes outperform state-of-the-art\nmethods on the KITTI dataset, and rank 1st and 2nd on the KITTI benchmark at\nthe time of submission. Meanwhile, they also achieve comparable performance on\nthe NYUv2 dataset. In addition, our proposed methods are general and could be\nemployed as plug-and-play feature fusion blocks in other multi-modal fusion\ntasks such as RGB-D salient object detection.\n","authors":["Yufei Wang","Yuxin Mao","Qi Liu","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2309.02043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02041v1","updated":"2023-09-05T08:34:23Z","published":"2023-09-05T08:34:23Z","title":"Learning Cross-Modal Affinity for Referring Video Object Segmentation\n Targeting Limited Samples","summary":" Referring video object segmentation (RVOS), as a supervised learning task,\nrelies on sufficient annotated data for a given scene. However, in more\nrealistic scenarios, only minimal annotations are available for a new scene,\nwhich poses significant challenges to existing RVOS methods. With this in mind,\nwe propose a simple yet effective model with a newly designed cross-modal\naffinity (CMA) module based on a Transformer architecture. The CMA module\nbuilds multimodal affinity with a few samples, thus quickly learning new\nsemantic information, and enabling the model to adapt to different scenarios.\nSince the proposed method targets limited samples for new scenes, we generalize\nthe problem as - few-shot referring video object segmentation (FS-RVOS). To\nfoster research in this direction, we build up a new FS-RVOS benchmark based on\ncurrently available datasets. The benchmark covers a wide range and includes\nmultiple situations, which can maximally simulate real-world scenarios.\nExtensive experiments show that our model adapts well to different scenarios\nwith only a few samples, reaching state-of-the-art performance on the\nbenchmark. On Mini-Ref-YouTube-VOS, our model achieves an average performance\nof 53.1 J and 54.8 F, which are 10% better than the baselines. Furthermore, we\nshow impressive results of 77.7 J and 74.8 F on Mini-Ref-SAIL-VOS, which are\nsignificantly better than the baselines. Code is publicly available at\nhttps://github.com/hengliusky/Few_shot_RVOS.\n","authors":["Guanghui Li","Mingqi Gao","Heng Liu","Xiantong Zhen","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.02041v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2309.02031v1","updated":"2023-09-05T08:21:16Z","published":"2023-09-05T08:21:16Z","title":"A survey on efficient vision transformers: algorithms, techniques, and\n performance benchmarking","summary":" Vision Transformer (ViT) architectures are becoming increasingly popular and\nwidely employed to tackle computer vision applications. Their main feature is\nthe capacity to extract global information through the self-attention\nmechanism, outperforming earlier convolutional neural networks. However, ViT\ndeployment and performance have grown steadily with their size, number of\ntrainable parameters, and operations. Furthermore, self-attention's\ncomputational and memory cost quadratically increases with the image\nresolution. Generally speaking, it is challenging to employ these architectures\nin real-world applications due to many hardware and environmental restrictions,\nsuch as processing and computational capabilities. 
Therefore, this survey\ninvestigates the most efficient methodologies to ensure sub-optimal estimation\nperformances. More in detail, four efficient categories will be analyzed:\ncompact architecture, pruning, knowledge distillation, and quantization\nstrategies. Moreover, a new metric called Efficient Error Rate has been\nintroduced in order to normalize and compare models' features that affect\nhardware devices at inference time, such as the number of parameters, bits,\nFLOPs, and model size. Summarizing, this paper firstly mathematically defines\nthe strategies used to make Vision Transformer efficient, describes and\ndiscusses state-of-the-art methodologies, and analyzes their performances over\ndifferent application scenarios. Toward the end of this paper, we also discuss\nopen challenges and promising research directions.\n","authors":["Lorenzo Papa","Paolo Russo","Irene Amerini","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.02031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02022v1","updated":"2023-09-05T08:00:01Z","published":"2023-09-05T08:00:01Z","title":"Dynamic Early Exiting Predictive Coding Neural Networks","summary":" Internet of Things (IoT) sensors are nowadays heavily utilized in various\nreal-world applications ranging from wearables to smart buildings passing by\nagrotechnology and health monitoring. With the huge amounts of data generated\nby these tiny devices, Deep Learning (DL) models have been extensively used to\nenhance them with intelligent processing. However, with the urge for smaller\nand more accurate devices, DL models became too heavy to deploy. It is thus\nnecessary to incorporate the hardware's limited resources in the design\nprocess. Therefore, inspired by the human brain known for its efficiency and\nlow power consumption, we propose a shallow bidirectional network based on\npredictive coding theory and dynamic early exiting for halting further\ncomputations when a performance threshold is surpassed. We achieve comparable\naccuracy to VGG-16 in image classification on CIFAR-10 with fewer parameters\nand less computational complexity.\n","authors":["Alaa Zniber","Ouassim Karrakchou","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2309.02022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02020v1","updated":"2023-09-05T07:58:21Z","published":"2023-09-05T07:58:21Z","title":"RawHDR: High Dynamic Range Image Reconstruction from a Single Raw Image","summary":" High dynamic range (HDR) images capture much more intensity levels than\nstandard ones. Current methods predominantly generate HDR images from 8-bit low\ndynamic range (LDR) sRGB images that have been degraded by the camera\nprocessing pipeline. However, it becomes a formidable task to retrieve\nextremely high dynamic range scenes from such limited bit-depth data. Unlike\nexisting methods, the core idea of this work is to incorporate more informative\nRaw sensor data to generate HDR images, aiming to recover scene information in\nhard regions (the darkest and brightest areas of an HDR scene). To this end, we\npropose a model tailor-made for Raw images, harnessing the unique features of\nRaw data to facilitate the Raw-to-HDR mapping. Specifically, we learn exposure\nmasks to separate the hard and easy regions of a high dynamic scene. Then, we\nintroduce two important guidances, dual intensity guidance, which guides less\ninformative channels with more informative ones, and global spatial guidance,\nwhich extrapolates scene specifics over an extended spatial domain. 
To verify\nour Raw-to-HDR approach, we collect a large Raw/HDR paired dataset for both\ntraining and testing. Our empirical evaluations validate the superiority of the\nproposed Raw-to-HDR reconstruction model, as well as our newly captured dataset\nin the experiments.\n","authors":["Yunhao Zou","Chenggang Yan","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2309.02020v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02007v1","updated":"2023-09-05T07:45:35Z","published":"2023-09-05T07:45:35Z","title":"Logarithmic Mathematical Morphology: theory and applications","summary":" Classically, in Mathematical Morphology, an image (i.e., a grey-level\nfunction) is analysed by another image which is named the structuring element\nor the structuring function. This structuring function is moved over the image\ndomain and summed to the image. However, in an image presenting lighting\nvariations, the analysis by a structuring function should require that its\namplitude varies according to the image intensity. Such a property is not\nverified in Mathematical Morphology for grey level functions, when the\nstructuring function is summed to the image with the usual additive law. In\norder to address this issue, a new framework is defined with an additive law\nfor which the amplitude of the structuring function varies according to the\nimage amplitude. This additive law is chosen within the Logarithmic Image\nProcessing framework and models the lighting variations with a physical cause\nsuch as a change of light intensity or a change of camera exposure-time. The\nnew framework is named Logarithmic Mathematical Morphology (LMM) and allows the\ndefinition of operators which are robust to such lighting variations. In images\nwith uniform lighting variations, those new LMM operators perform better than\nusual morphological operators. In eye-fundus images with non-uniform lighting\nvariations, a LMM method for vessel segmentation is compared to three\nstate-of-the-art approaches. Results show that the LMM approach has a better\nrobustness to such variations than the three others.\n","authors":["Guillaume Noyel"],"pdf_url":"https://arxiv.org/pdf/2309.02007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09917v3","updated":"2023-09-05T07:39:09Z","published":"2023-08-19T05:49:13Z","title":"Learning Multiscale Consistency for Self-supervised Electron Microscopy\n Instance Segmentation","summary":" Instance segmentation in electron microscopy (EM) volumes is tough due to\ncomplex shapes and sparse annotations. Self-supervised learning helps but still\nstruggles with intricate visual patterns in EM. To address this, we propose a\npretraining framework that enhances multiscale consistency in EM volumes. Our\napproach leverages a Siamese network architecture, integrating both strong and\nweak data augmentations to effectively extract multiscale features. We uphold\nvoxel-level coherence by reconstructing the original input data from these\naugmented instances. Furthermore, we incorporate cross-attention mechanisms to\nfacilitate fine-grained feature alignment between these augmentations. Finally,\nwe apply contrastive learning techniques across a feature pyramid, allowing us\nto distill distinctive representations spanning various scales. After\npretraining on four large-scale EM datasets, our framework significantly\nimproves downstream tasks like neuron and mitochondria segmentation, especially\nwith limited finetuning data. 
It effectively captures voxel and feature\nconsistency, showing promise for learning transferable representations for EM\nanalysis.\n","authors":["Yinda Chen","Wei Huang","Xiaoyu Liu","Shiyu Deng","Qi Chen","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.09917v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14334v2","updated":"2023-09-05T07:35:58Z","published":"2023-08-28T06:25:40Z","title":"MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation\n Pattern Matching","summary":" Real-world vision tasks frequently suffer from the appearance of adverse\nweather conditions including rain, fog, snow, and raindrops in captured images.\nRecently, several generic methods for restoring weather-degraded images have\nbeen proposed, aiming to remove multiple types of adverse weather effects\npresent in the images. However, these methods have considered weather as\ndiscrete and mutually exclusive variables, leading to failure in generalizing\nto unforeseen weather conditions beyond the scope of the training data, such as\nthe co-occurrence of rain, fog, and raindrops. To this end, weather-degraded\nimage restoration models should have flexible adaptability to the current\nunknown weather condition to ensure reliable and optimal performance. The\nadaptation method should also be able to cope with data scarcity for real-world\nadaptation. This paper proposes MetaWeather, a few-shot weather-degraded image\nrestoration method for arbitrary weather conditions. For this, we devise the\ncore piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM),\nwhich leverages representations from a few-shot support set by matching\nfeatures between input and sample images under new weather conditions. In\naddition, we build meta-knowledge with episodic meta-learning on top of our\nMetaWeather architecture to provide flexible adaptability. In the meta-testing\nphase, we adopt a parameter-efficient fine-tuning method to preserve the\nprebuilt knowledge and avoid the overfitting problem. Experiments on the BID\nTask II.A dataset show our method achieves the best performance on PSNR and\nSSIM compared to state-of-the-art image restoration methods. Code is available\nat (TBA).\n","authors":["Youngrae Kim","Younggeol Cho","Thanh-Tung Nguyen","Dongman Lee"],"pdf_url":"https://arxiv.org/pdf/2308.14334v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.02001v1","updated":"2023-09-05T07:31:22Z","published":"2023-09-05T07:31:22Z","title":"Analyzing domain shift when using additional data for the MICCAI KiTS23\n Challenge","summary":" Using additional training data is known to improve the results, especially\nfor medical image 3D segmentation where there is a lack of training material\nand the model needs to generalize well from few available data. However, the\nnew data could have been acquired using other instruments and preprocessed such that\nits distribution is significantly different from the original training data.\nTherefore, we study techniques which ameliorate domain shift during training so\nthat the additional data becomes better usable for preprocessing and training\ntogether with the original data. Our results show that transforming the\nadditional data using histogram matching has better results than using simple\nnormalization.\n","authors":["George Stoica","Mihaela Breaban","Vlad Barbu"],"pdf_url":"https://arxiv.org/pdf/2309.02001v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. 
The Version of Record of this contribution is\n published in [TODO], and is available online at https://doi.org/[TODO]"},{"id":"http://arxiv.org/abs/2309.00233v2","updated":"2023-09-05T07:22:43Z","published":"2023-09-01T03:34:12Z","title":"Object-Centric Multiple Object Tracking","summary":" Unsupervised object-centric learning methods allow the partitioning of scenes\ninto entities without additional localization information and are excellent\ncandidates for reducing the annotation burden of multiple-object tracking (MOT)\npipelines. Unfortunately, they lack two key properties: objects are often split\ninto parts and are not consistently tracked over time. In fact,\nstate-of-the-art models achieve pixel-level accuracy and temporal consistency\nby relying on supervised object detection with additional ID labels for the\nassociation through time. This paper proposes a video object-centric model for\nMOT. It consists of an index-merge module that adapts the object-centric slots\ninto detection outputs and an object memory module that builds complete object\nprototypes to handle occlusions. Benefited from object-centric learning, we\nonly require sparse detection labels (0%-6.25%) for object localization and\nfeature binding. Relying on our self-supervised\nExpectation-Maximization-inspired loss for object association, our approach\nrequires no ID labels. Our experiments significantly narrow the gap between the\nexisting object-centric model and the fully supervised state-of-the-art and\noutperform several unsupervised trackers.\n","authors":["Zixu Zhao","Jiaze Wang","Max Horn","Yizhuo Ding","Tong He","Zechen Bai","Dominik Zietlow","Carl-Johann Simon-Gabriel","Bing Shuai","Zhuowen Tu","Thomas Brox","Bernt Schiele","Yanwei Fu","Francesco Locatello","Zheng Zhang","Tianjun Xiao"],"pdf_url":"https://arxiv.org/pdf/2309.00233v2.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2212.11613v5","updated":"2023-09-05T07:16:52Z","published":"2022-12-22T11:17:57Z","title":"DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders","summary":" Image colorization is a challenging problem due to multi-modal uncertainty\nand high ill-posedness. Directly training a deep neural network usually leads\nto incorrect semantic colors and low color richness. While transformer-based\nmethods can deliver better results, they often rely on manually designed\npriors, suffer from poor generalization ability, and introduce color bleeding\neffects. To address these issues, we propose DDColor, an end-to-end method with\ndual decoders for image colorization. Our approach includes a pixel decoder and\na query-based color decoder. The former restores the spatial resolution of the\nimage, while the latter utilizes rich visual features to refine color queries,\nthus avoiding hand-crafted priors. Our two decoders work together to establish\ncorrelations between color and multi-scale semantic representations via\ncross-attention, significantly alleviating the color bleeding effect.\nAdditionally, a simple yet effective colorfulness loss is introduced to enhance\nthe color richness. Extensive experiments demonstrate that DDColor achieves\nsuperior performance to existing state-of-the-art works both quantitatively and\nqualitatively. 
The codes and models are publicly available at\nhttps://github.com/piddnad/DDColor.\n","authors":["Xiaoyang Kang","Tao Yang","Wenqi Ouyang","Peiran Ren","Lingzhi Li","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2212.11613v5.pdf","comment":"ICCV 2023; Code: https://github.com/piddnad/DDColor"},{"id":"http://arxiv.org/abs/2306.08528v3","updated":"2023-09-05T05:35:31Z","published":"2023-06-14T14:22:56Z","title":"Predict to Detect: Prediction-guided 3D Object Detection using\n Sequential Images","summary":" Recent camera-based 3D object detection methods have introduced sequential\nframes to improve the detection performance hoping that multiple frames would\nmitigate the large depth estimation error. Despite improved detection\nperformance, prior works rely on naive fusion methods (e.g., concatenation) or\nare limited to static scenes (e.g., temporal stereo), neglecting the importance\nof the motion cue of objects. These approaches do not fully exploit the\npotential of sequential images and show limited performance improvements. To\naddress this limitation, we propose a novel 3D object detection model, P2D\n(Predict to Detect), that integrates a prediction scheme into a detection\nframework to explicitly extract and leverage motion features. P2D predicts\nobject information in the current frame using solely past frames to learn\ntemporal motion features. We then introduce a novel temporal feature\naggregation method that attentively exploits Bird's-Eye-View (BEV) features\nbased on predicted object information, resulting in accurate 3D object\ndetection. Experimental results demonstrate that P2D improves mAP and NDS by\n3.0% and 3.7% compared to the sequential image-based baseline, illustrating\nthat incorporating a prediction scheme can significantly improve detection\naccuracy.\n","authors":["Sanmin Kim","Youngseok Kim","In-Jae Lee","Dongsuk Kum"],"pdf_url":"https://arxiv.org/pdf/2306.08528v3.pdf","comment":"ICCV 2023, Code: https://github.com/sanmin0312/P2D"},{"id":"http://arxiv.org/abs/2206.08316v2","updated":"2023-09-05T05:33:46Z","published":"2022-06-16T17:22:40Z","title":"Boosting the Adversarial Transferability of Surrogate Models with Dark\n Knowledge","summary":" Deep neural networks (DNNs) are vulnerable to adversarial examples. And, the\nadversarial examples have transferability, which means that an adversarial\nexample for a DNN model can fool another model with a non-trivial probability.\nThis gave birth to the transfer-based attack where the adversarial examples\ngenerated by a surrogate model are used to conduct black-box attacks. There are\nsome work on generating the adversarial examples from a given surrogate model\nwith better transferability. However, training a special surrogate model to\ngenerate adversarial examples with better transferability is relatively\nunder-explored. This paper proposes a method for training a surrogate model\nwith dark knowledge to boost the transferability of the adversarial examples\ngenerated by the surrogate model. This trained surrogate model is named dark\nsurrogate model (DSM). The proposed method for training a DSM consists of two\nkey components: a teacher model extracting dark knowledge, and the mixing\naugmentation skill enhancing dark knowledge of training data. 
We conducted\nextensive experiments to show that the proposed method can substantially\nimprove the adversarial transferability of surrogate models across different\narchitectures of surrogate models and optimizers for generating adversarial\nexamples, and it can be applied to other scenarios of transfer-based attack\nthat contain dark knowledge, like face verification. Our code is publicly\navailable at \\url{https://github.com/ydc123/Dark_Surrogate_Model}.\n","authors":["Dingcheng Yang","Zihao Xiao","Wenjian Yu"],"pdf_url":"https://arxiv.org/pdf/2206.08316v2.pdf","comment":"Accepted at 2023 International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2309.01961v1","updated":"2023-09-05T05:32:19Z","published":"2023-09-05T05:32:19Z","title":"NICE 2023 Zero-shot Image Captioning Challenge","summary":" In this report, we introduce NICE\nproject\\footnote{\\url{https://nice.lgresearch.ai/}} and share the results and\noutcomes of NICE challenge 2023. This project is designed to challenge the\ncomputer vision community to develop robust image captioning models that\nadvance the state-of-the-art both in terms of accuracy and fairness. Through\nthe challenge, the image captioning models were tested using a new evaluation\ndataset that includes a large variety of visual concepts from many domains.\nThere was no specific training data provided for the challenge, and therefore\nthe challenge entries were required to adapt to new types of image descriptions\nthat had not been seen during training. This report includes information on the\nnewly proposed NICE dataset, evaluation methods, challenge results, and\ntechnical details of top-ranking entries. We expect that the outcomes of the\nchallenge will contribute to the improvement of AI models on various\nvision-language tasks.\n","authors":["Taehoon Kim","Pyunghwan Ahn","Sangyun Kim","Sihaeng Lee","Mark Marsden","Alessandra Sala","Seung Hwan Kim","Honglak Lee","Kyounghoon Bae","Bohyung Han","Kyoung Mu Lee","Xiangyu Wu","Yi Gao","Hailiang Zhang","Yang Yang","Weili Guo","Jianfeng Lu","Youngtaek Oh","Jae Won Cho","Dong-jin Kim","In So Kweon","Junmo Kim","Wooyoung Kang","Won Young Jhoo","Byungseok Roh","Jonghwan Mun","Solgil Oh","Kenan Emir Ak","Gwang-Gook Lee","Yan Xu","Mingwei Shen","Kyomin Hwang","Wonsik Shin","Kamin Lee","Wonhark Park","Dongkwan Lee","Nojun Kwak","Yujin Wang","Yimu Wang","Tiancheng Gu","Xingchang Lv","Mingmao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01961v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2309.01958v1","updated":"2023-09-05T05:20:11Z","published":"2023-09-05T05:20:11Z","title":"Empowering Low-Light Image Enhancer through Customized Learnable Priors","summary":" Deep neural networks have achieved remarkable progress in enhancing low-light\nimages by improving their brightness and eliminating noise. However, most\nexisting methods construct end-to-end mapping networks heuristically,\nneglecting the intrinsic prior of image enhancement task and lacking\ntransparency and interpretability. Although some unfolding solutions have been\nproposed to relieve these issues, they rely on proximal operator networks that\ndeliver ambiguous and implicit priors. In this work, we propose a paradigm for\nlow-light image enhancement that explores the potential of customized learnable\npriors to improve the transparency of the deep unfolding paradigm. 
Motivated by\nthe powerful feature representation capability of Masked Autoencoder (MAE), we\ncustomize MAE-based illumination and noise priors and redevelop them from two\nperspectives: 1) \\textbf{structure flow}: we train the MAE from a normal-light\nimage to its illumination properties and then embed it into the proximal\noperator design of the unfolding architecture; and 2) \\textbf{optimization\nflow}: we train MAE from a normal-light image to its gradient representation\nand then employ it as a regularization term to constrain noise in the model\noutput. These designs improve the interpretability and representation\ncapability of the model. Extensive experiments on multiple low-light image\nenhancement datasets demonstrate the superiority of our proposed paradigm over\nstate-of-the-art methods. Code is available at\nhttps://github.com/zheng980629/CUE.\n","authors":["Naishan Zheng","Man Zhou","Yanmeng Dong","Xiangyu Rui","Jie Huang","Chongyi Li","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.01958v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.15791v3","updated":"2023-09-05T05:17:42Z","published":"2023-08-30T06:49:34Z","title":"Neural Video Compression with Temporal Layer-Adaptive Hierarchical\n B-frame Coding","summary":" Neural video compression (NVC) is a rapidly evolving video coding research\narea, with some models achieving superior coding efficiency compared to the\nlatest video coding standard Versatile Video Coding (VVC). In conventional\nvideo coding standards, the hierarchical B-frame coding, which utilizes a\nbidirectional prediction structure for higher compression, had been\nwell-studied and exploited. In NVC, however, limited research has investigated\nthe hierarchical B scheme. In this paper, we propose an NVC model exploiting\nhierarchical B-frame coding with temporal layer-adaptive optimization. We first\nextend an existing unidirectional NVC model to a bidirectional model, which\nachieves -21.13% BD-rate gain over the unidirectional baseline model. However,\nthis model faces challenges when applied to sequences with complex or large\nmotions, leading to performance degradation. To address this, we introduce\ntemporal layer-adaptive optimization, incorporating methods such as temporal\nlayer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent\nscaling (TALS). The final model with the proposed methods achieves an\nimpressive BD-rate gain of -39.86% against the baseline. It also resolves the\nchallenges in sequences with large or complex motions with up to -49.13% more\nBD-rate gains than the simple bidirectional extension. This improvement is\nattributed to the allocation of more bits to lower temporal layers, thereby\nenhancing overall reconstruction quality with smaller bits. Since our method\nhas little dependency on a specific NVC model architecture, it can serve as a\ngeneral tool for extending unidirectional NVC models to the ones with\nhierarchical B-frame coding.\n","authors":["Yeongwoong Kim","Suyong Bahk","Seungeon Kim","Won Hee Lee","Dokwan Oh","Hui Yong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.15791v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01950v1","updated":"2023-09-05T04:56:18Z","published":"2023-09-05T04:56:18Z","title":"RADIO: Reference-Agnostic Dubbing Video Synthesis","summary":" One of the most challenging problems in audio-driven talking head generation\nis achieving high-fidelity detail while ensuring precise synchronization. 
Given\nonly a single reference image, extracting meaningful identity attributes\nbecomes even more challenging, often causing the network to mirror the facial\nand lip structures too closely. To address these issues, we introduce RADIO, a\nframework engineered to yield high-quality dubbed videos regardless of the pose\nor expression in reference images. The key is to modulate the decoder layers\nusing latent space composed of audio and reference features. Additionally, we\nincorporate ViT blocks into the decoder to emphasize high-fidelity details,\nespecially in the lip region. Our experimental results demonstrate that RADIO\ndisplays high synchronization without the loss of fidelity. Especially in harsh\nscenarios where the reference frame deviates significantly from the ground\ntruth, our method outperforms state-of-the-art methods, highlighting its\nrobustness. Pre-trained model and codes will be made public after the review.\n","authors":["Dongyeun Lee","Chaewon Kim","Sangjoon Yu","Jaejun Yoo","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2309.01950v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.01949v1","updated":"2023-09-05T04:55:10Z","published":"2023-09-05T04:55:10Z","title":"Efficient Bayesian Computational Imaging with a Surrogate Score-Based\n Prior","summary":" We propose a surrogate function for efficient use of score-based priors for\nBayesian inverse imaging. Recent work turned score-based diffusion models into\nprobabilistic priors for solving ill-posed imaging problems by appealing to an\nODE-based log-probability function. However, evaluating this function is\ncomputationally inefficient and inhibits posterior estimation of\nhigh-dimensional images. Our proposed surrogate prior is based on the evidence\nlower-bound of a score-based diffusion model. We demonstrate the surrogate\nprior on variational inference for efficient approximate posterior sampling of\nlarge images. Compared to the exact prior in previous work, our surrogate prior\naccelerates optimization of the variational image distribution by at least two\norders of magnitude. We also find that our principled approach achieves\nhigher-fidelity images than non-Bayesian baselines that involve\nhyperparameter-tuning at inference. Our work establishes a practical path\nforward for using score-based diffusion models as general-purpose priors for\nimaging.\n","authors":["Berthy T. Feng","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2309.01949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00265v2","updated":"2023-09-05T04:34:45Z","published":"2023-09-01T05:50:47Z","title":"Application of Machine Learning in Melanoma Detection and the\n Identification of 'Ugly Duckling' and Suspicious Naevi: A Review","summary":" Skin lesions known as naevi exhibit diverse characteristics such as size,\nshape, and colouration. The concept of an \"Ugly Duckling Naevus\" comes into\nplay when monitoring for melanoma, referring to a lesion with distinctive\nfeatures that sets it apart from other lesions in the vicinity. As lesions\nwithin the same individual typically share similarities and follow a\npredictable pattern, an ugly duckling naevus stands out as unusual and may\nindicate the presence of a cancerous melanoma. Computer-aided diagnosis (CAD)\nhas become a significant player in the research and development field, as it\ncombines machine learning techniques with a variety of patient analysis\nmethods. 
Its aim is to increase accuracy and simplify decision-making, all\nwhile responding to the shortage of specialized professionals. These automated\nsystems are especially important in skin cancer diagnosis where specialist\navailability is limited. As a result, their use could lead to life-saving\nbenefits and cost reductions within healthcare. Given the drastic change in\nsurvival when comparing early stage to late-stage melanoma, early detection is\nvital for effective treatment and patient outcomes. Machine learning (ML) and\ndeep learning (DL) techniques have gained popularity in skin cancer\nclassification, effectively addressing challenges, and providing results\nequivalent to that of specialists. This article extensively covers modern\nMachine Learning and Deep Learning algorithms for detecting melanoma and\nsuspicious naevi. It begins with general information on skin cancer and\ndifferent types of naevi, then introduces AI, ML, DL, and CAD. The article then\ndiscusses the successful applications of various ML techniques like\nconvolutional neural networks (CNN) for melanoma detection compared to\ndermatologists' performance. Lastly, it examines ML methods for UD naevus\ndetection and identifying suspicious naevi.\n","authors":["Fatima Al Zegair","Nathasha Naranpanawa","Brigid Betz-Stablein","Monika Janda","H. Peter Soyer","Shekhar S. Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.00265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01661v2","updated":"2023-09-05T04:20:07Z","published":"2023-01-04T15:12:57Z","title":"RecRecNet: Rectangling Rectified Wide-Angle Images by Thin-Plate Spline\n Model and DoF-based Curriculum Learning","summary":" The wide-angle lens shows appealing applications in VR technologies, but it\nintroduces severe radial distortion into its captured image. To recover the\nrealistic scene, previous works devote to rectifying the content of the\nwide-angle image. However, such a rectification solution inevitably distorts\nthe image boundary, which changes related geometric distributions and misleads\nthe current vision perception models. In this work, we explore constructing a\nwin-win representation on both content and boundary by contributing a new\nlearning model, i.e., Rectangling Rectification Network (RecRecNet). In\nparticular, we propose a thin-plate spline (TPS) module to formulate the\nnon-linear and non-rigid transformation for rectangling images. By learning the\ncontrol points on the rectified image, our model can flexibly warp the source\nstructure to the target domain and achieves an end-to-end unsupervised\ndeformation. To relieve the complexity of structure approximation, we then\ninspire our RecRecNet to learn the gradual deformation rules with a DoF (Degree\nof Freedom)-based curriculum learning. By increasing the DoF in each curriculum\nstage, namely, from similarity transformation (4-DoF) to homography\ntransformation (8-DoF), the network is capable of investigating more detailed\ndeformations, offering fast convergence on the final rectangling task.\nExperiments show the superiority of our solution over the compared methods on\nboth quantitative and qualitative evaluations. 
The code and dataset are\navailable at https://github.com/KangLiao929/RecRecNet.\n","authors":["Kang Liao","Lang Nie","Chunyu Lin","Zishuo Zheng","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2301.01661v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.01943v1","updated":"2023-09-05T04:18:03Z","published":"2023-09-05T04:18:03Z","title":"Extract-and-Adaptation Network for 3D Interacting Hand Mesh Recovery","summary":" Understanding how two hands interact with each other is a key component of\naccurate 3D interacting hand mesh recovery. However, recent Transformer-based\nmethods struggle to learn the interaction between two hands as they directly\nutilize two hand features as input tokens, which results in distant token\nproblem. The distant token problem represents that input tokens are in\nheterogeneous spaces, leading Transformer to fail in capturing correlation\nbetween input tokens. Previous Transformer-based methods suffer from the\nproblem especially when poses of two hands are very different as they project\nfeatures from a backbone to separate left and right hand-dedicated features. We\npresent EANet, extract-and-adaptation network, with EABlock, the main component\nof our network. Rather than directly utilizing two hand features as input\ntokens, our EABlock utilizes two complementary types of novel tokens, SimToken\nand JoinToken, as input tokens. Our two novel tokens are from a combination of\nseparated two hand features; hence, it is much more robust to the distant token\nproblem. Using the two type of tokens, our EABlock effectively extracts\ninteraction feature and adapts it to each hand. The proposed EANet achieves the\nstate-of-the-art performance on 3D interacting hands benchmarks. The codes are\navailable at https://github.com/jkpark0825/EANet.\n","authors":["JoonKyu Park","Daniel Sungho Jung","Gyeongsik Moon","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2309.01943v1.pdf","comment":"Accepted at ICCVW 2023"},{"id":"http://arxiv.org/abs/2307.16834v2","updated":"2023-09-05T03:51:50Z","published":"2023-07-28T17:16:57Z","title":"Benchmarking Jetson Edge Devices with an End-to-end Video-based Anomaly\n Detection System","summary":" Innovative enhancement in embedded system platforms, specifically hardware\naccelerations, significantly influence the application of deep learning in\nreal-world scenarios. These innovations translate human labor efforts into\nautomated intelligent systems employed in various areas such as autonomous\ndriving, robotics, Internet-of-Things (IoT), and numerous other impactful\napplications. NVIDIA's Jetson platform is one of the pioneers in offering\noptimal performance regarding energy efficiency and throughput in the execution\nof deep learning algorithms. Previously, most benchmarking analysis was based\non 2D images with a single deep learning model for each comparison result. In\nthis paper, we implement an end-to-end video-based crime-scene anomaly\ndetection system inputting from surveillance videos and the system is deployed\nand completely operates on multiple Jetson edge devices (Nano, AGX Xavier, Orin\nNano). The comparison analysis includes the integration of Torch-TensorRT as a\nsoftware developer kit from NVIDIA for the model performance optimisation. The\nsystem is built based on the PySlowfast open-source project from Facebook as\nthe coding template. The end-to-end system process comprises the videos from\ncamera, data preprocessing pipeline, feature extractor and the anomaly\ndetection. 
We provide the experience of an AI-based system deployment on\nvarious Jetson Edge devices with Docker technology. Regarding anomaly\ndetectors, a weakly supervised video-based deep learning model called Robust\nTemporal Feature Magnitude Learning (RTFM) is applied in the system. The\napproach system reaches 47.56 frames per second (FPS) inference speed on a\nJetson edge device with only 3.11 GB RAM usage total. We also discover the\npromising Jetson device that the AI system achieves 15% better performance than\nthe previous version of Jetson devices while consuming 50% less energy power.\n","authors":["Hoang Viet Pham","Thinh Gia Tran","Chuong Dinh Le","An Dinh Le","Hien Bich Vo"],"pdf_url":"https://arxiv.org/pdf/2307.16834v2.pdf","comment":"18 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2303.13396v3","updated":"2023-09-05T03:50:24Z","published":"2023-03-23T16:15:07Z","title":"Zero-guidance Segmentation Using Zero Segment Labels","summary":" CLIP has enabled new and exciting joint vision-language applications, one of\nwhich is open-vocabulary segmentation, which can locate any segment given an\narbitrary text query. In our research, we ask whether it is possible to\ndiscover semantic segments without any user guidance in the form of text\nqueries or predefined classes, and label them using natural language\nautomatically? We propose a novel problem zero-guidance segmentation and the\nfirst baseline that leverages two pre-trained generalist models, DINO and CLIP,\nto solve this problem without any fine-tuning or segmentation dataset. The\ngeneral idea is to first segment an image into small over-segments, encode them\ninto CLIP's visual-language space, translate them into text labels, and merge\nsemantically similar segments together. The key challenge, however, is how to\nencode a visual segment into a segment-specific embedding that balances global\nand local context information, both useful for recognition. Our main\ncontribution is a novel attention-masking technique that balances the two\ncontexts by analyzing the attention layers inside CLIP. We also introduce\nseveral metrics for the evaluation of this new task. With CLIP's innate\nknowledge, our method can precisely locate the Mona Lisa painting among a\nmuseum crowd. Project page: https://zero-guide-seg.github.io/.\n","authors":["Pitchaporn Rewatbowornwong","Nattanat Chatthee","Ekapol Chuangsuwanich","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2303.13396v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05967v2","updated":"2023-09-05T03:44:01Z","published":"2023-08-11T06:54:55Z","title":"YOLOrtho -- A Unified Framework for Teeth Enumeration and Dental Disease\n Detection","summary":" Detecting dental diseases through panoramic X-rays images is a standard\nprocedure for dentists. Normally, a dentist need to identify diseases and find\nthe infected teeth. While numerous machine learning models adopting this\ntwo-step procedure have been developed, there has not been an end-to-end model\nthat can identify teeth and their associated diseases at the same time. To fill\nthe gap, we develop YOLOrtho, a unified framework for teeth enumeration and\ndental disease detection. We develop our model on Dentex Challenge 2023 data,\nwhich consists of three distinct types of annotated data. The first part is\nlabeled with quadrant, and the second part is labeled with quadrant and\nenumeration and the third part is labeled with quadrant, enumeration and\ndisease. 
To further improve detection, we make use of Tufts Dental public\ndataset. To fully utilize the data and learn both teeth detection and disease\nidentification simultaneously, we formulate diseases as attributes attached to\ntheir corresponding teeth. Due to the nature of position relation in teeth\nenumeration, We replace convolution layer with CoordConv in our model to\nprovide more position information for the model. We also adjust the model\narchitecture and insert one more upsampling layer in FPN in favor of large\nobject detection. Finally, we propose a post-process strategy for teeth layout\nthat corrects teeth enumeration based on linear sum assignment. Results from\nexperiments show that our model exceeds large Diffusion-based model.\n","authors":["Shenxiao Mei","Chenglong Ma","Feihong Shen","Huikai Wu"],"pdf_url":"https://arxiv.org/pdf/2308.05967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14730v3","updated":"2023-09-05T03:38:17Z","published":"2023-05-24T05:06:59Z","title":"BinaryViT: Towards Efficient and Accurate Binary Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as the fundamental architecture for\nmost computer vision fields, but the considerable memory and computation costs\nhinders their application on resource-limited devices. As one of the most\npowerful compression methods, binarization reduces the computation of the\nneural network by quantizing the weights and activation values as $\\pm$1.\nAlthough existing binarization methods have demonstrated excellent performance\non Convolutional Neural Networks (CNNs), the full binarization of ViTs is still\nunder-studied and suffering a significant performance drop. In this paper, we\nfirst argue empirically that the severe performance degradation is mainly\ncaused by the weight oscillation in the binarization training and the\ninformation distortion in the activation of ViTs. Based on these analyses, we\npropose $\\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs,\nwhich pushes the quantization of ViTs to the limit. Specifically, we propose a\nnovel gradient regularization scheme (GRS) for driving a bimodal distribution\nof the weights to reduce oscillation in binarization training. Moreover, we\ndesign an activation shift module (ASM) to adaptively tune the activation\ndistribution to reduce the information distortion caused by binarization.\nExtensive experiments on ImageNet dataset show that our BinaryViT consistently\nsurpasses the strong baseline by 2.05% and improve the accuracy of fully\nbinarized ViTs to a usable level. Furthermore, our method achieves impressive\nsavings of 16.2$\\times$ and 17.7$\\times$ in model size and OPs compared to the\nfull-precision DeiT-S.\n","authors":["Junrui Xiao","Zhikai Li","Lianwei Yang","Qingyi Gu"],"pdf_url":"https://arxiv.org/pdf/2305.14730v3.pdf","comment":"We will be making some significant changes to the paper, including\n the title and methodology. We therefore wish to withdraw the paper for now"},{"id":"http://arxiv.org/abs/2309.01925v1","updated":"2023-09-05T03:24:09Z","published":"2023-09-05T03:24:09Z","title":"DR-Pose: A Two-stage Deformation-and-Registration Pipeline for\n Category-level 6D Object Pose Estimation","summary":" Category-level object pose estimation involves estimating the 6D pose and the\n3D metric size of objects from predetermined categories. 
While recent\napproaches take categorical shape prior information as reference to improve\npose estimation accuracy, the single-stage network design and training manner\nlead to sub-optimal performance since there are two distinct tasks in the\npipeline. In this paper, the advantage of two-stage pipeline over single-stage\ndesign is discussed. To this end, we propose a two-stage deformation-and\nregistration pipeline called DR-Pose, which consists of completion-aided\ndeformation stage and scaled registration stage. The first stage uses a point\ncloud completion method to generate unseen parts of target object, guiding\nsubsequent deformation on the shape prior. In the second stage, a novel\nregistration network is designed to extract pose-sensitive features and predict\nthe representation of object partial point cloud in canonical space based on\nthe deformation results from the first stage. DR-Pose produces superior results\nto the state-of-the-art shape prior-based methods on both CAMERA25 and REAL275\nbenchmarks. Codes are available at https://github.com/Zray26/DR-Pose.git.\n","authors":["Lei Zhou","Zhiyang Liu","Runze Gan","Haozhe Wang","Marcelo H. Ang Jr"],"pdf_url":"https://arxiv.org/pdf/2309.01925v1.pdf","comment":"Camera-ready version accepted to IROS 2023"},{"id":"http://arxiv.org/abs/2308.08871v2","updated":"2023-09-05T03:21:26Z","published":"2023-08-17T09:04:44Z","title":"Spatially and Spectrally Consistent Deep Functional Maps","summary":" Cycle consistency has long been exploited as a powerful prior for jointly\noptimizing maps within a collection of shapes. In this paper, we investigate\nits utility in the approaches of Deep Functional Maps, which are considered\nstate-of-the-art in non-rigid shape matching. We first justify that under\ncertain conditions, the learned maps, when represented in the spectral domain,\nare already cycle consistent. Furthermore, we identify the discrepancy that\nspectrally consistent maps are not necessarily spatially, or point-wise,\nconsistent. In light of this, we present a novel design of unsupervised Deep\nFunctional Maps, which effectively enforces the harmony of learned maps under\nthe spectral and the point-wise representation. By taking advantage of cycle\nconsistency, our framework produces state-of-the-art results in mapping shapes\neven under significant distortions. Beyond that, by independently estimating\nmaps in both spectral and spatial domains, our method naturally alleviates\nover-fitting in network training, yielding superior generalization performance\nand accuracy within an array of challenging tests for both near-isometric and\nnon-isometric datasets. Codes are available at\nhttps://github.com/rqhuang88/Spatiallyand-Spectrally-Consistent-Deep-Functional-Maps.\n","authors":["Mingze Sun","Shiwei Mao","Puhua Jiang","Maks Ovsjanikov","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2308.08871v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2309.01921v1","updated":"2023-09-05T03:20:41Z","published":"2023-09-05T03:20:41Z","title":"Causal Scoring Medical Image Explanations: A Case Study On Ex-vivo\n Kidney Stone Images","summary":" On the promise that if human users know the cause of an output, it would\nenable them to grasp the process responsible for the output, and hence provide\nunderstanding, many explainable methods have been proposed to indicate the\ncause for the output of a model based on its input. 
Nonetheless, little has\nbeen reported on quantitative measurements of such causal relationships between\nthe inputs, the explanations, and the outputs of a model, leaving the\nassessment to the user, independent of his level of expertise in the subject.\nTo address this situation, we explore a technique for measuring the causal\nrelationship between the features from the area of the object of interest in\nthe images of a class and the output of a classifier. Our experiments indicate\nimprovement in the causal relationships measured when the area of the object of\ninterest per class is indicated by a mask from an explainable method than when\nit is indicated by human annotators. Hence the chosen name of Causal\nExplanation Score (CaES).\n","authors":["Armando Villegas-Jimenez","Daniel Flores-Araiza","Francisco Lopez-Tiro","Gilberto Ochoa-Ruiz","Christian Daul"],"pdf_url":"https://arxiv.org/pdf/2309.01921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02463v3","updated":"2023-09-05T03:11:11Z","published":"2023-08-04T17:00:38Z","title":"Towards Generalist Foundation Model for Radiology","summary":" In this study, we aim to initiate the development of Radiology Foundation\nModel, termed as RadFM. We consider the construction of foundational models from\nthe perspectives of dataset construction, model design, and thorough\nevaluation. Our contribution can be concluded as follows: (i), we construct a\nlarge-scale Medical Multi-modal Dataset, MedMD, which consists of 16M 2D and 3D\nmedical scans with high-quality text descriptions or reports across various\ndata formats, modalities, and tasks, covering over 5000 distinct diseases. To\nthe best of our knowledge, this is the first large-scale, high-quality, medical\nvisual-language dataset, with both 2D and 3D scans; (ii), we propose an\narchitecture that enables visually conditioned generative pre-training, i.e.,\nallowing for integration of text input with 2D or 3D medical scans, and\ngenerate responses for diverse radiologic tasks. The model was initially\npre-trained on MedMD and subsequently fine-tuned on the domain-specific\ndataset, which is a radiologic cleaned version of MedMD, containing 3M\nradiologic visual-language pairs, termed as RadMD; (iii), we propose a new\nevaluation benchmark, RadBench, that comprises five tasks, including modality\nrecognition, disease diagnosis, visual question answering, report generation\nand rationale diagnosis, aiming to comprehensively assess the capability of\nfoundation models in handling practical clinical problems. We conduct both\nautomatic and human evaluation on RadBench, in both cases, RadFM significantly\noutperforms existing multi-modal foundation models. The codes, data, and model\ncheckpoint will all be made publicly available to promote further research and\ndevelopment in the field.\n","authors":["Chaoyi Wu","Xiaoman Zhang","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2308.02463v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10143v2","updated":"2023-09-05T02:52:36Z","published":"2023-05-17T11:56:40Z","title":"An Empirical Study on the Language Modal in Visual Question Answering","summary":" Generalization beyond in-domain experience to out-of-distribution data is of\nparamount significance in the AI domain. Of late, state-of-the-art Visual\nQuestion Answering (VQA) models have shown impressive performance on in-domain\ndata, partially due to the language priors bias which, however, hinders the\ngeneralization ability in practice. 
This paper attempts to provide new insights\ninto the influence of language modality on VQA performance from an empirical\nstudy perspective. To achieve this, we conducted a series of experiments on six\nmodels. The results of these experiments revealed that, 1) apart from prior\nbias caused by question types, there is a notable influence of postfix-related\nbias in inducing biases, and 2) training VQA models with word-sequence-related\nvariant questions demonstrated improved performance on the out-of-distribution\nbenchmark, and the LXMERT even achieved a 10-point gain without adopting any\ndebiasing methods. We delved into the underlying reasons behind these\nexperimental results and put forward some simple proposals to reduce the\nmodels' dependency on language priors. The experimental results demonstrated\nthe effectiveness of our proposed method in improving performance on the\nout-of-distribution benchmark, VQA-CPv2. We hope this study can inspire novel\ninsights for future research on designing bias-reduction approaches.\n","authors":["Daowan Peng","Wei Wei","Xian-Ling Mao","Yuanyuan Fu","Dangyang Chen"],"pdf_url":"https://arxiv.org/pdf/2305.10143v2.pdf","comment":"Accepted by IJCAI2023"},{"id":"http://arxiv.org/abs/2309.01907v1","updated":"2023-09-05T02:42:41Z","published":"2023-09-05T02:42:41Z","title":"SyntheWorld: A Large-Scale Synthetic Dataset for Land Cover Mapping and\n Building Change Detection","summary":" Synthetic datasets, recognized for their cost effectiveness, play a pivotal\nrole in advancing computer vision tasks and techniques. However, when it comes\nto remote sensing image processing, the creation of synthetic datasets becomes\nchallenging due to the demand for larger-scale and more diverse 3D models. This\ncomplexity is compounded by the difficulties associated with real remote\nsensing datasets, including limited data acquisition and high annotation costs,\nwhich amplifies the need for high-quality synthetic alternatives. To address\nthis, we present SyntheWorld, a synthetic dataset unparalleled in quality,\ndiversity, and scale. It includes 40,000 images with submeter-level pixels and\nfine-grained land cover annotations of eight categories, and it also provides\n40,000 pairs of bitemporal image pairs with building change annotations for\nbuilding change detection task. We conduct experiments on multiple benchmark\nremote sensing datasets to verify the effectiveness of SyntheWorld and to\ninvestigate the conditions under which our synthetic data yield advantages. We\nwill release SyntheWorld to facilitate remote sensing image processing\nresearch.\n","authors":["Jian Song","Hongruixuan Chen","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2309.01907v1.pdf","comment":"Accepted by WACV 2024"},{"id":"http://arxiv.org/abs/2309.01904v1","updated":"2023-09-05T02:31:04Z","published":"2023-09-05T02:31:04Z","title":"Improving Drone Imagery For Computer Vision/Machine Learning in\n Wilderness Search and Rescue","summary":" This paper describes gaps in acquisition of drone imagery that impair the use\nwith computer vision/machine learning (CV/ML) models and makes five\nrecommendations to maximize image suitability for CV/ML post-processing. It\ndescribes a notional work process for the use of drones in wilderness search\nand rescue incidents. The large volume of data from the wide area search phase\noffers the greatest opportunity for CV/ML techniques because of the large\nnumber of images that would otherwise have to be manually inspected. 
The 2023\nWu-Murad search in Japan, one of the largest missing person searches conducted\nin that area, serves as a case study. Although drone teams conducting wide area\nsearches may not know in advance if the data they collect is going to be used\nfor CV/ML post-processing, there are data collection procedures that can\nimprove the search in general with automated collection software. If the drone\nteams do expect to use CV/ML, then they can exploit knowledge about the model\nto further optimize flights.\n","authors":["Robin Murphy","Thomas Manzini"],"pdf_url":"https://arxiv.org/pdf/2309.01904v1.pdf","comment":"6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.01903v1","updated":"2023-09-05T02:26:42Z","published":"2023-09-05T02:26:42Z","title":"Towards Robust Plant Disease Diagnosis with Hard-sample Re-mining\n Strategy","summary":" With rich annotation information, object detection-based automated plant\ndisease diagnosis systems (e.g., YOLO-based systems) often provide advantages\nover classification-based systems (e.g., EfficientNet-based), such as the\nability to detect disease locations and superior classification performance.\nOne drawback of these detection systems is dealing with unannotated healthy\ndata with no real symptoms present. In practice, healthy plant data appear to\nbe very similar to many disease data. Thus, those models often produce\nmis-detected boxes on healthy images. In addition, labeling new data for\ndetection models is typically time-consuming. Hard-sample mining (HSM) is a\ncommon technique for re-training a model by using the mis-detected boxes as new\ntraining samples. However, blindly selecting an arbitrary amount of hard-sample\nfor re-training will result in the degradation of diagnostic performance for\nother diseases due to the high similarity between disease and healthy data. In\nthis paper, we propose a simple but effective training strategy called\nhard-sample re-mining (HSReM), which is designed to enhance the diagnostic\nperformance of healthy data and simultaneously improve the performance of\ndisease data by strategically selecting hard-sample training images at an\nappropriate level. Experiments based on two practical in-field eight-class\ncucumber and ten-class tomato datasets (42.7K and 35.6K images) show that our\nHSReM training strategy leads to a substantial improvement in the overall\ndiagnostic performance on large-scale unseen data. Specifically, the object\ndetection model trained using the HSReM strategy not only achieved superior\nresults as compared to the classification-based state-of-the-art\nEfficientNetV2-Large model and the original object detection model, but also\noutperformed the model using the HSM strategy.\n","authors":["Quan Huu Cap","Atsushi Fukuda","Satoshi Kagiwada","Hiroyuki Uga","Nobusuke Iwasaki","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2309.01903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09221v3","updated":"2023-09-05T02:17:19Z","published":"2023-02-18T03:42:31Z","title":"Moby: Empowering 2D Models for Efficient Point Cloud Analytics on the\n Edge","summary":" 3D object detection plays a pivotal role in many applications, most notably\nautonomous driving and robotics. These applications are commonly deployed on\nedge devices to promptly interact with the environment, and often require near\nreal-time response. With limited computation power, it is challenging to\nexecute 3D detection on the edge using highly complex neural networks. 
Common\napproaches such as offloading to the cloud induce significant latency overheads\ndue to the large amount of point cloud data during transmission. To resolve the\ntension between wimpy edge devices and compute-intensive inference workloads,\nwe explore the possibility of empowering fast 2D detection to extrapolate 3D\nbounding boxes. To this end, we present Moby, a novel system that demonstrates\nthe feasibility and potential of our approach. We design a transformation\npipeline for Moby that generates 3D bounding boxes efficiently and accurately\nbased on 2D detection results without running 3D detectors. Further, we devise\na frame offloading scheduler that decides when to launch the 3D detector\njudiciously in the cloud to avoid the errors from accumulating. Extensive\nevaluations on NVIDIA Jetson TX2 with real-world autonomous driving datasets\ndemonstrate that Moby offers up to 91.9% latency improvement with modest\naccuracy loss over state of the art.\n","authors":["Jingzong Li","Yik Hong Cai","Libin Liu","Yu Mao","Chun Jason Xue","Hong Xu"],"pdf_url":"https://arxiv.org/pdf/2302.09221v3.pdf","comment":"Accepted to ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2309.01899v1","updated":"2023-09-05T02:15:51Z","published":"2023-09-05T02:15:51Z","title":"Unsupervised Skin Lesion Segmentation via Structural Entropy\n Minimization on Multi-Scale Superpixel Graphs","summary":" Skin lesion segmentation is a fundamental task in dermoscopic image analysis.\nThe complex features of pixels in the lesion region impede the lesion\nsegmentation accuracy, and existing deep learning-based methods often lack\ninterpretability to this problem. In this work, we propose a novel unsupervised\nSkin Lesion sEgmentation framework based on structural entropy and isolation\nforest outlier Detection, namely SLED. Specifically, skin lesions are segmented\nby minimizing the structural entropy of a superpixel graph constructed from the\ndermoscopic image. Then, we characterize the consistency of healthy skin\nfeatures and devise a novel multi-scale segmentation mechanism by outlier\ndetection, which enhances the segmentation accuracy by leveraging the\nsuperpixel features from multiple scales. We conduct experiments on four skin\nlesion benchmarks and compare SLED with nine representative unsupervised\nsegmentation methods. Experimental results demonstrate the superiority of the\nproposed framework. Additionally, some case studies are analyzed to demonstrate\nthe effectiveness of SLED.\n","authors":["Guangjie Zeng","Hao Peng","Angsheng Li","Zhiwei Liu","Chunyang Liu","Philip S. Yu","Lifang He"],"pdf_url":"https://arxiv.org/pdf/2309.01899v1.pdf","comment":"10 pages, 8 figures, conference. Accepted by IEEE ICDM 2023"},{"id":"http://arxiv.org/abs/2307.04129v2","updated":"2023-09-05T02:09:30Z","published":"2023-07-09T08:58:47Z","title":"Cross-modal Orthogonal High-rank Augmentation for RGB-Event\n Transformer-trackers","summary":" This paper addresses the problem of cross-modal object tracking from RGB\nvideos and event data. Rather than constructing a complex cross-modal fusion\nnetwork, we explore the great potential of a pre-trained vision Transformer\n(ViT). Particularly, we delicately investigate plug-and-play training\naugmentations that encourage the ViT to bridge the vast distribution gap\nbetween the two modalities, enabling comprehensive cross-modal information\ninteraction and thus enhancing its ability. 
Specifically, we propose a mask\nmodeling strategy that randomly masks a specific modality of some tokens to\nenforce proactive interaction between tokens from different\nmodalities. To mitigate network oscillations resulting from the masking\nstrategy and further amplify its positive effect, we then theoretically propose\nan orthogonal high-rank loss to regularize the attention matrix. Extensive\nexperiments demonstrate that our plug-and-play training augmentation techniques\ncan significantly boost state-of-the-art one-stream and two-stream trackers in\nterms of both tracking precision and success rate. Our new\nperspective and findings will potentially bring insights to the field of\nleveraging powerful pre-trained ViTs to model cross-modal data. The code will\nbe publicly available.\n","authors":["Zhiyu Zhu","Junhui Hou","Dapeng Oliver Wu"],"pdf_url":"https://arxiv.org/pdf/2307.04129v2.pdf","comment":"accepted by ICCV"},{"id":"http://arxiv.org/abs/2305.06221v3","updated":"2023-09-05T01:56:58Z","published":"2023-05-10T14:54:29Z","title":"Multi-Prompt with Depth Partitioned Cross-Modal Learning","summary":" In recent years, soft prompt learning methods have been proposed to fine-tune\nlarge-scale vision-language pre-trained models for various downstream tasks.\nThese methods typically combine learnable textual tokens with class tokens as\ninput for models with frozen parameters. However, they often employ a single\nprompt to describe class contexts, failing to capture categories' diverse\nattributes adequately. This study introduces the Partitioned Multi-modal Prompt\n(PMPO), a multi-modal prompting technique that extends the soft prompt from a\nsingle learnable prompt to multiple prompts. Our method divides the visual\nencoder depths and connects learnable prompts to the separated visual depths,\nenabling different prompts to capture the hierarchical contextual depths of\nvisual representations. Furthermore, to maximize the advantages of multi-prompt\nlearning, we incorporate prior information from manually designed templates and\nlearnable multi-prompts, thus improving the generalization capabilities of our\napproach. We evaluate the effectiveness of our approach on three challenging\ntasks: new class generalization, cross-dataset evaluation, and domain\ngeneralization. For instance, our method achieves a $79.28$ harmonic mean,\naveraged over 11 diverse image recognition datasets ($+7.62$ compared to CoOp),\ndemonstrating significant competitiveness compared to state-of-the-art\nprompting methods.\n","authors":["Yingjie Tian","Yiqi Wang","Xianda Guo","Zheng Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2305.06221v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.08924v2","updated":"2023-09-05T01:15:42Z","published":"2023-07-18T01:53:18Z","title":"Learning to Sample Tasks for Meta Learning","summary":" Through experiments on various meta-learning methods, task samplers, and\nfew-shot learning tasks, this paper arrives at three conclusions. Firstly,\nthere are no universal task sampling strategies to guarantee the performance of\nmeta-learning models. Secondly, task diversity can cause the models to either\nunderfit or overfit during training. Lastly, the generalization performance of\nthe models is influenced by task divergence, task entropy, and task\ndifficulty. In response to these findings, we propose a novel task sampler\ncalled Adaptive Sampler (ASr). 
ASr is a plug-and-play task sampler that takes\ntask divergence, task entropy, and task difficulty to sample tasks. To optimize\nASr, we rethink and propose a simple and general meta-learning algorithm.\nFinally, a large number of empirical experiments demonstrate the effectiveness\nof the proposed ASr.\n","authors":["Jingyao Wang","Zeen Song","Xingzhe Su","Lingyu Si","Hongwei Dong","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08924v2.pdf","comment":"10 pages, 7 tables, 3 figures"},{"id":"http://arxiv.org/abs/2307.11342v2","updated":"2023-09-05T01:05:50Z","published":"2023-07-21T04:15:02Z","title":"Tuning Pre-trained Model via Moment Probing","summary":" Recently, efficient fine-tuning of large-scale pre-trained models has\nattracted increasing research interests, where linear probing (LP) as a\nfundamental module is involved in exploiting the final representations for\ntask-dependent classification. However, most of the existing methods focus on\nhow to effectively introduce a few of learnable parameters, and little work\npays attention to the commonly used LP module. In this paper, we propose a\nnovel Moment Probing (MP) method to further explore the potential of LP.\nDistinguished from LP which builds a linear classification head based on the\nmean of final features (e.g., word tokens for ViT) or classification tokens,\nour MP performs a linear classifier on feature distribution, which provides the\nstronger representation ability by exploiting richer statistical information\ninherent in features. Specifically, we represent feature distribution by its\ncharacteristic function, which is efficiently approximated by using first- and\nsecond-order moments of features. Furthermore, we propose a multi-head\nconvolutional cross-covariance (MHC$^3$) to compute second-order moments in an\nefficient and effective manner. By considering that MP could affect feature\nlearning, we introduce a partially shared module to learn two recalibrating\nparameters (PSRP) for backbones based on MP, namely MP$_{+}$. Extensive\nexperiments on ten benchmarks using various models show that our MP\nsignificantly outperforms LP and is competitive with counterparts at less\ntraining cost, while our MP$_{+}$ achieves state-of-the-art performance.\n","authors":["Mingze Gao","Qilong Wang","Zhenyi Lin","Pengfei Zhu","Qinghua Hu","Jingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.11342v2.pdf","comment":"Accepted to ICCV 2023; Project Page:\n https://github.com/mingzeG/Moment-Probing"},{"id":"http://arxiv.org/abs/2309.01875v1","updated":"2023-09-05T00:58:17Z","published":"2023-09-05T00:58:17Z","title":"Gradient Domain Diffusion Models for Image Synthesis","summary":" Diffusion models are getting popular in generative image and video synthesis.\nHowever, due to the diffusion process, they require a large number of steps to\nconverge. To tackle this issue, in this paper, we propose to perform the\ndiffusion process in the gradient domain, where the convergence becomes faster.\nThere are two reasons. First, thanks to the Poisson equation, the gradient\ndomain is mathematically equivalent to the original image domain. Therefore,\neach diffusion step in the image domain has a unique corresponding gradient\ndomain representation. Second, the gradient domain is much sparser than the\nimage domain. As a result, gradient domain diffusion models converge faster.\nSeveral numerical experiments confirm that the gradient domain diffusion models\nare more efficient than the original diffusion models. 
The proposed method can\nbe applied in a wide range of applications such as image processing, computer\nvision and machine learning tasks.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2309.01875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08600v2","updated":"2023-09-05T00:44:33Z","published":"2023-04-17T20:38:07Z","title":"RS2G: Data-Driven Scene-Graph Extraction and Embedding for Robust\n Autonomous Perception and Scenario Understanding","summary":" Effectively capturing intricate interactions among road users is of critical\nimportance to achieving safe navigation for autonomous vehicles. While graph\nlearning (GL) has emerged as a promising approach to tackle this challenge,\nexisting GL models rely on predefined domain-specific graph extraction rules\nthat often fail in real-world drastically changing scenarios. Additionally,\nthese graph extraction rules severely impede the capability of existing GL\nmethods to generalize knowledge across domains. To address this issue, we\npropose RoadScene2Graph (RS2G), an innovative autonomous scenario understanding\nframework with a novel data-driven graph extraction and modeling approach that\ndynamically captures the diverse relations among road users. Our evaluations\ndemonstrate that on average RS2G outperforms the state-of-the-art (SOTA)\nrule-based graph extraction method by 4.47% and the SOTA deep learning model by\n22.19% in subjective risk assessment. More importantly, RS2G delivers notably\nbetter performance in transferring knowledge gained from simulation\nenvironments to unseen real-world scenarios.\n","authors":["Junyao Wang","Arnav Vaibhav Malawade","Junhong Zhou","Shih-Yuan Yu","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2304.08600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02617v1","updated":"2023-09-05T23:33:39Z","published":"2023-09-05T23:33:39Z","title":"Compressing Vision Transformers for Low-Resource Visual Learning","summary":" Vision transformer (ViT) and its variants have swept through visual learning\nleaderboards and offer state-of-the-art accuracy in tasks such as image\nclassification, object detection, and semantic segmentation by attending to\ndifferent parts of the visual input and capturing long-range spatial\ndependencies. However, these models are large and computation-heavy. For\ninstance, the recently proposed ViT-B model has 86M parameters making it\nimpractical for deployment on resource-constrained devices. As a result, their\ndeployment on mobile and edge scenarios is limited. In our work, we aim to take\na step toward bringing vision transformers to the edge by utilizing popular\nmodel compression techniques such as distillation, pruning, and quantization.\n Our chosen application environment is an unmanned aerial vehicle (UAV) that\nis battery-powered and memory-constrained, carrying a single-board computer on\nthe scale of an NVIDIA Jetson Nano with 4GB of RAM. On the other hand, the UAV\nrequires high accuracy close to that of state-of-the-art ViTs to ensure safe\nobject avoidance in autonomous navigation, or correct localization of humans in\nsearch-and-rescue. Inference latency should also be minimized given the\napplication requirements. Hence, our target is to enable rapid inference of a\nvision transformer on an NVIDIA Jetson Nano (4GB) with minimal accuracy loss.\nThis allows us to deploy ViTs on resource-constrained devices, opening up new\npossibilities in surveillance, environmental monitoring, etc. 
Our\nimplementation is made available at https://github.com/chensy7/efficient-vit.\n","authors":["Eric Youn","Sai Mitheran J","Sanjana Prabhu","Siyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04869v2","updated":"2023-09-05T22:06:24Z","published":"2023-07-10T19:32:53Z","title":"Fed-CPrompt: Contrastive Prompt for Rehearsal-Free Federated Continual\n Learning","summary":" Federated continual learning (FCL) learns incremental tasks over time from\nconfidential datasets distributed across clients. This paper focuses on\nrehearsal-free FCL, which has severe forgetting issues when learning new tasks\ndue to the lack of access to historical task data. To address this issue, we\npropose Fed-CPrompt based on prompt learning techniques to obtain task-specific\nprompts in a communication-efficient way. Fed-CPrompt introduces two key\ncomponents, asynchronous prompt learning, and contrastive continual loss, to\nhandle asynchronous task arrival and heterogeneous data distributions in FCL,\nrespectively. Extensive experiments demonstrate the effectiveness of\nFed-CPrompt in achieving SOTA rehearsal-free FCL performance.\n","authors":["Gaurav Bagwe","Xiaoyong Yuan","Miao Pan","Lan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.04869v2.pdf","comment":"Accepted by FL-ICML 2023"},{"id":"http://arxiv.org/abs/2309.02596v1","updated":"2023-09-05T21:36:42Z","published":"2023-09-05T21:36:42Z","title":"Self-Supervised Pretraining Improves Performance and Inference\n Efficiency in Multiple Lung Ultrasound Interpretation Tasks","summary":" In this study, we investigated whether self-supervised pretraining could\nproduce a neural network feature extractor applicable to multiple\nclassification tasks in B-mode lung ultrasound analysis. When fine-tuning on\nthree lung ultrasound tasks, pretrained models resulted in an improvement of\nthe average across-task area under the receiver operating curve (AUC) by 0.032\nand 0.061 on local and external test sets respectively. Compact nonlinear\nclassifiers trained on features outputted by a single pretrained model did not\nimprove performance across all tasks; however, they did reduce inference time\nby 49% compared to serial execution of separate fine-tuned models. When\ntraining using 1% of the available labels, pretrained models consistently\noutperformed fully supervised models, with a maximum observed test AUC increase\nof 0.396 for the task of view classification. Overall, the results indicate\nthat self-supervised pretraining is useful for producing initial weights for\nlung ultrasound classifiers.\n","authors":["Blake VanBerlo","Brian Li","Jesse Hoey","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.02596v1.pdf","comment":"10 pages, 5 figures, submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2309.02591v1","updated":"2023-09-05T21:27:27Z","published":"2023-09-05T21:27:27Z","title":"Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction\n Tuning","summary":" We present CM3Leon (pronounced \"Chameleon\"), a retrieval-augmented,\ntoken-based, decoder-only multi-modal language model capable of generating and\ninfilling both text and images. CM3Leon uses the CM3 multi-modal architecture\nbut additionally shows the extreme benefits of scaling up and tuning on more\ndiverse instruction-style data. 
It is the first multi-modal model trained with\na recipe adapted from text-only language models, including a large-scale\nretrieval-augmented pre-training stage and a second multi-task supervised\nfine-tuning (SFT) stage. It is also a general-purpose model that can do both\ntext-to-image and image-to-text generation, allowing us to introduce\nself-contained contrastive decoding methods that produce high-quality outputs.\nExtensive experiments demonstrate that this recipe is highly effective for\nmulti-modal models. CM3Leon achieves state-of-the-art performance in\ntext-to-image generation with 5x less training compute than comparable methods\n(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate\nunprecedented levels of controllability in tasks ranging from language-guided\nimage editing to image-controlled generation and segmentation.\n","authors":["Lili Yu","Bowen Shi","Ramakanth Pasunuru","Benjamin Muller","Olga Golovneva","Tianlu Wang","Arun Babu","Binh Tang","Brian Karrer","Shelly Sheynin","Candace Ross","Adam Polyak","Russell Howes","Vasu Sharma","Puxin Xu","Hovhannes Tamoyan","Oron Ashual","Uriel Singer","Shang-Wen Li","Susan Zhang","Richard James","Gargi Ghosh","Yaniv Taigman","Maryam Fazel-Zarandi","Asli Celikyilmaz","Luke Zettlemoyer","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2309.02591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02578v1","updated":"2023-09-05T20:58:15Z","published":"2023-09-05T20:58:15Z","title":"Anatomy-Driven Pathology Detection on Chest X-rays","summary":" Pathology detection and delineation enables the automatic interpretation of\nmedical scans such as chest X-rays while providing a high level of\nexplainability to support radiologists in making informed decisions. However,\nannotating pathology bounding boxes is a time-consuming task such that large\npublic datasets for this purpose are scarce. Current approaches thus use weakly\nsupervised object detection to learn the (rough) localization of pathologies\nfrom image-level annotations, which is however limited in performance due to\nthe lack of bounding box supervision. We therefore propose anatomy-driven\npathology detection (ADPD), which uses easy-to-annotate bounding boxes of\nanatomical regions as proxies for pathologies. We study two training\napproaches: supervised training using anatomy-level pathology labels and\nmultiple instance learning (MIL) with image-level pathology labels. Our results\nshow that our anatomy-level training approach outperforms weakly supervised\nmethods and fully supervised detection with limited training samples, and our\nMIL approach is competitive with both baseline approaches, therefore\ndemonstrating the potential of our approach.\n","authors":["Philip Müller","Felix Meissen","Johannes Brandt","Georgios Kaissis","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2309.02578v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.02576v1","updated":"2023-09-05T20:54:41Z","published":"2023-09-05T20:54:41Z","title":"Emphysema Subtyping on Thoracic Computed Tomography Scans using Deep\n Neural Networks","summary":" Accurate identification of emphysema subtypes and severity is crucial for\neffective management of COPD and the study of disease heterogeneity. Manual\nanalysis of emphysema subtypes and severity is laborious and subjective. To\naddress this challenge, we present a deep learning-based approach for\nautomating the Fleischner Society's visual score system for emphysema subtyping\nand severity analysis. 
We trained and evaluated our algorithm using 9650\nsubjects from the COPDGene study. Our algorithm achieved the predictive\naccuracy at 52\\%, outperforming a previously published method's accuracy of\n45\\%. In addition, the agreement between the predicted scores of our method and\nthe visual scores was good, where the previous method obtained only moderate\nagreement. Our approach employs a regression training strategy to generate\ncategorical labels while simultaneously producing high-resolution localized\nactivation maps for visualizing the network predictions. By leveraging these\ndense activation maps, our method possesses the capability to compute the\npercentage of emphysema involvement per lung in addition to categorical\nseverity scores. Furthermore, the proposed method extends its predictive\ncapabilities beyond centrilobular emphysema to include paraseptal emphysema\nsubtypes.\n","authors":["Weiyi Xie","Colin Jacobs","Jean-Paul Charbonnier","Dirk Jan Slebos","Bram van Ginneken"],"pdf_url":"https://arxiv.org/pdf/2309.02576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02881v3","updated":"2023-09-05T20:44:04Z","published":"2023-07-06T09:36:45Z","title":"Probabilistic and Semantic Descriptions of Image Manifolds and Their\n Applications","summary":" This paper begins with a description of methods for estimating probability\ndensity functions for images that reflects the observation that such data is\nusually constrained to lie in restricted regions of the high-dimensional image\nspace - not every pattern of pixels is an image. It is common to say that\nimages lie on a lower-dimensional manifold in the high-dimensional space.\nHowever, although images may lie on such lower-dimensional manifolds, it is not\nthe case that all points on the manifold have an equal probability of being\nimages. Images are unevenly distributed on the manifold, and our task is to\ndevise ways to model this distribution as a probability distribution. In\npursuing this goal, we consider generative models that are popular in AI and\ncomputer vision community. For our purposes, generative/probabilistic models\nshould have the properties of 1) sample generation: it should be possible to\nsample from this distribution according to the modelled density function, and\n2) probability computation: given a previously unseen sample from the dataset\nof interest, one should be able to compute the probability of the sample, at\nleast up to a normalising constant. To this end, we investigate the use of\nmethods such as normalising flow and diffusion models. We then show that such\nprobabilistic descriptions can be used to construct defences against\nadversarial attacks. In addition to describing the manifold in terms of\ndensity, we also consider how semantic interpretations can be used to describe\npoints on the manifold. To this end, we consider an emergent language framework\nwhich makes use of variational encoders to produce a disentangled\nrepresentation of points that reside on a given manifold. 
Trajectories between\npoints on a manifold can then be described in terms of evolving semantic\ndescriptions.\n","authors":["Peter Tu","Zhaoyuan Yang","Richard Hartley","Zhiwei Xu","Jing Zhang","Yiwei Fu","Dylan Campbell","Jaskirat Singh","Tianyu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.02881v3.pdf","comment":"24 pages, 17 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.02563v1","updated":"2023-09-05T20:24:27Z","published":"2023-09-05T20:24:27Z","title":"Evaluation Kidney Layer Segmentation on Whole Slide Imaging using\n Convolutional Neural Networks and Transformers","summary":" The segmentation of kidney layer structures, including cortex, outer stripe,\ninner stripe, and inner medulla within human kidney whole slide images (WSI)\nplays an essential role in automated image analysis in renal pathology.\nHowever, the current manual segmentation process proves labor-intensive and\ninfeasible for handling the extensive digital pathology images encountered at a\nlarge scale. In response, the realm of digital renal pathology has seen the\nemergence of deep learning-based methodologies. However, very few, if any, deep\nlearning based approaches have been applied to kidney layer structure\nsegmentation. Addressing this gap, this paper assesses the feasibility of\napplying deep learning based approaches to kidney layer structure\nsegmentation. This study employs the representative convolutional neural\nnetwork (CNN) and Transformer segmentation approaches, including Swin-Unet,\nMedical-Transformer, TransUNet, U-Net, PSPNet, and DeepLabv3+. We\nquantitatively evaluated six prevalent deep learning models on renal cortex\nlayer segmentation using mice kidney WSIs. The empirical results stemming from\nour approach exhibit compelling advancements, as evidenced by a decent Mean\nIntersection over Union (mIoU) index. The results demonstrate that Transformer\nmodels generally outperform CNN-based models. By enabling a quantitative\nevaluation of renal cortical structures, deep learning approaches show promise\nfor empowering medical professionals to make more informed decisions in kidney\nlayer segmentation.\n","authors":["Muhao Liu","Chenyang Qi","Shunxing Bao","Quan Liu","Ruining Deng","Yu Wang","Shilin Zhao","Haichun Yang","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2309.02563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02562v1","updated":"2023-09-05T20:22:26Z","published":"2023-09-05T20:22:26Z","title":"Recurrence-Free Survival Prediction for Anal Squamous Cell Carcinoma\n Chemoradiotherapy using Planning CT-based Radiomics Model","summary":" Objectives: Approximately 30% of non-metastatic anal squamous cell carcinoma\n(ASCC) patients will experience recurrence after chemoradiotherapy (CRT), and\ncurrently available clinical variables are poor predictors of treatment\nresponse. We aimed to develop a model leveraging information extracted from\nradiation pretreatment planning CT to predict recurrence-free survival (RFS) in\nASCC patients after CRT. Methods: Radiomics features were extracted from\nplanning CT images of 96 ASCC patients. Following pre-feature selection, the\noptimal feature set was selected via step-forward feature selection with a\nmultivariate Cox proportional hazard model. The RFS prediction was generated\nfrom a radiomics-clinical combined model based on an optimal feature set with\nfive repeats of five-fold cross validation. The risk stratification ability of\nthe proposed model was evaluated with Kaplan-Meier analysis. 
Results: Shape-\nand texture-based radiomics features significantly predicted RFS. Compared to a\nclinical-only model, radiomics-clinical combined model achieves better\nperformance in the testing cohort with higher C-index (0.80 vs 0.73) and AUC\n(0.84 vs 0.79 for 1-year RFS, 0.84 vs 0.78 for 2-year RFS, and 0.86 vs 0.83 for\n3-year RFS), leading to distinctive high- and low-risk of recurrence groups\n(p<0.001). Conclusions: A treatment planning CT based radiomics and clinical\ncombined model had improved prognostic performance in predicting RFS for ASCC\npatients treated with CRT as compared to a model using clinical features only.\n","authors":["Shanshan Tang","Kai Wang","David Hein","Gloria Lin","Nina N. Sanford","Jing Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02561v1","updated":"2023-09-05T20:21:03Z","published":"2023-09-05T20:21:03Z","title":"Physically Grounded Vision-Language Models for Robotic Manipulation","summary":" Recent advances in vision-language models (VLMs) have led to improved\nperformance on tasks such as visual question answering and image captioning.\nConsequently, these models are now well-positioned to reason about the physical\nworld, particularly within domains such as robotic manipulation. However,\ncurrent VLMs are limited in their understanding of the physical concepts (e.g.,\nmaterial, fragility) of common objects, which restricts their usefulness for\nrobotic manipulation tasks that involve interaction and physical reasoning\nabout such objects. To address this limitation, we propose PhysObjects, an\nobject-centric dataset of 36.9K crowd-sourced and 417K automated physical\nconcept annotations of common household objects. We demonstrate that\nfine-tuning a VLM on PhysObjects improves its understanding of physical object\nconcepts, by capturing human priors of these concepts from visual appearance.\nWe incorporate this physically-grounded VLM in an interactive framework with a\nlarge language model-based robotic planner, and show improved planning\nperformance on tasks that require reasoning about physical object concepts,\ncompared to baselines that do not leverage physically-grounded VLMs. We\nadditionally illustrate the benefits of our physically-grounded VLM on a real\nrobot, where it improves task success rates. We release our dataset and provide\nfurther details and visualizations of our results at\nhttps://iliad.stanford.edu/pg-vlm/.\n","authors":["Jensen Gao","Bidipta Sarkar","Fei Xia","Ted Xiao","Jiajun Wu","Brian Ichter","Anirudha Majumdar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2309.02561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02556v1","updated":"2023-09-05T19:45:27Z","published":"2023-09-05T19:45:27Z","title":"Domain Adaptation for Efficiently Fine-tuning Vision Transformer with\n Encrypted Images","summary":" In recent years, deep neural networks (DNNs) trained with transformed data\nhave been applied to various applications such as privacy-preserving learning,\naccess control, and adversarial defenses. However, the use of transformed data\ndecreases the performance of models. Accordingly, in this paper, we propose a\nnovel method for fine-tuning models with transformed images under the use of\nthe vision transformer (ViT). The proposed domain adaptation method does not\ncause the accuracy degradation of models, and it is carried out on the basis of\nthe embedding structure of ViT. 
In experiments, we confirmed that the proposed\nmethod prevents accuracy degradation even when using encrypted images with the\nCIFAR-10 and CIFAR-100 datasets.\n","authors":["Teru Nagamori","Sayaka Shiota","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2309.02556v1.pdf","comment":"Accepted by APSIPA 2023"},{"id":"http://arxiv.org/abs/2309.02555v1","updated":"2023-09-05T19:45:09Z","published":"2023-09-05T19:45:09Z","title":"A Survey of the Impact of Self-Supervised Pretraining for Diagnostic\n Tasks with Radiological Images","summary":" Self-supervised pretraining has been observed to be effective at improving\nfeature representations for transfer learning, leveraging large amounts of\nunlabelled data. This review summarizes recent research into its usage in\nX-ray, computed tomography, magnetic resonance, and ultrasound imaging,\nconcentrating on studies that compare self-supervised pretraining to fully\nsupervised learning for diagnostic tasks such as classification and\nsegmentation. The most pertinent finding is that self-supervised pretraining\ngenerally improves downstream task performance compared to full supervision,\nmost prominently when unlabelled examples greatly outnumber labelled examples.\nBased on the aggregate evidence, recommendations are provided for practitioners\nconsidering using self-supervised learning. Motivated by limitations identified\nin current research, directions and practices for future study are suggested,\nsuch as integrating clinical knowledge with theoretically justified\nself-supervised learning methods, evaluating on public datasets, growing the\nmodest body of evidence for ultrasound, and characterizing the impact of\nself-supervised pretraining on generalization.\n","authors":["Blake VanBerlo","Jesse Hoey","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.02555v1.pdf","comment":"32 pages, 6 figures, a literature survey submitted to BMC Medical\n Imaging"},{"id":"http://arxiv.org/abs/2307.15778v2","updated":"2023-09-05T19:35:25Z","published":"2023-07-28T19:38:13Z","title":"Spherical and Hyperbolic Toric Topology-Based Codes On Graph Embedding\n for Ising MRF Models: Classical and Quantum Topology Machine Learning","summary":" The paper introduces the application of information geometry to describe the\nground states of Ising models by utilizing parity-check matrices of cyclic and\nquasi-cyclic codes on toric and spherical topologies. The approach establishes\na connection between machine learning and error-correcting coding. This\nproposed approach has implications for the development of new embedding methods\nbased on trapping sets. Statistical physics and number geometry are applied to\noptimize error-correcting codes, leading to these embedding and sparse\nfactorization methods. The paper establishes a direct connection between DNN\narchitecture and error-correcting coding by demonstrating how state-of-the-art\narchitectures (ChordMixer, Mega, Mega-chunk, CDIL, ...) from the long-range\narena can be equivalent to block and convolutional LDPC codes (Cage-graph,\nRepeat Accumulate). QC codes correspond to certain types of chemical elements,\nwith the carbon element being represented by the mixed automorphism\nShu-Lin-Fossorier QC-LDPC code. The connections between Belief Propagation and\nthe Permanent, Bethe-Permanent, Nishimori Temperature, and Bethe-Hessian Matrix\nare elaborated upon in detail. 
The Quantum Approximate Optimization Algorithm\n(QAOA) used in the Sherrington-Kirkpatrick Ising model can be seen as analogous\nto the back-propagation loss function landscape in training DNNs. This\nsimilarity creates a comparable problem with TS pseudo-codeword, resembling the\nbelief propagation method. Additionally, the layer depth in QAOA correlates to\nthe number of decoding belief propagation iterations in the Wiberg decoding\ntree. Overall, this work has the potential to advance multiple fields, from\nInformation Theory, DNN architecture design (sparse and structured prior graph\ntopology), efficient hardware design for Quantum and Classical DPU/TPU (graph,\nquantize and shift register architect.) to Materials Science and beyond.\n","authors":["Vasiliy Usatyuk","Sergey Egorov","Denis Sapozhnikov"],"pdf_url":"https://arxiv.org/pdf/2307.15778v2.pdf","comment":"71 pages, 42 Figures, 1 Table, 1 Appendix. arXiv admin note: text\n overlap with arXiv:2109.08184 by other authors"},{"id":"http://arxiv.org/abs/2305.00147v2","updated":"2023-09-05T19:12:44Z","published":"2023-04-29T01:39:08Z","title":"Visualizing chest X-ray dataset biases using GANs","summary":" Recent work demonstrates that images from various chest X-ray datasets\ncontain visual features that are strongly correlated with protected demographic\nattributes like race and gender. This finding raises issues of fairness, since\nsome of these factors may be used by downstream algorithms for clinical\npredictions. In this work, we propose a framework, using generative adversarial\nnetworks (GANs), to visualize what features are most different between X-rays\nbelonging to two demographic subgroups.\n","authors":["Hao Liang","Kevin Ni","Guha Balakrishnan"],"pdf_url":"https://arxiv.org/pdf/2305.00147v2.pdf","comment":"Medical Imaging with Deep Learning(MIDL) 2023"},{"id":"http://arxiv.org/abs/2308.16741v2","updated":"2023-09-05T18:53:39Z","published":"2023-08-31T13:59:35Z","title":"Socratis: Are large multimodal models emotionally aware?","summary":" Existing emotion prediction benchmarks contain coarse emotion labels which do\nnot consider the diversity of emotions that an image and text can elicit in\nhumans due to various reasons. Learning diverse reactions to multimodal content\nis important as intelligent machines take a central role in generating and\ndelivering content to society. To address this gap, we propose Socratis, a\nsocietal reactions benchmark, where each image-caption (IC) pair is annotated\nwith multiple emotions and the reasons for feeling them. Socratis contains 18K\nfree-form reactions for 980 emotions on 2075 image-caption pairs from 5\nwidely-read news and image-caption (IC) datasets. We benchmark the capability\nof state-of-the-art multimodal large language models to generate the reasons\nfor feeling an emotion given an IC pair. Based on a preliminary human study, we\nobserve that humans prefer human-written reasons over 2 times more often than\nmachine-generated ones. This shows our task is harder than standard generation\ntasks because it starkly contrasts recent findings where humans cannot tell\napart machine vs human-written news articles, for instance. We further see that\ncurrent captioning metrics based on large vision-language models also fail to\ncorrelate with human preferences. We hope that these findings and our benchmark\nwill inspire further research on training emotionally aware models.\n","authors":["Katherine Deng","Arijit Ray","Reuben Tan","Saadia Gabriel","Bryan A. 
Plummer","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2308.16741v2.pdf","comment":"ICCV 2023 WECIA"},{"id":"http://arxiv.org/abs/2309.02527v1","updated":"2023-09-05T18:40:14Z","published":"2023-09-05T18:40:14Z","title":"A skeletonization algorithm for gradient-based optimization","summary":" The skeleton of a digital image is a compact representation of its topology,\ngeometry, and scale. It has utility in many computer vision applications, such\nas image description, segmentation, and registration. However, skeletonization\nhas only seen limited use in contemporary deep learning solutions. Most\nexisting skeletonization algorithms are not differentiable, making it\nimpossible to integrate them with gradient-based optimization. Compatible\nalgorithms based on morphological operations and neural networks have been\nproposed, but their results often deviate from the geometry and topology of the\ntrue medial axis. This work introduces the first three-dimensional\nskeletonization algorithm that is both compatible with gradient-based\noptimization and preserves an object's topology. Our method is exclusively\nbased on matrix additions and multiplications, convolutional operations, basic\nnon-linear functions, and sampling from a uniform probability distribution,\nallowing it to be easily implemented in any major deep learning library. In\nbenchmarking experiments, we prove the advantages of our skeletonization\nalgorithm compared to non-differentiable, morphological, and\nneural-network-based baselines. Finally, we demonstrate the utility of our\nalgorithm by integrating it with two medical image processing applications that\nuse gradient-based optimization: deep-learning-based blood vessel segmentation,\nand multimodal registration of the mandible in computed tomography and magnetic\nresonance images.\n","authors":["Martin J. Menten","Johannes C. Paetzold","Veronika A. Zimmer","Suprosanna Shit","Ivan Ezhov","Robbie Holland","Monika Probst","Julia A. Schnabel","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2309.02527v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.15233v2","updated":"2023-09-05T18:21:16Z","published":"2023-03-27T14:15:17Z","title":"Text-to-Image Diffusion Models are Zero-Shot Classifiers","summary":" The excellent generative capabilities of text-to-image diffusion models\nsuggest they learn informative representations of image-text data. However,\nwhat knowledge their representations capture is not fully understood, and they\nhave not been thoroughly explored on downstream tasks. We investigate diffusion\nmodels by proposing a method for evaluating them as zero-shot classifiers. The\nkey idea is using a diffusion model's ability to denoise a noised image given a\ntext description of a label as a proxy for that label's likelihood. We apply\nour method to Stable Diffusion and Imagen, using it to probe fine-grained\naspects of the models' knowledge and comparing them with CLIP's zero-shot\nabilities. They perform competitively with CLIP on a wide range of zero-shot\nimage classification datasets. Additionally, they achieve state-of-the-art\nresults on shape/texture bias tests and can successfully perform attribute\nbinding while CLIP cannot. Although generative pre-training is prevalent in\nNLP, visual foundation models often use other methods such as contrastive\nlearning. 
Based on our findings, we argue that generative pre-training should\nbe explored as a compelling alternative for vision-language tasks.\n","authors":["Kevin Clark","Priyank Jaini"],"pdf_url":"https://arxiv.org/pdf/2303.15233v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05323v2","updated":"2023-09-05T18:03:51Z","published":"2023-01-12T22:33:01Z","title":"Salient Object Detection for Images Taken by People With Vision\n Impairments","summary":" Salient object detection is the task of producing a binary mask for an image\nthat deciphers which pixels belong to the foreground object versus background.\nWe introduce a new salient object detection dataset using images taken by\npeople who are visually impaired who were seeking to better understand their\nsurroundings, which we call VizWiz-SalientObject. Compared to seven existing\ndatasets, VizWiz-SalientObject is the largest (i.e., 32,000 human-annotated\nimages) and contains unique characteristics including a higher prevalence of\ntext in the salient objects (i.e., in 68\\% of images) and salient objects that\noccupy a larger ratio of the images (i.e., on average, $\\sim$50\\% coverage). We\nbenchmarked seven modern salient object detection methods on our dataset and\nfound they struggle most with images featuring salient objects that are large,\nhave less complex boundaries, and lack text as well as for lower quality\nimages. We invite the broader community to work on our new dataset challenge by\npublicly sharing the dataset at\nhttps://vizwiz.org/tasks-and-datasets/salient-object .\n","authors":["Jarek Reynolds","Chandra Kanth Nagesh","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2301.05323v2.pdf","comment":"Computer Vision and Pattern Recognition"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.02322v1","updated":"2023-09-05T15:41:26Z","published":"2023-09-05T15:41:26Z","title":"Fairness of Exposure in Dynamic Recommendation","summary":" Exposure bias is a well-known issue in recommender systems where the exposure\nis not fairly distributed among items in the recommendation results. This is\nespecially problematic when bias is amplified over time as a few items (e.g.,\npopular ones) are repeatedly over-represented in recommendation lists and\nusers' interactions with those items will amplify bias towards those items over\ntime resulting in a feedback loop. This issue has been extensively studied in\nthe literature in static recommendation environment where a single round of\nrecommendation result is processed to improve the exposure fairness. However,\nless work has been done on addressing exposure bias in a dynamic recommendation\nsetting where the system is operating over time, the recommendation model and\nthe input data are dynamically updated with ongoing user feedback on\nrecommended items at each round. In this paper, we study exposure bias in a\ndynamic recommendation setting. Our goal is to show that existing bias\nmitigation methods that are designed to operate in a static recommendation\nsetting are unable to satisfy fairness of exposure for items in long run. In\nparticular, we empirically study one of these methods and show that repeatedly\napplying this method fails to fairly distribute exposure among items in long\nrun. To address this limitation, we show how this method can be adapted to\neffectively operate in a dynamic recommendation setting and achieve exposure\nfairness for items in long run. 
Experiments on a real-world dataset confirm\nthat our solution is superior in achieving long-term exposure fairness for the\nitems while maintaining the recommendation accuracy.\n","authors":["Masoud Mansoury","Bamshad Mobasher"],"pdf_url":"https://arxiv.org/pdf/2309.02322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16609v2","updated":"2023-09-05T14:46:38Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, aiming at learning the graph-level representations for\neffective class assignments, has achieved outstanding results, which\nheavily rely on high-quality datasets that have a balanced class distribution.\nIn fact, most real-world graph data naturally presents a long-tailed form,\nwhere the head classes contain many more samples than the tail classes. It is\nthus essential to study graph-level classification over long-tailed data, which\nstill remains largely unexplored. However, most existing long-tailed\nlearning methods in vision fail to jointly optimize the representation\nlearning and classifier training, as well as neglect the mining of the\nhard-to-classify classes. Directly applying existing methods to graphs may lead\nto sub-optimal performance, since the model trained on graphs would be more\nsensitive to the long-tailed distribution due to the complex topological\ncharacteristics. Hence, in this paper, we propose a novel long-tailed\ngraph-level classification framework via Collaborative Multi-expert Learning\n(CoMe) to tackle the problem. To equilibrate the contributions of head and tail\nclasses, we first develop balanced contrastive learning from the view of\nrepresentation learning, and then design an individual-expert classifier\ntraining based on hard class mining. In addition, we execute gated fusion and\ndisentangled knowledge distillation among the multiple experts to promote the\ncollaboration in a multi-expert framework. Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v2.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2309.02251v1","updated":"2023-09-05T14:04:00Z","published":"2023-09-05T14:04:00Z","title":"STGIN: Spatial-Temporal Graph Interaction Network for Large-scale POI\n Recommendation","summary":" In Location-Based Services, Point-Of-Interest(POI) recommendation plays a\ncrucial role in both user experience and business opportunities. Graph neural\nnetworks have been proven effective in providing personalized POI\nrecommendation services. However, there are still two critical challenges.\nFirst, existing graph models attempt to capture users' diversified interests\nthrough a unified graph, which limits their ability to express interests in\nvarious spatial-temporal contexts. Second, the efficiency limitations of graph\nconstruction and graph sampling in large-scale systems make it difficult to\nadapt quickly to new real-time interests. To tackle the above challenges, we\npropose a novel Spatial-Temporal Graph Interaction Network. Specifically, we\nconstruct subgraphs of spatial, temporal, spatial-temporal, and global views\nrespectively to precisely characterize the user's interests in various\ncontexts. 
In addition, we design an industry-friendly framework to track the\nuser's latest interests. Extensive experiments on the real-world dataset show\nthat our method outperforms state-of-the-art models. This work has been\nsuccessfully deployed in a large e-commerce platform, delivering a 1.1% CTR and\n6.3% RPM improvement.\n","authors":["Shaohua Liu","Yu Qi","Gen Li","Mingjian Chen","Teng Zhang","Jia Cheng","Jun Lei"],"pdf_url":"https://arxiv.org/pdf/2309.02251v1.pdf","comment":"accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2306.00936v2","updated":"2023-09-05T13:36:27Z","published":"2023-06-01T17:39:40Z","title":"AMR4NLI: Interpretable and robust NLI measures from semantic graphs","summary":" The task of natural language inference (NLI) asks whether a given premise\n(expressed in NL) entails a given NL hypothesis. NLI benchmarks contain human\nratings of entailment, but the meaning relationships driving these ratings are\nnot formalized. Can the underlying sentence pair relationships be made more\nexplicit in an interpretable yet robust fashion? We compare semantic structures\nto represent premise and hypothesis, including sets of contextualized\nembeddings and semantic graphs (Abstract Meaning Representations), and measure\nwhether the hypothesis is a semantic substructure of the premise, utilizing\ninterpretable metrics. Our evaluation on three English benchmarks finds value\nin both contextualized embeddings and semantic graphs; moreover, they provide\ncomplementary signals, and can be leveraged together in a hybrid model.\n","authors":["Juri Opitz","Shira Wein","Julius Steen","Anette Frank","Nathan Schneider"],"pdf_url":"https://arxiv.org/pdf/2306.00936v2.pdf","comment":"International Conference on Computational Semantics (IWCS 2023); v2\n fixes an imprecise sentence below Eq. 5"},{"id":"http://arxiv.org/abs/2308.11127v2","updated":"2023-09-05T13:35:04Z","published":"2023-08-22T02:17:34Z","title":"How Expressive are Graph Neural Networks in Recommendation?","summary":" Graph Neural Networks (GNNs) have demonstrated superior performance on\nvarious graph learning tasks, including recommendation, where they leverage\nuser-item collaborative filtering signals in graphs. However, theoretical\nformulations of their capability are scarce, despite their empirical\neffectiveness in state-of-the-art recommender models. Recently, research has\nexplored the expressiveness of GNNs in general, demonstrating that message\npassing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that\nGNNs combined with random node initialization are universal. Nevertheless, the\nconcept of \"expressiveness\" for GNNs remains vaguely defined. Most existing\nworks adopt the graph isomorphism test as the metric of expressiveness, but\nthis graph-level task may not effectively assess a model's ability in\nrecommendation, where the objective is to distinguish nodes of different\ncloseness. In this paper, we provide a comprehensive theoretical analysis of\nthe expressiveness of GNNs in recommendation, considering three levels of\nexpressiveness metrics: graph isomorphism (graph-level), node automorphism\n(node-level), and topological closeness (link-level). We propose the\ntopological closeness metric to evaluate GNNs' ability to capture the\nstructural distance between nodes, which aligns closely with the objective of\nrecommendation. 
To validate the effectiveness of this new metric in evaluating\nrecommendation performance, we introduce a learning-less GNN algorithm that is\noptimal on the new metric and can be optimal on the node-level metric with\nsuitable modification. We conduct extensive experiments comparing the proposed\nalgorithm against various types of state-of-the-art GNN models to explore the\nexplainability of the new metric in the recommendation task. For\nreproducibility, implementation codes are available at\nhttps://github.com/HKUDS/GTE.\n","authors":["Xuheng Cai","Lianghao Xia","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11127v2.pdf","comment":"32nd ACM International Conference on Information and Knowledge\n Management (CIKM) 2023"},{"id":"http://arxiv.org/abs/2309.02094v1","updated":"2023-09-05T10:00:33Z","published":"2023-09-05T10:00:33Z","title":"TensorBank:Tensor Lakehouse for Foundation Model Training","summary":" Storing and streaming high dimensional data for foundation model training\nbecame a critical requirement with the rise of foundation models beyond natural\nlanguage. In this paper we introduce TensorBank, a petabyte scale tensor\nlakehouse capable of streaming tensors from Cloud Object Store (COS) to GPU\nmemory at wire speed based on complex relational queries. We use Hierarchical\nStatistical Indices (HSI) for query acceleration. Our architecture allows\ntensors to be addressed directly at the block level using HTTP range reads. Once in GPU\nmemory, data can be transformed using PyTorch transforms. We provide a generic\nPyTorch dataset type with a corresponding dataset factory translating\nrelational queries and requested transformations into an instance. By making use\nof the HSI, irrelevant blocks can be skipped without reading them as those\nindices contain statistics on their content at different hierarchical\nresolution levels. This is an opinionated architecture powered by open\nstandards and making heavy use of open-source technology. Although hardened\nfor production use with geospatial-temporal data, this architecture\ngeneralizes to other use cases like computer vision, computational neuroscience,\nbiological sequence analysis, and more.\n","authors":["Romeo Kienzler","Benedikt Blumenstiel","Zoltan Arnold Nagy","S. Karthik Mukkavilli","Johannes Schmude","Marcus Freitag","Michael Behrendt","Daniel Salles Civitarese","Hendrik Hamann"],"pdf_url":"https://arxiv.org/pdf/2309.02094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07946v3","updated":"2023-09-05T09:39:21Z","published":"2023-06-02T14:47:56Z","title":"STUDY: Socially Aware Temporally Causal Decoder Recommender Systems","summary":" Recommender systems are widely used to help people find items that are\ntailored to their interests. These interests are often influenced by social\nnetworks, making it important to use social network information effectively in\nrecommender systems. This is especially true for demographic groups with\ninterests that differ from the majority. This paper introduces STUDY, a\nSocially-aware Temporally caUsal Decoder recommender sYstem. STUDY introduces a\nnew socially-aware recommender system architecture that is significantly more\nefficient to learn and train than existing methods. STUDY performs joint\ninference over socially connected groups in a single forward pass of a modified\ntransformer decoder network. 
We demonstrate the benefits of STUDY in the\nrecommendation of books for students who are dyslexic, or struggling readers.\nDyslexic students often have difficulty engaging with reading material, making\nit critical to recommend books that are tailored to their interests. We worked\nwith our non-profit partner Learning Ally to evaluate STUDY on a dataset of\nstruggling readers. STUDY was able to generate recommendations that more\naccurately predicted student engagement, when compared with existing methods.\n","authors":["Eltayeb Ahmed","Diana Mincu","Lauren Harrell","Katherine Heller","Subhrajit Roy"],"pdf_url":"https://arxiv.org/pdf/2306.07946v3.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.02064v1","updated":"2023-09-05T09:06:34Z","published":"2023-09-05T09:06:34Z","title":"MvFS: Multi-view Feature Selection for Recommender System","summary":" Feature selection, which is a technique to select key features in recommender\nsystems, has received increasing research attention. Recently, Adaptive Feature\nSelection (AdaFS) has shown remarkable performance by adaptively selecting\nfeatures for each data instance, considering that the importance of a given\nfeature field can vary significantly across data. However, this method still\nhas limitations in that its selection process could be easily biased to major\nfeatures that frequently occur. To address these problems, we propose\nMulti-view Feature Selection (MvFS), which selects informative features for\neach instance more effectively. Most importantly, MvFS employs a multi-view\nnetwork consisting of multiple sub-networks, each of which learns to measure\nthe feature importance of a part of data with different feature patterns. By\ndoing so, MvFS promotes a more balanced feature selection process mitigating\nthe bias problem towards dominant patterns. Moreover, MvFS adopts an effective\nimportance score modeling strategy which is applied independently to each field\nwithout incurring dependency among features. Experimental results on real-world\ndatasets demonstrate the effectiveness of MvFS compared to state-of-the-art\nbaselines.\n","authors":["Youngjune Lee","Yeongjong Jeong","Keunchan Park","SeongKu Kang"],"pdf_url":"https://arxiv.org/pdf/2309.02064v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2109.03459v2","updated":"2023-09-05T09:06:10Z","published":"2021-09-08T07:00:45Z","title":"Dual Correction Strategy for Ranking Distillation in Top-N Recommender\n System","summary":" Knowledge Distillation (KD), which transfers the knowledge of a well-trained\nlarge model (teacher) to a small model (student), has become an important area\nof research for practical deployment of recommender systems. Recently, Relaxed\nRanking Distillation (RRD) has shown that distilling the ranking information in\nthe recommendation list significantly improves the performance. However, the\nmethod still has limitations in that 1) it does not fully utilize the\nprediction errors of the student model, which makes the training not fully\nefficient, and 2) it only distills the user-side ranking information, which\nprovides an insufficient view under the sparse implicit feedback. This paper\npresents Dual Correction strategy for Distillation (DCD), which transfers the\nranking information from the teacher model to the student model in a more\nefficient manner. Most importantly, DCD uses the discrepancy between the\nteacher model and the student model predictions to decide which knowledge to be\ndistilled. 
By doing so, DCD essentially provides the learning guidance tailored\nto \"correcting\" what the student model has failed to accurately predict. This\nprocess is applied for transferring the ranking information from the user-side\nas well as the item-side to address sparse implicit user feedback. Our\nexperiments show that the proposed method outperforms the state-of-the-art\nbaselines, and ablation studies validate the effectiveness of each component.\n","authors":["Youngjune Lee","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2109.03459v2.pdf","comment":"CIKM 2021"},{"id":"http://arxiv.org/abs/2309.02061v1","updated":"2023-09-05T09:01:47Z","published":"2023-09-05T09:01:47Z","title":"Scenario-Aware Hierarchical Dynamic Network for Multi-Scenario\n Recommendation","summary":" Click-Through Rate (CTR) prediction is a fundamental technique in\nrecommendation and advertising systems. Recent studies have shown that\nimplementing multi-scenario recommendations contributes to strengthening\ninformation sharing and improving overall performance. However, existing\nmulti-scenario models only consider coarse-grained explicit scenario modeling\nthat depends on pre-defined scenario identification from manual prior rules,\nwhich is biased and sub-optimal. To address these limitations, we propose a\nScenario-Aware Hierarchical Dynamic Network for Multi-Scenario Recommendations\n(HierRec), which perceives implicit patterns adaptively and conducts explicit\nand implicit scenario modeling jointly. In particular, HierRec designs a basic\nscenario-oriented module based on the dynamic weight to capture\nscenario-specific information. Then the hierarchical explicit and implicit\nscenario-aware modules are proposed to model hybrid-grained scenario\ninformation. The multi-head implicit modeling design contributes to perceiving\ndistinctive patterns from different perspectives. Our experiments on two public\ndatasets and real-world industrial applications on a mainstream online\nadvertising platform demonstrate that our HierRec outperforms existing models\nsignificantly.\n","authors":["Jingtong Gao","Bo Chen","Menghui Zhu","Xiangyu Zhao","Xiaopeng Li","Yuhao Wang","Yichao Wang","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2309.02061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02057v1","updated":"2023-09-05T08:58:46Z","published":"2023-09-05T08:58:46Z","title":"Robust Recommender System: A Survey and Future Directions","summary":" With the rapid growth of information, recommender systems have become\nintegral for providing personalized suggestions and overcoming information\noverload. However, their practical deployment often encounters \"dirty\" data,\nwhere noise or malicious information can lead to abnormal recommendations.\nResearch on improving recommender systems' robustness against such dirty data\nhas thus gained significant attention. This survey provides a comprehensive\nreview of recent work on recommender systems' robustness. We first present a\ntaxonomy to organize current techniques for withstanding malicious attacks and\nnatural noise. We then explore state-of-the-art methods in each category,\nincluding fraudster detection, adversarial training, certifiable robust\ntraining against malicious attacks, and regularization, purification,\nself-supervised learning against natural noise. Additionally, we summarize\nevaluation metrics and common datasets used to assess robustness. 
We discuss\nrobustness across varying recommendation scenarios and its interplay with other\nproperties like accuracy, interpretability, privacy, and fairness. Finally, we\ndelve into open issues and future research directions in this emerging field.\nOur goal is to equip readers with a holistic understanding of robust\nrecommender systems and spotlight pathways for future research and development.\n","authors":["Kaike Zhang","Qi Cao","Fei Sun","Yunfan Wu","Shuchang Tao","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.02057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02052v1","updated":"2023-09-05T08:52:46Z","published":"2023-09-05T08:52:46Z","title":"Towards Individual and Multistakeholder Fairness in Tourism Recommender\n Systems","summary":" This position paper summarizes our published review on individual and\nmultistakeholder fairness in Tourism Recommender Systems (TRS). Recently, there\nhas been growing attention to fairness considerations in recommender systems\n(RS). It has been acknowledged in research that fairness in RS is often closely\ntied to the presence of multiple stakeholders, such as end users, item\nproviders, and platforms, as it raises concerns for the fair treatment of all\nparties involved. Hence, fairness in RS is a multi-faceted concept that\nrequires consideration of the perspectives and needs of the different\nstakeholders to ensure fair outcomes for them. However, there may often be\ninstances where achieving the goals of one stakeholder could conflict with\nthose of another, resulting in trade-offs.\n In this paper, we emphasized addressing the unique challenges of ensuring\nfairness in RS within the tourism domain. We aimed to discuss potential\nstrategies for mitigating the aforementioned challenges and examine the\napplicability of solutions from other domains to tackle fairness issues in\ntourism. By exploring cross-domain approaches and strategies for incorporating\nS-Fairness, we can uncover valuable insights and determine how these solutions\ncan be adapted and implemented effectively in the context of tourism to enhance\nfairness in RS.\n","authors":["Ashmi Banerjee","Paromita Banik","Wolfgang Wörndl"],"pdf_url":"https://arxiv.org/pdf/2309.02052v1.pdf","comment":"Position Paper for FAcctRec 2023 at RecSys 2023"},{"id":"http://arxiv.org/abs/2305.04891v3","updated":"2023-09-05T07:24:00Z","published":"2023-05-03T12:34:45Z","title":"DELTA: Dynamic Embedding Learning with Truncated Conscious Attention for\n CTR Prediction","summary":" Click-Through Rate (CTR) prediction is a pivotal task in product and content\nrecommendation, where learning effective feature embeddings is of great\nsignificance. However, traditional methods typically learn fixed feature\nrepresentations without dynamically refining feature representations according\nto the context information, leading to suboptimal performance. Some recent\napproaches attempt to address this issue by learning bit-wise weights or\naugmented embeddings for feature representations, but suffer from uninformative\nor redundant features in the context. To tackle this problem, inspired by the\nGlobal Workspace Theory in conscious processing, which posits that only a\nspecific subset of the product features are pertinent while the rest can be\nnoisy and even detrimental to human-click behaviors, we propose a CTR model\nthat enables Dynamic Embedding Learning with Truncated Conscious Attention for\nCTR prediction, termed DELTA. 
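Since DELTA is introduced here as "Dynamic Embedding Learning with Truncated Conscious Attention" (its components are spelled out in the next chunk of the abstract), a minimal sketch of adaptively truncating attention over feature-field embeddings follows. The curriculum schedule and the exact truncation rule from the paper are not reproduced; `keep_k` is a placeholder assumption.

```python
import torch
import torch.nn.functional as F

def truncated_attention(field_emb, keep_k=4):
    """Self-attention over CTR feature fields, keeping only the top-k weights.

    field_emb: (batch, n_fields, dim) embeddings of the feature fields.
    Attention weights outside the top-k per query are zeroed and the rest
    renormalized, a crude stand-in for 'conscious truncation'.
    """
    scores = field_emb @ field_emb.transpose(1, 2) / field_emb.size(-1) ** 0.5
    attn = F.softmax(scores, dim=-1)                      # (B, F, F)
    topk = torch.topk(attn, keep_k, dim=-1)
    mask = torch.zeros_like(attn).scatter_(-1, topk.indices, 1.0)
    attn = attn * mask
    attn = attn / attn.sum(dim=-1, keepdim=True)          # renormalize
    return attn @ field_emb                               # refined embeddings

emb = torch.randn(2, 10, 8)            # 2 instances, 10 fields, dim 8
print(truncated_attention(emb).shape)  # torch.Size([2, 10, 8])
```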
DELTA contains two key components: (I) conscious\ntruncation module (CTM), which utilizes curriculum learning to apply adaptive\ntruncation on attention weights to select the most critical feature in the\ncontext; (II) explicit embedding optimization (EEO), which applies an auxiliary\ntask during training that directly and independently propagates the gradient\nfrom the loss layer to the embedding layer, thereby optimizing the embedding\nexplicitly via linear feature crossing. Extensive experiments on five\nchallenging CTR datasets demonstrate that DELTA achieves new state-of-art\nperformance among current CTR methods.\n","authors":["Chen Zhu","Liang Du","Hong Chen","Shuang Zhao","Zixun Sun","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.04891v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02841v3","updated":"2023-09-05T03:00:13Z","published":"2023-06-05T12:46:40Z","title":"CTRL: Connect Collaborative and Language Model for CTR Prediction","summary":" Traditional click-through rate (CTR) prediction models convert the tabular\ndata into one-hot vectors and leverage the collaborative relations among\nfeatures for inferring user's preference over items. This modeling paradigm\ndiscards essential semantic information. Though some works like P5 and M6-Rec\nhave explored the potential of using Pre-trained Language Models (PLMs) to\nextract semantic signals for CTR prediction, they are computationally expensive\nand suffer from low efficiency. Besides, the beneficial collaborative relations\nare not considered, hindering the recommendation performance. To solve these\nproblems, in this paper, we propose a novel framework \\textbf{CTRL}, which is\nindustrial friendly and model-agnostic with superior inference efficiency.\nSpecifically, the original tabular data is first converted into textual data.\nBoth tabular data and converted textual data are regarded as two different\nmodalities and are separately fed into the collaborative CTR model and\npre-trained language model. A cross-modal knowledge alignment procedure is\nperformed to fine-grained align and integrate the collaborative and semantic\nsignals, and the lightweight collaborative model can be deployed online for\nefficient serving after fine-tuned with supervised signals. Experimental\nresults on three public datasets show that CTRL outperforms the\nstate-of-the-art (SOTA) CTR models significantly. Moreover, we further verify\nits effectiveness on a large-scale industrial recommender system.\n","authors":["Xiangyang Li","Bo Chen","Lu Hou","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2306.02841v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11963v3","updated":"2023-09-05T01:57:09Z","published":"2023-06-21T01:22:43Z","title":"Multimodality Fusion for Smart Healthcare: a Journey from Data,\n Information, Knowledge to Wisdom","summary":" Multimodal medical data fusion has emerged as a transformative approach in\nsmart healthcare, enabling a comprehensive understanding of patient health and\npersonalized treatment plans. In this paper, a journey from data, information,\nand knowledge to wisdom (DIKW) is explored through multimodal fusion for smart\nhealthcare. A comprehensive review of multimodal medical data fusion focuses on\nthe integration of various data modalities are presented. It explores different\napproaches such as Feature selection, Rule-based systems, Machine learning,\nDeep learning, and Natural Language Processing for fusing and analyzing\nmultimodal data. 
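For the CTRL framework summarized a little earlier in this listing, the cross-modal knowledge alignment between the collaborative CTR model and the language model can be pictured as a contrastive objective over paired embeddings. The abstract does not spell out the objective, so the symmetric InfoNCE-style loss below is an illustrative assumption, not CTRL's actual implementation.

```python
import torch
import torch.nn.functional as F

def cross_modal_alignment_loss(collab_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE loss between collaborative and textual embeddings
    of the same instances (row i of each tensor describes the same sample)."""
    collab = F.normalize(collab_emb, dim=-1)
    text = F.normalize(text_emb, dim=-1)
    logits = collab @ text.t() / temperature          # (B, B) similarity matrix
    targets = torch.arange(logits.size(0))
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

loss = cross_modal_alignment_loss(torch.randn(32, 64), torch.randn(32, 64))
print(float(loss))
```

After alignment, only the lightweight collaborative tower would be kept for serving, which is the efficiency argument the abstract makes.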
The paper also highlights the challenges associated with\nmultimodal fusion in healthcare. By synthesizing the reviewed frameworks and\ninsights, a generic framework for multimodal medical data fusion is proposed\nwhile aligning with the DIKW mechanism. Moreover, it discusses future\ndirections aligned with the four pillars of healthcare: Predictive, Preventive,\nPersonalized, and Participatory approaches based on the DIKW and the generic\nframework. The components from this comprehensive survey form the foundation\nfor the successful implementation of multimodal fusion in smart healthcare. The\nfindings of this survey can guide researchers and practitioners in leveraging\nthe power of multimodal fusion with the approaches to revolutionize healthcare\nand improve patient outcomes.\n","authors":["Thanveer Shaik","Xiaohui Tao","Lin Li","Haoran Xie","Juan D. Velásquez"],"pdf_url":"https://arxiv.org/pdf/2306.11963v3.pdf","comment":"This work has been submitted to the ELSEVIER for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible"},{"id":"http://arxiv.org/abs/2309.02550v1","updated":"2023-09-05T19:36:22Z","published":"2023-09-05T19:36:22Z","title":"Tidying Up the Conversational Recommender Systems' Biases","summary":" The growing popularity of language models has sparked interest in\nconversational recommender systems (CRS) within both industry and research\ncircles. However, concerns regarding biases in these systems have emerged.\nWhile individual components of CRS have been subject to bias studies, a\nliterature gap remains in understanding specific biases unique to CRS and how\nthese biases may be amplified or reduced when integrated into complex CRS\nmodels. In this paper, we provide a concise review of biases in CRS by\nsurveying recent literature. We examine the presence of biases throughout the\nsystem's pipeline and consider the challenges that arise from combining\nmultiple models. Our study investigates biases in classic recommender systems\nand their relevance to CRS. Moreover, we address specific biases in CRS,\nconsidering variations with and without natural language understanding\ncapabilities, along with biases related to dialogue systems and language\nmodels. Through our findings, we highlight the necessity of adopting a holistic\nperspective when dealing with biases in complex CRS models.\n","authors":["Armin Moradi","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2309.02550v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.02435v1","updated":"2023-09-05T17:59:45Z","published":"2023-09-05T17:59:45Z","title":"Efficient RL via Disentangled Environment and Agent Representations","summary":" Agents that are aware of the separation between themselves and their\nenvironments can leverage this understanding to form effective representations\nof visual input. We propose an approach for learning such structured\nrepresentations for RL algorithms, using visual knowledge of the agent, such as\nits shape or mask, which is often inexpensive to obtain. This is incorporated\ninto the RL objective using a simple auxiliary loss. We show that our method,\nStructured Environment-Agent Representations, outperforms state-of-the-art\nmodel-free approaches over 18 different challenging visual simulation\nenvironments spanning 5 different robots. 
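The SEAR abstract above attributes its gains to a "simple auxiliary loss" that uses cheap visual knowledge of the agent such as its mask. A hedged sketch of one way such a loss could be wired in: a small segmentation head predicts the agent mask from shared encoder features, and its BCE loss is added to the RL loss. The head architecture and `aux_weight` are assumptions, not the paper's choices.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class AgentMaskHead(nn.Module):
    """Predicts a per-pixel agent mask from encoder features (auxiliary task)."""

    def __init__(self, in_channels):
        super().__init__()
        self.head = nn.Conv2d(in_channels, 1, kernel_size=1)

    def forward(self, features):
        return self.head(features)  # (B, 1, H, W) mask logits

def total_loss(rl_loss, mask_logits, agent_mask, aux_weight=0.1):
    # agent_mask: ground-truth binary mask of the agent, often cheap to obtain.
    aux = F.binary_cross_entropy_with_logits(mask_logits, agent_mask)
    return rl_loss + aux_weight * aux

head = AgentMaskHead(in_channels=32)
feats = torch.randn(4, 32, 16, 16)
mask = (torch.rand(4, 1, 16, 16) > 0.5).float()
print(float(total_loss(torch.tensor(1.0), head(feats), mask)))
```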
Website at https://sear-rl.github.io/\n","authors":["Kevin Gmelin","Shikhar Bahl","Russell Mendonca","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2309.02435v1.pdf","comment":"ICML 2023. Website at https://sear-rl.github.io/"},{"id":"http://arxiv.org/abs/2309.02429v1","updated":"2023-09-05T17:57:31Z","published":"2023-09-05T17:57:31Z","title":"Building a Winning Team: Selecting Source Model Ensembles using a\n Submodular Transferability Estimation Approach","summary":" Estimating the transferability of publicly available pretrained models to a\ntarget task has assumed an important place for transfer learning tasks in\nrecent years. Existing efforts propose metrics that allow a user to choose one\nmodel from a pool of pre-trained models without having to fine-tune each model\nindividually and identify one explicitly. With the growth in the number of\navailable pre-trained models and the popularity of model ensembles, it also\nbecomes essential to study the transferability of multiple-source models for a\ngiven target task. The few existing efforts study transferability in such\nmulti-source ensemble settings using just the outputs of the classification\nlayer and neglect possible domain or task mismatch. Moreover, they overlook the\nmost important factor while selecting the source models, viz., the cohesiveness\nfactor between them, which can impact the performance and confidence in the\nprediction of the ensemble. To address these gaps, we propose a novel Optimal\ntranSport-based suBmOdular tRaNsferability metric (OSBORN) to estimate the\ntransferability of an ensemble of models to a downstream task. OSBORN\ncollectively accounts for image domain difference, task difference, and\ncohesiveness of models in the ensemble to provide reliable estimates of\ntransferability. We gauge the performance of OSBORN on both image\nclassification and semantic segmentation tasks. Our setup includes 28 source\ndatasets, 11 target datasets, 5 model architectures, and 2 pre-training\nmethods. We benchmark our method against current state-of-the-art metrics\nMS-LEEP and E-LEEP, and outperform them consistently using the proposed\napproach.\n","authors":["Vimal K B","Saketh Bachu","Tanmay Garg","Niveditha Lakshmi Narasimhan","Raghavan Konuru","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2309.02429v1.pdf","comment":"To appear at ICCV 2023"},{"id":"http://arxiv.org/abs/2302.03023v4","updated":"2023-09-05T17:56:42Z","published":"2023-02-06T18:58:38Z","title":"V1T: large-scale mouse V1 response prediction using a Vision Transformer","summary":" Accurate predictive models of the visual cortex neural response to natural\nvisual stimuli remain a challenge in computational neuroscience. In this work,\nwe introduce V1T, a novel Vision Transformer based architecture that learns a\nshared visual and behavioral representation across animals. We evaluate our\nmodel on two large datasets recorded from mouse primary visual cortex and\noutperform previous convolution-based models by more than 12.7% in prediction\nperformance. Moreover, we show that the self-attention weights learned by the\nTransformer correlate with the population receptive fields. Our model thus sets\na new benchmark for neural response prediction and can be used jointly with\nbehavioral and neural recordings to reveal meaningful characteristic features\nof the visual cortex.\n","authors":["Bryan M. Li","Isabel M. Cornacchia","Nathalie L. 
Rochefort","Arno Onken"],"pdf_url":"https://arxiv.org/pdf/2302.03023v4.pdf","comment":"updated references and added link to code repository; add analysis on\n generalization and visualize aRFs; updated with TMLR publication"},{"id":"http://arxiv.org/abs/2309.02428v1","updated":"2023-09-05T17:56:22Z","published":"2023-09-05T17:56:22Z","title":"Tensorization: Creating and Utilising Multidimensional Datasets for\n Multiway Analysis and Tensorised Deep Neural Networks -- Python Tutorial and\n Survey","summary":" As the size and complexity of data continue to increase, the need for\nefficient and effective analysis methods becomes ever more crucial.\nTensorization, the process of converting 2-dimensional datasets into\nmultidimensional structures, has emerged as a promising approach for multiway\nanalysis methods. This paper explores the steps involved in tensorization,\nmultidimensional data sources, various multiway analysis methods employed, and\nthe benefits of these approaches. A small example of Blind Source Separation\n(BSS) is presented comparing 2-dimensional algorithms and a multiway algorithm\nin Python. Results indicate that multiway analysis is more expressive.\nAdditionally, tensorization techniques aid in compressing deep learning models\nby reducing the number of required parameters while enhancing the expression of\nrelationships across dimensions. A survey of the multi-away analysis methods\nand integration with various Deep Neural Networks models is presented using\ncase studies in different domains.\n","authors":["Manal Helal"],"pdf_url":"https://arxiv.org/pdf/2309.02428v1.pdf","comment":"29 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.02427v1","updated":"2023-09-05T17:56:20Z","published":"2023-09-05T17:56:20Z","title":"Cognitive Architectures for Language Agents","summary":" Recent efforts have incorporated large language models (LLMs) with external\nresources (e.g., the Internet) or internal control flows (e.g., prompt\nchaining) for tasks requiring grounding or reasoning. However, these efforts\nhave largely been piecemeal, lacking a systematic framework for constructing a\nfully-fledged language agent. To address this challenge, we draw on the rich\nhistory of agent design in symbolic artificial intelligence to develop a\nblueprint for a new wave of cognitive language agents. We first show that LLMs\nhave many of the same properties as production systems, and recent efforts to\nimprove their grounding or reasoning mirror the development of cognitive\narchitectures built around production systems. We then propose Cognitive\nArchitectures for Language Agents (CoALA), a conceptual framework to\nsystematize diverse methods for LLM-based reasoning, grounding, learning, and\ndecision making as instantiations of language agents in the framework. Finally,\nwe use the CoALA framework to highlight gaps and propose actionable directions\ntoward more capable language agents in the future.\n","authors":["Theodore Sumers","Shunyu Yao","Karthik Narasimhan","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2309.02427v1.pdf","comment":"16 pages of main content, 10 pages of references, 5 figures. Equal\n contribution among the first two authors, order decided by coin flip. 
A\n CoALA-based repo of recent work on language agents:\n https://github.com/ysymyth/awesome-language-agents"},{"id":"http://arxiv.org/abs/2309.02426v1","updated":"2023-09-05T17:54:37Z","published":"2023-09-05T17:54:37Z","title":"Monotone Tree-Based GAMI Models by Adapting XGBoost","summary":" Recent papers have used machine learning architecture to fit low-order\nfunctional ANOVA models with main effects and second-order interactions. These\nGAMI (GAM + Interaction) models are directly interpretable as the functional\nmain effects and interactions can be easily plotted and visualized.\nUnfortunately, it is not easy to incorporate the monotonicity requirement into\nthe existing GAMI models based on boosted trees, such as EBM (Lou et al. 2013)\nand GAMI-Lin-T (Hu et al. 2022). This paper considers models of the form\n$f(x)=\\sum_{j,k}f_{j,k}(x_j, x_k)$ and develops monotone tree-based GAMI\nmodels, called monotone GAMI-Tree, by adapting the XGBoost algorithm. It is\nstraightforward to fit a monotone model to $f(x)$ using the options in XGBoost.\nHowever, the fitted model is still a black box. We take a different approach:\ni) use a filtering technique to determine the important interactions, ii) fit a\nmonotone XGBoost algorithm with the selected interactions, and finally iii)\nparse and purify the results to get a monotone GAMI model. Simulated datasets\nare used to demonstrate the behaviors of mono-GAMI-Tree and EBM, both of which\nuse piecewise constant fits. Note that the monotonicity requirement is for the\nfull model. Under certain situations, the main effects will also be monotone.\nBut, as seen in the examples, the interactions will not be monotone.\n","authors":["Linwei Hu","Soroush Aramideh","Jie Chen","Vijayan N. Nair"],"pdf_url":"https://arxiv.org/pdf/2309.02426v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2309.02425v1","updated":"2023-09-05T17:53:10Z","published":"2023-09-05T17:53:10Z","title":"On the Minimax Regret in Online Ranking with Top-k Feedback","summary":" In online ranking, a learning algorithm sequentially ranks a set of items and\nreceives feedback on its ranking in the form of relevance scores. Since\nobtaining relevance scores typically involves human annotation, it is of great\ninterest to consider a partial feedback setting where feedback is restricted to\nthe top-$k$ items in the rankings. Chaudhuri and Tewari [2017] developed a\nframework to analyze online ranking algorithms with top $k$ feedback. A key\nelement in their work was the use of techniques from partial monitoring. In\nthis paper, we further investigate online ranking with top $k$ feedback and\nsolve some open problems posed by Chaudhuri and Tewari [2017]. We provide a\nfull characterization of minimax regret rates with the top $k$ feedback model\nfor all $k$ and for the following ranking performance measures: Pairwise Loss,\nDiscounted Cumulative Gain, and Precision@n. In addition, we give an efficient\nalgorithm that achieves the minimax regret rate for Precision@n.\n","authors":["Mingyuan Zhang","Ambuj Tewari"],"pdf_url":"https://arxiv.org/pdf/2309.02425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v2","updated":"2023-09-05T17:51:16Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained language models like ChatGPT have significantly improved code\ngeneration. 
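The monotone GAMI-Tree abstract above notes that fitting a monotone model is straightforward via XGBoost's built-in options. A minimal sketch using `monotone_constraints`, plus `interaction_constraints` to restrict which features may interact (mirroring step ii of the described pipeline); the filtering and purification steps are omitted, and the toy data and constraint values are made-up examples.

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(500, 3))
# Toy target: increasing in x0, decreasing in x1, with an (x0, x1) interaction.
y = 2 * X[:, 0] - X[:, 1] + X[:, 0] * X[:, 1] + 0.1 * rng.normal(size=500)

model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=2,
    monotone_constraints="(1,-1,0)",         # +1 increasing, -1 decreasing, 0 free
    interaction_constraints="[[0, 1], [2]]",  # only x0 and x1 may interact
)
model.fit(X, y)
print(model.predict(X[:5]))
```

As the abstract stresses, the constrained booster is still a black box; the GAMI-Tree contribution is parsing and purifying it into plottable main effects and interactions, which the sketch does not attempt.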
As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks. Moreover, in bioinformatics, generating\nfunctional programs poses additional notable challenges due to the amount of\ndomain knowledge, the need for complicated data operations, and intricate\nfunctional dependencies between the operations. Here, we present BioCoder, a\nbenchmark developed to evaluate existing pre-trained models in generating\nbioinformatics code. In relation to function-code generation, BioCoder covers\npotential package dependencies, class declarations, and global variables. It\nincorporates 1026 functions and 1243 methods in Python and Java from GitHub and\n253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing\nframework for evaluation, and we have applied it to evaluate many models\nincluding InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+,\nInstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes\nthe importance of domain knowledge, pragmatic code generation, and contextual\nunderstanding. Our dataset, benchmark, Docker images, and scripts required for\ntesting are all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02422v1","updated":"2023-09-05T17:51:00Z","published":"2023-09-05T17:51:00Z","title":"Maximum Mean Discrepancy Meets Neural Networks: The\n Radon-Kolmogorov-Smirnov Test","summary":" Maximum mean discrepancy (MMD) refers to a general class of nonparametric\ntwo-sample tests that are based on maximizing the mean difference over samples\nfrom one distribution $P$ versus another $Q$, over all choices of data\ntransformations $f$ living in some function space $\\mathcal{F}$. Inspired by\nrecent work that connects what are known as functions of $\\textit{Radon bounded\nvariation}$ (RBV) and neural networks (Parhi and Nowak, 2021, 2023), we study\nthe MMD defined by taking $\\mathcal{F}$ to be the unit ball in the RBV space of\na given smoothness order $k \\geq 0$. This test, which we refer to as the\n$\\textit{Radon-Kolmogorov-Smirnov}$ (RKS) test, can be viewed as a\ngeneralization of the well-known and classical Kolmogorov-Smirnov (KS) test to\nmultiple dimensions and higher orders of smoothness. It is also intimately\nconnected to neural networks: we prove that the witness in the RKS test -- the\nfunction $f$ achieving the maximum mean difference -- is always a ridge spline\nof degree $k$, i.e., a single neuron in a neural network. This allows us to\nleverage the power of modern deep learning toolkits to (approximately) optimize\nthe criterion that underlies the RKS test. We prove that the RKS test has\nasymptotically full power at distinguishing any distinct pair $P \\not= Q$ of\ndistributions, derive its asymptotic null distribution, and carry out extensive\nexperiments to elucidate the strengths and weakenesses of the RKS test versus\nthe more traditional kernel MMD test.\n","authors":["Seunghoon Paik","Michael Celentano","Alden Green","Ryan J. 
Tibshirani"],"pdf_url":"https://arxiv.org/pdf/2309.02422v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02417v1","updated":"2023-09-05T17:48:09Z","published":"2023-09-05T17:48:09Z","title":"Computing SHAP Efficiently Using Model Structure Information","summary":" SHAP (SHapley Additive exPlanations) has become a popular method to attribute\nthe prediction of a machine learning model on an input to its features. One\nmain challenge of SHAP is the computation time. An exact computation of Shapley\nvalues requires exponential time complexity. Therefore, many approximation\nmethods are proposed in the literature. In this paper, we propose methods that\ncan compute SHAP exactly in polynomial time or even faster for SHAP definitions\nthat satisfy our additivity and dummy assumptions (eg, kernal SHAP and baseline\nSHAP). We develop different strategies for models with different levels of\nmodel structure information: known functional decomposition, known order of\nmodel (defined as highest order of interaction in the model), or unknown order.\nFor the first case, we demonstrate an additive property and a way to compute\nSHAP from the lower-order functional components. For the second case, we derive\nformulas that can compute SHAP in polynomial time. Both methods yield exact\nSHAP results. Finally, if even the order of model is unknown, we propose an\niterative way to approximate Shapley values. The three methods we propose are\ncomputationally efficient when the order of model is not high which is\ntypically the case in practice. We compare with sampling approach proposed in\nCastor & Gomez (2008) using simulation studies to demonstrate the efficacy of\nour proposed methods.\n","authors":["Linwei Hu","Ke Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02417v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.02412v1","updated":"2023-09-05T17:40:54Z","published":"2023-09-05T17:40:54Z","title":"First and zeroth-order implementations of the regularized Newton method\n with lazy approximated Hessians","summary":" In this work, we develop first-order (Hessian-free) and zero-order\n(derivative-free) implementations of the Cubically regularized Newton method\nfor solving general non-convex optimization problems. For that, we employ\nfinite difference approximations of the derivatives. We use a special adaptive\nsearch procedure in our algorithms, which simultaneously fits both the\nregularization constant and the parameters of the finite difference\napproximations. It makes our schemes free from the need to know the actual\nLipschitz constants. Additionally, we equip our algorithms with the lazy\nHessian update that reuse a previously computed Hessian approximation matrix\nfor several iterations. Specifically, we prove the global complexity bound of\n$\\mathcal{O}( n^{1/2} \\epsilon^{-3/2})$ function and gradient evaluations for\nour new Hessian-free method, and a bound of $\\mathcal{O}( n^{3/2}\n\\epsilon^{-3/2} )$ function evaluations for the derivative-free method, where\n$n$ is the dimension of the problem and $\\epsilon$ is the desired accuracy for\nthe gradient norm. 
These complexity bounds significantly improve the previously\nknown ones in terms of the joint dependence on $n$ and $\\epsilon$, for the\nfirst-order and zeroth-order non-convex optimization.\n","authors":["Nikita Doikov","Geovani Nunes Grapiglia"],"pdf_url":"https://arxiv.org/pdf/2309.02412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02411v1","updated":"2023-09-05T17:40:34Z","published":"2023-09-05T17:40:34Z","title":"Delta-LoRA: Fine-Tuning High-Rank Parameters with the Delta of Low-Rank\n Matrices","summary":" In this paper, we present Delta-LoRA, which is a novel parameter-efficient\napproach to fine-tune large language models (LLMs). In contrast to LoRA and\nother low-rank adaptation methods such as AdaLoRA, Delta-LoRA not only updates\nthe low-rank matrices $\\bA$ and $\\bB$, but also propagate the learning to the\npre-trained weights $\\bW$ via updates utilizing the delta of the product of two\nlow-rank matrices ($\\bA^{(t+1)}\\bB^{(t+1)} - \\bA^{(t)}\\bB^{(t)}$). Such a\nstrategy effectively addresses the limitation that the incremental update of\nlow-rank matrices is inadequate for learning representations capable for\ndownstream tasks. Moreover, as the update of $\\bW$ does not need to compute the\ngradients of $\\bW$ and store their momentums, Delta-LoRA shares comparable\nmemory requirements and computational costs with LoRA. Extensive experiments\nshow that Delta-LoRA significantly outperforms existing low-rank adaptation\nmethods. We further support these results with comprehensive analyses that\nunderscore the effectiveness of Delta-LoRA.\n","authors":["Bojia Zi","Xianbiao Qi","Lingzhi Wang","Jianan Wang","Kam-Fai Wong","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.12100v3","updated":"2023-09-05T17:14:01Z","published":"2022-06-24T06:20:37Z","title":"zPROBE: Zero Peek Robustness Checks for Federated Learning","summary":" Privacy-preserving federated learning allows multiple users to jointly train\na model with coordination of a central server. The server only learns the final\naggregation result, thus the users' (private) training data is not leaked from\nthe individual model updates. However, keeping the individual updates private\nallows malicious users to perform Byzantine attacks and degrade the accuracy\nwithout being detected. Best existing defenses against Byzantine workers rely\non robust rank-based statistics, e.g., median, to find malicious updates.\nHowever, implementing privacy-preserving rank-based statistics is nontrivial\nand not scalable in the secure domain, as it requires sorting all individual\nupdates. We establish the first private robustness check that uses high break\npoint rank-based statistics on aggregated model updates. By exploiting\nrandomized clustering, we significantly improve the scalability of our defense\nwithout compromising privacy. We leverage our statistical bounds in\nzero-knowledge proofs to detect and remove malicious updates without revealing\nthe private user updates. Our novel framework, zPROBE, enables Byzantine\nresilient and secure federated learning. 
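The Delta-LoRA update rule quoted above is concrete enough to sketch directly: besides the usual gradient updates to the low-rank matrices A and B, the frozen weight W is nudged by the change in the product AB between consecutive steps, so no gradient or optimizer state for W is needed. The scaling factor `lam`, the toy sizes, and the squared-error objective below are placeholders, not the paper's settings.

```python
import torch

torch.manual_seed(0)
d, r, lam = 8, 2, 0.5
W = torch.randn(d, d)                          # pre-trained weight, kept frozen
A = torch.nn.Parameter(0.01 * torch.randn(d, r))
B = torch.nn.Parameter(torch.zeros(r, d))
opt = torch.optim.SGD([A, B], lr=1e-2)

x = torch.randn(16, d)
target = torch.randn(16, d)

for _ in range(5):
    prev_AB = (A @ B).detach()                 # product before this step
    out = x @ (W + A @ B)                      # adapted layer: W + low-rank delta
    loss = ((out - target) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()                                 # gradient step on A and B only
    with torch.no_grad():
        W += lam * (A @ B - prev_AB)           # Delta-LoRA style update of W
print(float(loss))
```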
Empirical evaluations demonstrate that\nzPROBE provides a low overhead solution to defend against state-of-the-art\nByzantine attacks while preserving privacy.\n","authors":["Zahra Ghodsi","Mojan Javaheripi","Nojan Sheybani","Xinqiao Zhang","Ke Huang","Farinaz Koushanfar"],"pdf_url":"https://arxiv.org/pdf/2206.12100v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02393v1","updated":"2023-09-05T17:04:09Z","published":"2023-09-05T17:04:09Z","title":"In-Ear-Voice: Towards Milli-Watt Audio Enhancement With Bone-Conduction\n Microphones for In-Ear Sensing Platforms","summary":" The recent ubiquitous adoption of remote conferencing has been accompanied by\nomnipresent frustration with distorted or otherwise unclear voice\ncommunication. Audio enhancement can compensate for low-quality input signals\nfrom, for example, small true wireless earbuds, by applying noise suppression\ntechniques. Such processing relies on voice activity detection (VAD) with low\nlatency and the added capability of discriminating the wearer's voice from\nothers - a task of significant computational complexity. The tight energy\nbudget of devices as small as modern earphones, however, requires any system\nattempting to tackle this problem to do so with minimal power and processing\noverhead, while not relying on speaker-specific voice samples and training due\nto usability concerns.\n This paper presents the design and implementation of a custom research\nplatform for low-power wireless earbuds based on novel, commercial, MEMS\nbone-conduction microphones. Such microphones can record the wearer's speech\nwith much greater isolation, enabling personalized voice activity detection and\nfurther audio enhancement applications. Furthermore, the paper accurately\nevaluates a proposed low-power personalized speech detection algorithm based on\nbone conduction data and a recurrent neural network running on the implemented\nresearch platform. This algorithm is compared to an approach based on\ntraditional microphone input. The performance of the bone conduction system,\nachieving detection of speech within 12.8ms at an accuracy of 95\\% is\nevaluated. Different SoC choices are contrasted, with the final implementation\nbased on the cutting-edge Ambiq Apollo 4 Blue SoC achieving 2.64mW average\npower consumption at 14uJ per inference, reaching 43h of battery life on a\nminiature 32mAh li-ion cell and without duty cycling.\n","authors":["Philipp Schilk","Niccolò Polvani","Andrea Ronco","Milos Cernak","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2309.02393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02390v1","updated":"2023-09-05T17:00:24Z","published":"2023-09-05T17:00:24Z","title":"Explaining grokking through circuit efficiency","summary":" One of the most surprising puzzles in neural network generalisation is\ngrokking: a network with perfect training accuracy but poor generalisation\nwill, upon further training, transition to perfect generalisation. We propose\nthat grokking occurs when the task admits a generalising solution and a\nmemorising solution, where the generalising solution is slower to learn but\nmore efficient, producing larger logits with the same parameter norm. We\nhypothesise that memorising circuits become more inefficient with larger\ntraining datasets while generalising circuits do not, suggesting there is a\ncritical dataset size at which memorisation and generalisation are equally\nefficient. 
We make and confirm four novel predictions about grokking, providing\nsignificant evidence in favour of our explanation. Most strikingly, we\ndemonstrate two novel and surprising behaviours: ungrokking, in which a network\nregresses from perfect to low test accuracy, and semi-grokking, in which a\nnetwork shows delayed generalisation to partial rather than perfect test\naccuracy.\n","authors":["Vikrant Varma","Rohin Shah","Zachary Kenton","János Kramár","Ramana Kumar"],"pdf_url":"https://arxiv.org/pdf/2309.02390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15605v2","updated":"2023-09-05T16:49:18Z","published":"2023-08-29T19:54:37Z","title":"Benchmarks for Detecting Measurement Tampering","summary":" When training powerful AI systems to perform complex tasks, it may be\nchallenging to provide training signals which are robust to optimization. One\nconcern is \\textit{measurement tampering}, where the AI system manipulates\nmultiple measurements to create the illusion of good results instead of\nachieving the desired outcome. In this work, we build four new text-based\ndatasets to evaluate measurement tampering detection techniques on large\nlanguage models. Concretely, given sets of text inputs and measurements aimed\nat determining if some outcome occurred, as well as a base model able to\naccurately predict measurements, the goal is to determine if examples where all\nmeasurements indicate the outcome occurred actually had the outcome occur, or\nif this was caused by measurement tampering. We demonstrate techniques that\noutperform simple baselines on most datasets, but don't achieve maximum\nperformance. We believe there is significant room for improvement for both\ntechniques and datasets, and we are excited for future work tackling\nmeasurement tampering.\n","authors":["Fabien Roger","Ryan Greenblatt","Max Nadeau","Buck Shlegeris","Nate Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.15605v2.pdf","comment":"Edit: extended and improved appendices"},{"id":"http://arxiv.org/abs/2305.10623v2","updated":"2023-09-05T16:34:10Z","published":"2023-05-18T00:21:04Z","title":"The star-shaped space of solutions of the spherical negative perceptron","summary":" Empirical studies on the landscape of neural networks have shown that\nlow-energy configurations are often found in complex connected structures,\nwhere zero-energy paths between pairs of distant solutions can be constructed.\nHere we consider the spherical negative perceptron, a prototypical non-convex\nneural network model framed as a continuous constraint satisfaction problem. We\nintroduce a general analytical method for computing energy barriers in the\nsimplex with vertex configurations sampled from the equilibrium. We find that\nin the over-parameterized regime the solution manifold displays simple\nconnectivity properties. There exists a large geodesically convex component\nthat is attractive for a wide range of optimization dynamics. Inside this\nregion we identify a subset of atypical high-margin solutions that are\ngeodesically connected with most other solutions, giving rise to a star-shaped\ngeometry. We analytically characterize the organization of the connected space\nof solutions and show numerical evidence of a transition, at larger constraint\ndensities, where the aforementioned simple geodesic connectivity breaks down.\n","authors":["Brandon Livio Annesi","Clarissa Lauditi","Carlo Lucibello","Enrico M. 
Malatesta","Gabriele Perugini","Fabrizio Pittorino","Luca Saglietti"],"pdf_url":"https://arxiv.org/pdf/2305.10623v2.pdf","comment":"27 pages, 16 figures, comments are welcome"},{"id":"http://arxiv.org/abs/2208.07737v3","updated":"2023-09-05T16:22:41Z","published":"2022-08-16T13:12:59Z","title":"Learning Efficient Abstract Planning Models that Choose What to Predict","summary":" An effective approach to solving long-horizon tasks in robotics domains with\ncontinuous state and action spaces is bilevel planning, wherein a high-level\nsearch over an abstraction of an environment is used to guide low-level\ndecision-making. Recent work has shown how to enable such bilevel planning by\nlearning abstract models in the form of symbolic operators and neural samplers.\nIn this work, we show that existing symbolic operator learning approaches fall\nshort in many robotics domains where a robot's actions tend to cause a large\nnumber of irrelevant changes in the abstract state. This is primarily because\nthey attempt to learn operators that exactly predict all observed changes in\nthe abstract state. To overcome this issue, we propose to learn operators that\n'choose what to predict' by only modelling changes necessary for abstract\nplanning to achieve specified goals. Experimentally, we show that our approach\nlearns operators that lead to efficient planning across 10 different hybrid\nrobotics domains, including 4 from the challenging BEHAVIOR-100 benchmark,\nwhile generalizing to novel initial states, goals, and objects.\n","authors":["Nishanth Kumar","Willie McClinton","Rohan Chitnis","Tom Silver","Tomás Lozano-Pérez","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2208.07737v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04613v3","updated":"2023-09-05T16:22:27Z","published":"2023-03-08T14:32:59Z","title":"The Descriptive Complexity of Graph Neural Networks","summary":" We analyse the power of graph neural networks (GNNs) in terms of Boolean\ncircuit complexity and descriptive complexity.\n We prove that the graph queries that can be computed by a polynomial-size\nbounded-depth family of GNNs are exactly those definable in the guarded\nfragment GFO+C of first-order logic with counting and with built-in relations.\nThis puts GNNs in the circuit complexity class TC^0. Remarkably, the GNN\nfamilies may use arbitrary real weights and a wide class of activation\nfunctions that includes the standard ReLU, logistic \"sigmod\", and hyperbolic\ntangent functions. If the GNNs are allowed to use random initialisation and\nglobal readout (both standard features of GNNs widely used in practice), they\ncan compute exactly the same queries as bounded depth Boolean circuits with\nthreshold gates, that is, exactly the queries in TC^0.\n Moreover, we show that queries computable by a single GNN with piecewise\nlinear activations and rational weights are definable in GFO+C without built-in\nrelations. Therefore, they are contained in uniform TC^0.\n","authors":["Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2303.04613v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05389v2","updated":"2023-09-05T16:14:56Z","published":"2023-05-06T14:40:20Z","title":"Two to Five Truths in Non-Negative Matrix Factorization","summary":" In this paper, we explore the role of matrix scaling on a matrix of counts\nwhen building a topic model using non-negative matrix factorization. 
We present\na scaling inspired by the normalized Laplacian (NL) for graphs that can greatly\nimprove the quality of a non-negative matrix factorization. The results\nparallel those in the spectral graph clustering work of \\cite{Priebe:2019},\nwhere the authors proved adjacency spectral embedding (ASE) spectral clustering\nwas more likely to discover core-periphery partitions and Laplacian Spectral\nEmbedding (LSE) was more likely to discover affinity partitions. In text\nanalysis non-negative matrix factorization (NMF) is typically used on a matrix\nof co-occurrence ``contexts'' and ``terms\" counts. The matrix scaling inspired\nby LSE gives significant improvement for text topic models in a variety of\ndatasets. We illustrate the dramatic difference a matrix scalings in NMF can\ngreatly improve the quality of a topic model on three datasets where human\nannotation is available. Using the adjusted Rand index (ARI), a measure cluster\nsimilarity we see an increase of 50\\% for Twitter data and over 200\\% for a\nnewsgroup dataset versus using counts, which is the analogue of ASE. For clean\ndata, such as those from the Document Understanding Conference, NL gives over\n40\\% improvement over ASE. We conclude with some analysis of this phenomenon\nand some connections of this scaling with other matrix scaling methods.\n","authors":["John M. Conroy","Neil P Molino","Brian Baughman","Rod Gomez","Ryan Kaliszewski","Nicholas A. Lines"],"pdf_url":"https://arxiv.org/pdf/2305.05389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02354v1","updated":"2023-09-05T16:11:37Z","published":"2023-09-05T16:11:37Z","title":"A Lightweight and Transferable Design for Robust LEGO Manipulation","summary":" LEGO is a well-known platform for prototyping pixelized objects. However,\nrobotic LEGO prototyping (i.e. manipulating LEGO bricks) is challenging due to\nthe tight connections and accuracy requirement. This paper investigates safe\nand efficient robotic LEGO manipulation. In particular, this paper reduces the\ncomplexity of the manipulation by hardware-software co-design. An end-of-arm\ntool (EOAT) is designed, which reduces the problem dimension and allows large\nindustrial robots to easily manipulate LEGO bricks. In addition, this paper\nuses evolution strategy to safely optimize the robot motion for LEGO\nmanipulation. Experiments demonstrate that the EOAT performs reliably in\nmanipulating LEGO bricks and the learning framework can effectively and safely\nimprove the manipulation performance to a 100\\% success rate. The co-design is\ndeployed to multiple robots (i.e. FANUC LR-mate 200id/7L and Yaskawa GP4) to\ndemonstrate its generalizability and transferability. In the end, we show that\nthe proposed solution enables sustainable robotic LEGO prototyping, in which\nthe robot can repeatedly assemble and disassemble different prototypes.\n","authors":["Ruixuan Liu","Yifan Sun","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02351v1","updated":"2023-09-05T16:07:00Z","published":"2023-09-05T16:07:00Z","title":"Exact Inference for Continuous-Time Gaussian Process Dynamics","summary":" Physical systems can often be described via a continuous-time dynamical\nsystem. In practice, the true system is often unknown and has to be learned\nfrom measurement data. Since data is typically collected in discrete time, e.g.\nby sensors, most methods in Gaussian process (GP) dynamics model learning are\ntrained on one-step ahead predictions. 
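The "Two to Five Truths" abstract above argues that a normalized-Laplacian-inspired scaling of the count matrix improves NMF topic models. The abstract does not give the exact scaling formula, so the sketch below assumes the usual symmetric division by square roots of row and column sums before running scikit-learn's NMF; it is an illustration of the idea, not the paper's implementation.

```python
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X = rng.poisson(1.0, size=(200, 500)).astype(float)   # documents x terms counts

# Normalized-Laplacian-inspired scaling: divide by sqrt of row and column sums.
row_sums = X.sum(axis=1, keepdims=True) + 1e-12
col_sums = X.sum(axis=0, keepdims=True) + 1e-12
X_scaled = X / np.sqrt(row_sums) / np.sqrt(col_sums)  # stays non-negative

W = NMF(n_components=10, init="nndsvd", max_iter=500).fit_transform(X_scaled)
topics = W.argmax(axis=1)                             # crude document clustering
print(topics[:20])
```

On real corpora the clustering from `W` would be compared against human labels with the adjusted Rand index, which is the evaluation the abstract reports.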
This can become problematic in several\nscenarios, e.g. if measurements are provided at irregularly-sampled time steps\nor physical system properties have to be conserved. Thus, we aim for a GP model\nof the true continuous-time dynamics. Higher-order numerical integrators\nprovide the necessary tools to address this problem by discretizing the\ndynamics function with arbitrary accuracy. Many higher-order integrators\nrequire dynamics evaluations at intermediate time steps making exact GP\ninference intractable. In previous work, this problem is often tackled by\napproximating the GP posterior with variational inference. However, exact GP\ninference is preferable in many scenarios, e.g. due to its mathematical\nguarantees. In order to make direct inference tractable, we propose to leverage\nmultistep and Taylor integrators. We demonstrate how to derive flexible\ninference schemes for these types of integrators. Further, we derive tailored\nsampling schemes that allow to draw consistent dynamics functions from the\nlearned posterior. This is crucial to sample consistent predictions from the\ndynamics model. We demonstrate empirically and theoretically that our approach\nyields an accurate representation of the continuous-time system.\n","authors":["Katharina Ensinger","Nicholas Tagliapietra","Sebastian Ziesche","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2309.02351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11368v2","updated":"2023-09-05T16:04:17Z","published":"2023-01-26T19:25:18Z","title":"Coincident Learning for Unsupervised Anomaly Detection","summary":" Anomaly detection is an important task for complex systems (e.g., industrial\nfacilities, manufacturing, large-scale science experiments), where failures in\na sub-system can lead to low yield, faulty products, or even damage to\ncomponents. While complex systems often have a wealth of data, labeled\nanomalies are typically rare (or even nonexistent) and expensive to acquire.\nUnsupervised approaches are therefore common and typically search for anomalies\neither by distance or density of examples in the input feature space (or some\nassociated low-dimensional representation). This paper presents a novel\napproach called CoAD, which is specifically designed for multi-modal tasks and\nidentifies anomalies based on \\textit{coincident} behavior across two different\nslices of the feature space. We define an \\textit{unsupervised} metric,\n$\\hat{F}_\\beta$, out of analogy to the supervised classification $F_\\beta$\nstatistic. CoAD uses $\\hat{F}_\\beta$ to train an anomaly detection algorithm on\n\\textit{unlabeled data}, based on the expectation that anomalous behavior in\none feature slice is coincident with anomalous behavior in the other. The\nmethod is illustrated using a synthetic outlier data set and a MNIST-based\nimage data set, and is compared to prior state-of-the-art on two real-world\ntasks: a metal milling data set and a data set from a particle accelerator.\n","authors":["Ryan Humble","Zhe Zhang","Finn O'Shea","Eric Darve","Daniel Ratner"],"pdf_url":"https://arxiv.org/pdf/2301.11368v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18798v2","updated":"2023-09-05T16:03:17Z","published":"2023-05-30T07:19:25Z","title":"AnoOnly: Semi-Supervised Anomaly Detection without Loss on Normal Data","summary":" Semi-supervised anomaly detection (SSAD) methods have demonstrated their\neffectiveness in enhancing unsupervised anomaly detection (UAD) by leveraging\nfew-shot but instructive abnormal instances. 
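The GP-dynamics abstract a little earlier proposes leaning on multistep integrators so that exact GP inference stays tractable. As a reminder of what a multistep discretization looks like in isolation, here is a two-step Adams-Bashforth integrator; in that setting `f` would be a sampled dynamics function from the learned GP posterior, whereas the toy linear dynamics below is just for demonstration.

```python
import numpy as np

def adams_bashforth2(f, x0, dt, n_steps):
    """Two-step Adams-Bashforth integration of dx/dt = f(x).

    The first step is bootstrapped with explicit Euler; afterwards each update
    uses the dynamics evaluated at the two most recent states only.
    """
    xs = [np.asarray(x0, dtype=float)]
    f_prev = f(xs[0])
    xs.append(xs[0] + dt * f_prev)                   # Euler bootstrap
    for _ in range(n_steps - 1):
        f_curr = f(xs[-1])
        xs.append(xs[-1] + dt * (1.5 * f_curr - 0.5 * f_prev))
        f_prev = f_curr
    return np.array(xs)

# Toy dynamics: damped rotation in the plane.
A = np.array([[-0.1, -1.0], [1.0, -0.1]])
traj = adams_bashforth2(lambda x: A @ x, x0=[1.0, 0.0], dt=0.1, n_steps=100)
print(traj[-1])
```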
However, the dominance of\nhomogeneous normal data over anomalies biases the SSAD models against\neffectively perceiving anomalies. To address this issue and achieve balanced\nsupervision between heavily imbalanced normal and abnormal data, we develop a\nnovel framework called AnoOnly (Anomaly Only). Unlike existing SSAD methods\nthat resort to strict loss supervision, AnoOnly suspends it and introduces a\nform of weak supervision for normal data. This weak supervision is instantiated\nthrough the utilization of batch normalization, which implicitly performs\ncluster learning on normal data. When integrated into existing SSAD methods,\nthe proposed AnoOnly demonstrates remarkable performance enhancements across\nvarious models and datasets, achieving new state-of-the-art performance.\nAdditionally, our AnoOnly is natively robust to label noise when suffering from\ndata contamination. Our code is publicly available at\nhttps://github.com/cool-xuan/AnoOnly.\n","authors":["Yixuan Zhou","Peiyu Yang","Yi Qu","Xing Xu","Zhe Sun","Andrzej Cichocki"],"pdf_url":"https://arxiv.org/pdf/2305.18798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02334v1","updated":"2023-09-05T15:54:09Z","published":"2023-09-05T15:54:09Z","title":"PolyLUT: Learning Piecewise Polynomials for Ultra-Low Latency FPGA\n LUT-based Inference","summary":" Field-programmable gate arrays (FPGAs) are widely used to implement deep\nlearning inference. Standard deep neural network inference involves the\ncomputation of interleaved linear maps and nonlinear activation functions.\nPrior work for ultra-low latency implementations has hardcoded the combination\nof linear maps and nonlinear activations inside FPGA lookup tables (LUTs). Our\nwork is motivated by the idea that the LUTs in an FPGA can be used to implement\na much greater variety of functions than this. In this paper, we propose a\nnovel approach to training neural networks for FPGA deployment using\nmultivariate polynomials as the basic building block. Our method takes\nadvantage of the flexibility offered by the soft logic, hiding the polynomial\nevaluation inside the LUTs with zero overhead. We show that by using polynomial\nbuilding blocks, we can achieve the same accuracy using considerably fewer\nlayers of soft logic than by using linear functions, leading to significant\nlatency and area improvements. We demonstrate the effectiveness of this\napproach in three tasks: network intrusion detection, jet identification at the\nCERN Large Hadron Collider, and handwritten digit recognition using the MNIST\ndataset.\n","authors":["Marta Andronic","George A. Constantinides"],"pdf_url":"https://arxiv.org/pdf/2309.02334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02333v1","updated":"2023-09-05T15:53:41Z","published":"2023-09-05T15:53:41Z","title":"Resilient VAE: Unsupervised Anomaly Detection at the SLAC Linac Coherent\n Light Source","summary":" Significant advances in utilizing deep learning for anomaly detection have\nbeen made in recent years. However, these methods largely assume the existence\nof a normal training set (i.e., uncontaminated by anomalies) or even a\ncompletely labeled training set. In many complex engineering systems, such as\nparticle accelerators, labels are sparse and expensive; in order to perform\nanomaly detection in these cases, we must drop these assumptions and utilize a\ncompletely unsupervised method. 
This paper introduces the Resilient Variational\nAutoencoder (ResVAE), a deep generative model specifically designed for anomaly\ndetection. ResVAE exhibits resilience to anomalies present in the training data\nand provides feature-level anomaly attribution. During the training process,\nResVAE learns the anomaly probability for each sample as well as each\nindividual feature, utilizing these probabilities to effectively disregard\nanomalous examples in the training data. We apply our proposed method to detect\nanomalies in the accelerator status at the SLAC Linac Coherent Light Source\n(LCLS). By utilizing shot-to-shot data from the beam position monitoring\nsystem, we demonstrate the exceptional capability of ResVAE in identifying\nvarious types of anomalies that are visible in the accelerator.\n","authors":["Ryan Humble","William Colocho","Finn O'Shea","Daniel Ratner","Eric Darve"],"pdf_url":"https://arxiv.org/pdf/2309.02333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02320v1","updated":"2023-09-05T15:40:13Z","published":"2023-09-05T15:40:13Z","title":"SeisCLIP: A seismology foundation model pre-trained by multi-modal data\n for multi-purpose seismic feature extraction","summary":" Training specific deep learning models for particular tasks is common across\nvarious domains within seismology. However, this approach encounters two\nlimitations: inadequate labeled data for certain tasks and limited\ngeneralization across regions. To address these challenges, we develop\nSeisCLIP, a seismology foundation model trained through contrastive learning\nfrom multi-modal data. It consists of a transformer encoder for extracting\ncrucial features from time-frequency seismic spectrum and an MLP encoder for\nintegrating the phase and source information of the same event. These encoders\nare jointly pre-trained on a vast dataset and the spectrum encoder is\nsubsequently fine-tuned on smaller datasets for various downstream tasks.\nNotably, SeisCLIP's performance surpasses that of baseline methods in event\nclassification, localization, and focal mechanism analysis tasks, employing\ndistinct datasets from different regions. In conclusion, SeisCLIP holds\nsignificant potential as a foundational model in the field of seismology,\npaving the way for innovative directions in foundation-model-based seismology\nresearch.\n","authors":["Xu Si","Xinming Wu","Hanlin Sheng","Jun Zhu","Zefeng Li"],"pdf_url":"https://arxiv.org/pdf/2309.02320v1.pdf","comment":"27 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.02317v1","updated":"2023-09-05T15:34:22Z","published":"2023-09-05T15:34:22Z","title":"A study on the impact of pre-trained model on Just-In-Time defect\n prediction","summary":" Previous researchers conducting Just-In-Time (JIT) defect prediction tasks\nhave primarily focused on the performance of individual pre-trained models,\nwithout exploring the relationship between different pre-trained models as\nbackbones. In this study, we build six models: RoBERTaJIT, CodeBERTJIT,\nBARTJIT, PLBARTJIT, GPT2JIT, and CodeGPTJIT, each with a distinct pre-trained\nmodel as its backbone. We systematically explore the differences and\nconnections between these models. Specifically, we investigate the performance\nof the models when using Commit code and Commit message as inputs, as well as\nthe relationship between training efficiency and model distribution among these\nsix models. Additionally, we conduct an ablation experiment to explore the\nsensitivity of each model to inputs. 
Furthermore, we investigate how the models\nperform in zero-shot and few-shot scenarios. Our findings indicate that each\nmodel based on different backbones shows improvements, and when the backbone's\npre-training model is similar, the training resources that need to be consumed\nare much more closer. We also observe that Commit code plays a significant role\nin defect detection, and different pre-trained models demonstrate better defect\ndetection ability with a balanced dataset under few-shot scenarios. These\nresults provide new insights for optimizing JIT defect prediction tasks using\npre-trained models and highlight the factors that require more attention when\nconstructing such models. Additionally, CodeGPTJIT and GPT2JIT achieved better\nperformance than DeepJIT and CC2Vec on the two datasets respectively under 2000\ntraining samples. These findings emphasize the effectiveness of\ntransformer-based pre-trained models in JIT defect prediction tasks, especially\nin scenarios with limited training data.\n","authors":["Yuxiang Guo","Xiaopeng Gao","Zhenyu Zhang","W. K. Chan","Bo Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.02317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02304v1","updated":"2023-09-05T15:13:48Z","published":"2023-09-05T15:13:48Z","title":"Graph Self-Contrast Representation Learning","summary":" Graph contrastive learning (GCL) has recently emerged as a promising approach\nfor graph representation learning. Some existing methods adopt the 1-vs-K\nscheme to construct one positive and K negative samples for each graph, but it\nis difficult to set K. For those methods that do not use negative samples, it\nis often necessary to add additional strategies to avoid model collapse, which\ncould only alleviate the problem to some extent. All these drawbacks will\nundoubtedly have an adverse impact on the generalizability and efficiency of\nthe model. In this paper, to address these issues, we propose a novel graph\nself-contrast framework GraphSC, which only uses one positive and one negative\nsample, and chooses triplet loss as the objective. Specifically, self-contrast\nhas two implications. First, GraphSC generates both positive and negative views\nof a graph sample from the graph itself via graph augmentation functions of\nvarious intensities, and use them for self-contrast. Second, GraphSC uses\nHilbert-Schmidt Independence Criterion (HSIC) to factorize the representations\ninto multiple factors and proposes a masked self-contrast mechanism to better\nseparate positive and negative samples. Further, Since the triplet loss only\noptimizes the relative distance between the anchor and its positive/negative\nsamples, it is difficult to ensure the absolute distance between the anchor and\npositive sample. Therefore, we explicitly reduced the absolute distance between\nthe anchor and positive sample to accelerate convergence. 
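The GraphSC abstract above ends by noting that a plain triplet loss only constrains relative distances, so the absolute anchor-positive distance is shrunk explicitly as well. A minimal sketch of that combined objective over graph-level embeddings; the weighting `alpha`, the margin, and the Euclidean distance are assumptions rather than the paper's exact choices.

```python
import torch
import torch.nn.functional as F

def self_contrast_loss(anchor, positive, negative, margin=1.0, alpha=0.5):
    """Triplet loss on (anchor, positive, negative) graph embeddings plus an
    explicit penalty on the absolute anchor-positive distance."""
    d_ap = F.pairwise_distance(anchor, positive)
    d_an = F.pairwise_distance(anchor, negative)
    triplet = F.relu(d_ap - d_an + margin)   # relative-distance constraint
    absolute = d_ap                          # pull positives close in absolute terms
    return (triplet + alpha * absolute).mean()

a, p, n = (torch.randn(16, 32) for _ in range(3))
print(float(self_contrast_loss(a, p, n)))
```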
Finally, we conduct\nextensive experiments to evaluate the performance of GraphSC against 19 other\nstate-of-the-art methods in both unsupervised and transfer learning settings.\n","authors":["Minjie Chen","Yao Cheng","Ye Wang","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2309.02304v1.pdf","comment":"ICDM 2023(Regular)"},{"id":"http://arxiv.org/abs/2302.01222v5","updated":"2023-09-05T15:02:04Z","published":"2023-02-02T17:03:08Z","title":"A novel automatic wind power prediction framework based on multi-time\n scale and temporal attention mechanisms","summary":" Wind energy is a widely distributed, renewable, and environmentally friendly\nenergy source that plays a crucial role in mitigating global warming and\naddressing energy shortages. Nevertheless, wind power generation is\ncharacterized by volatility, intermittence, and randomness, which hinder its\nability to serve as a reliable power source for the grid. Accurate wind power\nforecasting is crucial for developing a new power system that heavily relies on\nrenewable energy sources. However, traditional wind power forecasting systems\nprimarily focus on ultra-short-term or short-term forecasts, limiting their\nability to address the diverse adjustment requirements of the power system\nsimultaneously. To overcome these challenges, we propose an automatic framework\ncapable of forecasting wind power across multiple time scales. The framework is\nbased on the tree-structured Parzen estimator (TPE) and the temporal fusion\ntransformer (TFT) and can provide ultra-short-term, short-term and medium-term\nwind power forecasts. Our approach employs the TFT for wind power forecasting and\ncategorizes features based on their properties. Additionally, we introduce a\ngeneric algorithm to simultaneously fine-tune the hyperparameters of the\ndecomposition method and model. We evaluate the performance of our framework by\nconducting ablation experiments using three commonly used decomposition\nalgorithms and six state-of-the-art models for multi-time-scale forecasting.\nThe experimental results demonstrate that our proposed method considerably\nimproves prediction accuracy on the public dataset Engie\nhttps://opendata-renewables.engie.com. Compared to the second-best\nstate-of-the-art model, our approach exhibits a reduction of 31.75% and 28.74%\nin normalized mean absolute error (nMAE) for 24-hour forecasting, and 20.79%\nand 16.93% in nMAE for 48-hour forecasting, respectively.\n","authors":["Meiyu Jiang","Jun Shen","Xuetao Jiang","Lihui Luo","Rui Zhou","Qingguo Zhou"],"pdf_url":"https://arxiv.org/pdf/2302.01222v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14659v2","updated":"2023-09-05T15:00:23Z","published":"2023-08-28T15:41:30Z","title":"RESTORE: Graph Embedding Assessment Through Reconstruction","summary":" Following the success of Word2Vec embeddings, graph embeddings (GEs) have\ngained substantial traction. GEs are commonly generated and evaluated\nextrinsically on downstream applications, but intrinsic evaluations of the\noriginal graph properties in terms of topological structure and semantic\ninformation have been lacking. Understanding these will help identify the\ndeficiency of the various families of GE methods when vectorizing graphs in\nterms of preserving the relevant knowledge or learning incorrect knowledge. To\naddress this, we propose RESTORE, a framework for intrinsic GE assessment\nthrough graph reconstruction. 
We show that reconstructing the original graph\nfrom the underlying GEs yields insights into the relative amount of information\npreserved in a given vector form. We first introduce the graph reconstruction\ntask. We generate GEs from three GE families based on factorization methods,\nrandom walks, and deep learning (with representative algorithms from each\nfamily) on the CommonSense Knowledge Graph (CSKG). We analyze their\neffectiveness in preserving the (a) topological structure of node-level graph\nreconstruction with an increasing number of hops and (b) semantic information\non various word semantic and analogy tests. Our evaluations show that the deep\nlearning-based GE algorithm (SDNE) is overall better at preserving (a) with a\nmean average precision (mAP) of 0.54 and 0.35 for 2 and 3-hop reconstruction\nrespectively, while the factorization-based algorithm (HOPE) is better at\nencapsulating (b) with an average Euclidean distance of 0.14, 0.17, and 0.11\nfor 1, 2, and 3-hop reconstruction respectively. The modest performance of\nthese GEs leaves room for further research avenues on better graph\nrepresentation learning.\n","authors":["Hong Yung Yip","Chidaksh Ravuru","Neelabha Banerjee","Shashwat Jha","Amit Sheth","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2308.14659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02292v1","updated":"2023-09-05T14:55:09Z","published":"2023-09-05T14:55:09Z","title":"Inferring effective couplings with Restricted Boltzmann Machines","summary":" Generative models offer a direct way to model complex data. Among them,\nenergy-based models provide us with a neural network model that aims to\naccurately reproduce all statistical correlations observed in the data at the\nlevel of the Boltzmann weight of the model. However, one challenge is to\nunderstand the physical interpretation of such models. In this study, we\npropose a simple solution by implementing a direct mapping between the energy\nfunction of the Restricted Boltzmann Machine and an effective Ising spin\nHamiltonian that includes high-order interactions between spins. This mapping\nincludes interactions of all possible orders, going beyond the conventional\npairwise interactions typically considered in the inverse Ising approach, and\nallowing the description of complex datasets. Earlier work attempted to achieve\nthis goal, but the proposed mappings did not properly treat the complexity\nof the problem or did not contain direct prescriptions for practical\napplication. To validate our method, we perform several controlled numerical\nexperiments where the training samples are equilibrium samples of predefined\nmodels containing local external fields, two-body and three-body interactions\nin various low-dimensional topologies. 
The results demonstrate the\neffectiveness of our proposed approach in learning the correct interaction\nnetwork and pave the way for its application in modeling interesting datasets.\nWe also evaluate the quality of the inferred model based on different training\nmethods.\n","authors":["Aurélien Decelle","Cyril Furtlehner","Alfonso De Jesus Navas Gómez","Beatriz Seoane"],"pdf_url":"https://arxiv.org/pdf/2309.02292v1.pdf","comment":"16 figures, 22 pages"},{"id":"http://arxiv.org/abs/2308.16609v2","updated":"2023-09-05T14:46:38Z","published":"2023-08-31T10:12:32Z","title":"Towards Long-Tailed Recognition for Graph Classification via\n Collaborative Experts","summary":" Graph classification, aiming at learning the graph-level representations for\neffective class assignments, has received outstanding achievements, which\nheavily relies on high-quality datasets that have balanced class distribution.\nIn fact, most real-world graph data naturally presents a long-tailed form,\nwhere the head classes occupy much more samples than the tail classes, it thus\nis essential to study the graph-level classification over long-tailed data\nwhile still remaining largely unexplored. However, most existing long-tailed\nlearning methods in visions fail to jointly optimize the representation\nlearning and classifier training, as well as neglect the mining of the\nhard-to-classify classes. Directly applying existing methods to graphs may lead\nto sub-optimal performance, since the model trained on graphs would be more\nsensitive to the long-tailed distribution due to the complex topological\ncharacteristics. Hence, in this paper, we propose a novel long-tailed\ngraph-level classification framework via Collaborative Multi-expert Learning\n(CoMe) to tackle the problem. To equilibrate the contributions of head and tail\nclasses, we first develop balanced contrastive learning from the view of\nrepresentation learning, and then design an individual-expert classifier\ntraining based on hard class mining. In addition, we execute gated fusion and\ndisentangled knowledge distillation among the multiple experts to promote the\ncollaboration in a multi-expert framework. Comprehensive experiments are\nperformed on seven widely-used benchmark datasets to demonstrate the\nsuperiority of our method CoMe over state-of-the-art baselines.\n","authors":["Siyu Yi","Zhengyang Mao","Wei Ju","Yongdao Zhou","Luchen Liu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16609v2.pdf","comment":"Accepted by IEEE Transactions on Big Data (TBD 2024)"},{"id":"http://arxiv.org/abs/2309.02286v1","updated":"2023-09-05T14:45:54Z","published":"2023-09-05T14:45:54Z","title":"Haystack: A Panoptic Scene Graph Dataset to Evaluate Rare Predicate\n Classes","summary":" Current scene graph datasets suffer from strong long-tail distributions of\ntheir predicate classes. Due to a very low number of some predicate classes in\nthe test sets, no reliable metrics can be retrieved for the rarest classes. We\nconstruct a new panoptic scene graph dataset and a set of metrics that are\ndesigned as a benchmark for the predictive performance especially on rare\npredicate classes. To construct the new dataset, we propose a model-assisted\nannotation pipeline that efficiently finds rare predicate classes that are\nhidden in a large set of images like needles in a haystack.\n Contrary to prior scene graph datasets, Haystack contains explicit negative\nannotations, i.e. annotations that a given relation does not have a certain\npredicate class. 
Negative annotations are helpful especially in the field of\nscene graph generation and open up a whole new set of possibilities to improve\ncurrent scene graph generation models.\n Haystack is 100% compatible with existing panoptic scene graph datasets and\ncan easily be integrated with existing evaluation pipelines. Our dataset and\ncode can be found here: https://lorjul.github.io/haystack/. It includes\nannotation files and simple to use scripts and utilities, to help with\nintegrating our dataset in existing work.\n","authors":["Julian Lorenz","Florian Barthel","Daniel Kienzle","Rainer Lienhart"],"pdf_url":"https://arxiv.org/pdf/2309.02286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02285v1","updated":"2023-09-05T14:45:27Z","published":"2023-09-05T14:45:27Z","title":"PromptTTS 2: Describing and Generating Voices with Text Prompt","summary":" Speech conveys more information than just text, as the same word can be\nuttered in various voices to convey diverse information. Compared to\ntraditional text-to-speech (TTS) methods relying on speech prompts (reference\nspeech) for voice variability, using text prompts (descriptions) is more\nuser-friendly since speech prompts can be hard to find or may not exist at all.\nTTS approaches based on the text prompt face two challenges: 1) the one-to-many\nproblem, where not all details about voice variability can be described in the\ntext prompt, and 2) the limited availability of text prompt datasets, where\nvendors and large cost of data labeling are required to write text prompt for\nspeech. In this work, we introduce PromptTTS 2 to address these challenges with\na variation network to provide variability information of voice not captured by\ntext prompts, and a prompt generation pipeline to utilize the large language\nmodels (LLM) to compose high quality text prompts. Specifically, the variation\nnetwork predicts the representation extracted from the reference speech (which\ncontains full information about voice) based on the text prompt representation.\nFor the prompt generation pipeline, it generates text prompts for speech with a\nspeech understanding model to recognize voice attributes (e.g., gender, speed)\nfrom speech and a large language model to formulate text prompt based on the\nrecognition results. Experiments on a large-scale (44K hours) speech dataset\ndemonstrate that compared to the previous works, PromptTTS 2 generates voices\nmore consistent with text prompts and supports the sampling of diverse voice\nvariability, thereby offering users more choices on voice generation.\nAdditionally, the prompt generation pipeline produces high-quality prompts,\neliminating the large labeling cost. The demo page of PromptTTS 2 is available\nonline\\footnote{https://speechresearch.github.io/prompttts2}.\n","authors":["Yichong Leng","Zhifang Guo","Kai Shen","Xu Tan","Zeqian Ju","Yanqing Liu","Yufei Liu","Dongchao Yang","Leying Zhang","Kaitao Song","Lei He","Xiang-Yang Li","Sheng Zhao","Tao Qin","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2309.02285v1.pdf","comment":"Demo page: https://speechresearch.github.io/prompttts2"},{"id":"http://arxiv.org/abs/2309.02281v1","updated":"2023-09-05T14:43:10Z","published":"2023-09-05T14:43:10Z","title":"s-ID: Causal Effect Identification in a Sub-Population","summary":" Causal inference in a sub-population involves identifying the causal effect\nof an intervention on a specific subgroup within a larger population. 
However,\nignoring the subtleties introduced by sub-populations can either lead to\nerroneous inference or limit the applicability of existing methods. We\nintroduce and advocate for a causal inference problem in sub-populations\n(henceforth called s-ID), in which we merely have access to observational data\nof the targeted sub-population (as opposed to the entire population). Existing\ninference problems in sub-populations operate on the premise that the given\ndata distributions originate from the entire population, thus, cannot tackle\nthe s-ID problem. To address this gap, we provide necessary and sufficient\nconditions that must hold in the causal graph for a causal effect in a\nsub-population to be identifiable from the observational distribution of that\nsub-population. Given these conditions, we present a sound and complete\nalgorithm for the s-ID problem.\n","authors":["Amir Mohammad Abouei","Ehsan Mokhtarian","Negar Kiyavash"],"pdf_url":"https://arxiv.org/pdf/2309.02281v1.pdf","comment":"22 pages, 14 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.02274v1","updated":"2023-09-05T14:39:27Z","published":"2023-09-05T14:39:27Z","title":"A Comparison of Residual-based Methods on Fault Detection","summary":" An important initial step in fault detection for complex industrial systems\nis gaining an understanding of their health condition. Subsequently, continuous\nmonitoring of this health condition becomes crucial to observe its evolution,\ntrack changes over time, and isolate faults. As faults are typically rare\noccurrences, it is essential to perform this monitoring in an unsupervised\nmanner. Various approaches have been proposed not only to detect faults in an\nunsupervised manner but also to distinguish between different potential fault\ntypes. In this study, we perform a comprehensive comparison between two\nresidual-based approaches: autoencoders, and the input-output models that\nestablish a mapping between operating conditions and sensor readings. We\nexplore the sensor-wise residuals and aggregated residuals for the entire\nsystem in both methods. The performance evaluation focuses on three tasks:\nhealth indicator construction, fault detection, and health indicator\ninterpretation. To perform the comparison, we utilize the Commercial Modular\nAero-Propulsion System Simulation (C-MAPSS) dynamical model, specifically a\nsubset of the turbofan engine dataset containing three different fault types.\nAll models are trained exclusively on healthy data. Fault detection is achieved\nby applying a threshold that is determined based on the healthy condition. 
The\ndetection results reveal that both models are capable of detecting faults with\nan average delay of around 20 cycles and maintain a low false positive rate.\nWhile the fault detection performance is similar for both models, the\ninput-output model provides better interpretability regarding potential fault\ntypes and the possible faulty components.\n","authors":["Chi-Ching Hsu","Gaetan Frusque","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2309.02274v1.pdf","comment":"10 pages, submitted to the 15th Annual Conference of the Prognostics\n and Health Management Society"},{"id":"http://arxiv.org/abs/2309.02272v1","updated":"2023-09-05T14:37:31Z","published":"2023-09-05T14:37:31Z","title":"Graph-Based Automatic Feature Selection for Multi-Class Classification\n via Mean Simplified Silhouette","summary":" This paper introduces a novel graph-based filter method for automatic feature\nselection (abbreviated as GB-AFS) for multi-class classification tasks. The\nmethod determines the minimum combination of features required to sustain\nprediction performance while maintaining complementary discriminating abilities\nbetween different classes. It does not require any user-defined parameters such\nas the number of features to select. The methodology employs the\nJeffries-Matusita (JM) distance in conjunction with t-distributed Stochastic\nNeighbor Embedding (t-SNE) to generate a low-dimensional space reflecting how\neffectively each feature can differentiate between each pair of classes. The\nminimum number of features is selected using our newly developed Mean\nSimplified Silhouette (abbreviated as MSS) index, designed to evaluate the\nclustering results for the feature selection task. Experimental results on\npublic data sets demonstrate the superior performance of the proposed GB-AFS\nover other filter-based techniques and automatic feature selection approaches.\nMoreover, the proposed algorithm maintained the accuracy achieved when\nutilizing all features, while using only $7\\%$ to $30\\%$ of the features.\nConsequently, this resulted in a reduction of the time needed for\nclassifications, from $15\\%$ to $70\\%$.\n","authors":["David Levin","Gonen Singer"],"pdf_url":"https://arxiv.org/pdf/2309.02272v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.00367v2","updated":"2023-09-05T14:35:20Z","published":"2023-09-01T09:47:33Z","title":"Where Did the Gap Go? Reassessing the Long-Range Graph Benchmark","summary":" The recent Long-Range Graph Benchmark (LRGB, Dwivedi et al. 2022) introduced\na set of graph learning tasks strongly dependent on long-range interaction\nbetween vertices. Empirical evidence suggests that on these tasks Graph\nTransformers significantly outperform Message Passing GNNs (MPGNNs). In this\npaper, we carefully reevaluate multiple MPGNN baselines as well as the Graph\nTransformer GPS (Ramp\\'a\\v{s}ek et al. 2022) on LRGB. Through a rigorous\nempirical analysis, we demonstrate that the reported performance gap is\noverestimated due to suboptimal hyperparameter choices. It is noteworthy that\nacross multiple datasets the performance gap completely vanishes after basic\nhyperparameter optimization. In addition, we discuss the impact of lacking\nfeature normalization for LRGB's vision datasets and highlight a spurious\nimplementation of LRGB's link prediction metric. 
The principal aim of our paper\nis to establish a higher standard of empirical rigor within the graph machine\nlearning community.\n","authors":["Jan Tönshoff","Martin Ritzert","Eran Rosenbluth","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2309.00367v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12506v2","updated":"2023-09-05T14:30:14Z","published":"2022-11-22T01:48:25Z","title":"Dynamic Loss For Robust Learning","summary":" Label noise and class imbalance commonly coexist in real-world data. Previous\nworks for robust learning, however, usually address either one type of the data\nbiases and underperform when facing them both. To mitigate this gap, this work\npresents a novel meta-learning based dynamic loss that automatically adjusts\nthe objective functions with the training process to robustly learn a\nclassifier from long-tailed noisy data. Concretely, our dynamic loss comprises\na label corrector and a margin generator, which respectively correct noisy\nlabels and generate additive per-class classification margins by perceiving the\nunderlying data distribution as well as the learning state of the classifier.\nEquipped with a new hierarchical sampling strategy that enriches a small amount\nof unbiased metadata with diverse and hard samples, the two components in the\ndynamic loss are optimized jointly through meta-learning and cultivate the\nclassifier to well adapt to clean and balanced test data. Extensive experiments\nshow our method achieves state-of-the-art accuracy on multiple real-world and\nsynthetic datasets with various types of data biases, including CIFAR-10/100,\nAnimal-10N, ImageNet-LT, and Webvision. Code will soon be publicly available.\n","authors":["Shenwang Jiang","Jianan Li","Jizhou Zhang","Ying Wang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2211.12506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02253v1","updated":"2023-09-05T14:05:37Z","published":"2023-09-05T14:05:37Z","title":"MA-VAE: Multi-head Attention-based Variational Autoencoder Approach for\n Anomaly Detection in Multivariate Time-series Applied to Automotive Endurance\n Powertrain Testing","summary":" A clear need for automatic anomaly detection applied to automotive testing\nhas emerged as more and more attention is paid to the data recorded and manual\nevaluation by humans reaches its capacity. Such real-world data is massive,\ndiverse, multivariate and temporal in nature, therefore requiring modelling of\nthe testee behaviour. We propose a variational autoencoder with multi-head\nattention (MA-VAE), which, when trained on unlabelled data, not only provides\nvery few false positives but also manages to detect the majority of the\nanomalies presented. In addition to that, the approach offers a novel way to\navoid the bypass phenomenon, an undesirable behaviour investigated in\nliterature. Lastly, the approach also introduces a new method to remap\nindividual windows to a continuous time series. The results are presented in\nthe context of a real-world industrial data set and several experiments are\nundertaken to further investigate certain aspects of the proposed model. When\nconfigured properly, it is 9% of the time wrong when an anomaly is flagged and\ndiscovers 67% of the anomalies present. 
Also, MA-VAE has the potential to\nperform well with only a fraction of the training and validation subset,\nhowever, to extract it, a more sophisticated threshold estimation method is\nrequired.\n","authors":["Lucas Correia","Jan-Christoph Goos","Philipp Klein","Thomas Bäck","Anna V. Kononova"],"pdf_url":"https://arxiv.org/pdf/2309.02253v1.pdf","comment":"Accepted in NCTA2023"},{"id":"http://arxiv.org/abs/2309.00007v2","updated":"2023-09-05T14:04:14Z","published":"2023-07-27T13:18:47Z","title":"When Measures are Unreliable: Imperceptible Adversarial Perturbations\n toward Top-$k$ Multi-Label Learning","summary":" With the great success of deep neural networks, adversarial learning has\nreceived widespread attention in various studies, ranging from multi-class\nlearning to multi-label learning. However, existing adversarial attacks toward\nmulti-label learning only pursue the traditional visual imperceptibility but\nignore the new perceptible problem coming from measures such as Precision@$k$\nand mAP@$k$. Specifically, when a well-trained multi-label classifier performs\nfar below the expectation on some samples, the victim can easily realize that\nthis performance degeneration stems from attack, rather than the model itself.\nTherefore, an ideal multi-labeling adversarial attack should manage to not only\ndeceive visual perception but also evade monitoring of measures. To this end,\nthis paper first proposes the concept of measure imperceptibility. Then, a\nnovel loss function is devised to generate such adversarial perturbations that\ncould achieve both visual and measure imperceptibility. Furthermore, an\nefficient algorithm, which enjoys a convex objective, is established to\noptimize this objective. Finally, extensive experiments on large-scale\nbenchmark datasets, such as PASCAL VOC 2012, MS COCO, and NUS WIDE, demonstrate\nthe superiority of our proposed method in attacking the top-$k$ multi-label\nsystems.\n","authors":["Yuchen Sun","Qianqian Xu","Zitai Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2309.00007v2.pdf","comment":"22 pages, 7 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.02250v1","updated":"2023-09-05T13:59:50Z","published":"2023-09-05T13:59:50Z","title":"RoBoSS: A Robust, Bounded, Sparse, and Smooth Loss Function for\n Supervised Learning","summary":" In the domain of machine learning algorithms, the significance of the loss\nfunction is paramount, especially in supervised learning tasks. It serves as a\nfundamental pillar that profoundly influences the behavior and efficacy of\nsupervised learning algorithms. Traditional loss functions, while widely used,\noften struggle to handle noisy and high-dimensional data, impede model\ninterpretability, and lead to slow convergence during training. In this paper,\nwe address the aforementioned constraints by proposing a novel robust, bounded,\nsparse, and smooth (RoBoSS) loss function for supervised learning. Further, we\nincorporate the RoBoSS loss function within the framework of support vector\nmachine (SVM) and introduce a new robust algorithm named\n$\\mathcal{L}_{rbss}$-SVM. For the theoretical analysis, the\nclassification-calibrated property and generalization ability are also\npresented. These investigations are crucial for gaining deeper insights into\nthe performance of the RoBoSS loss function in the classification tasks and its\npotential to generalize well to unseen data. 
To empirically demonstrate the\neffectiveness of the proposed $\\mathcal{L}_{rbss}$-SVM, we evaluate it on $88$\nreal-world UCI and KEEL datasets from diverse domains. Additionally, to\nexemplify the effectiveness of the proposed $\\mathcal{L}_{rbss}$-SVM within the\nbiomedical realm, we evaluated it on two medical datasets: the\nelectroencephalogram (EEG) signal dataset and the breast cancer (BreaKHis)\ndataset. The numerical results substantiate the superiority of the proposed\n$\\mathcal{L}_{rbss}$-SVM model, both in terms of its remarkable generalization\nperformance and its efficiency in training time.\n","authors":["Mushir Akhtar","M. Tanveer","Mohd. Arshad"],"pdf_url":"https://arxiv.org/pdf/2309.02250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02248v1","updated":"2023-09-05T13:58:59Z","published":"2023-09-05T13:58:59Z","title":"Encoding Seasonal Climate Predictions for Demand Forecasting with\n Modular Neural Network","summary":" Current time-series forecasting problems use short-term weather attributes as\nexogenous inputs. However, in specific time-series forecasting solutions (e.g.,\ndemand prediction in the supply chain), seasonal climate predictions are\ncrucial to improve its resilience. Representing mid to long-term seasonal\nclimate forecasts is challenging as seasonal climate predictions are uncertain,\nand encoding spatio-temporal relationship of climate forecasts with demand is\ncomplex.\n We propose a novel modeling framework that efficiently encodes seasonal\nclimate predictions to provide robust and reliable time-series forecasting for\nsupply chain functions. The encoding framework enables effective learning of\nlatent representations -- be it uncertain seasonal climate prediction or other\ntime-series data (e.g., buyer patterns) -- via a modular neural network\narchitecture. Our extensive experiments indicate that learning such\nrepresentations to model seasonal climate forecast results in an error\nreduction of approximately 13\\% to 17\\% across multiple real-world data sets\ncompared to existing demand forecasting methods.\n","authors":["Smit Marvaniya","Jitendra Singh","Nicolas Galichet","Fred Ochieng Otieno","Geeth De Mel","Kommy Weldemariam"],"pdf_url":"https://arxiv.org/pdf/2309.02248v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.02243v1","updated":"2023-09-05T13:49:29Z","published":"2023-09-05T13:49:29Z","title":"Self-Similarity-Based and Novelty-based loss for music structure\n analysis","summary":" Music Structure Analysis (MSA) is the task aiming at identifying musical\nsegments that compose a music track and possibly label them based on their\nsimilarity. In this paper we propose a supervised approach for the task of\nmusic boundary detection. In our approach we simultaneously learn features and\nconvolution kernels. For this we jointly optimize -- a loss based on the\nSelf-Similarity-Matrix (SSM) obtained with the learned features, denoted by\nSSM-loss, and -- a loss based on the novelty score obtained applying the\nlearned kernels to the estimated SSM, denoted by novelty-loss. We also\ndemonstrate that relative feature learning, through self-attention, is\nbeneficial for the task of MSA. 
Finally, we compare the performances of our\napproach to previously proposed approaches on the standard RWC-Pop, and various\nsubsets of SALAMI.\n","authors":["Geoffroy Peeters"],"pdf_url":"https://arxiv.org/pdf/2309.02243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02237v1","updated":"2023-09-05T13:42:43Z","published":"2023-09-05T13:42:43Z","title":"Sample Size in Natural Language Processing within Healthcare Research","summary":" Sample size calculation is an essential step in most data-based disciplines.\nLarge enough samples ensure representativeness of the population and determine\nthe precision of estimates. This is true for most quantitative studies,\nincluding those that employ machine learning methods, such as natural language\nprocessing, where free-text is used to generate predictions and classify\ninstances of text. Within the healthcare domain, the lack of sufficient corpora\nof previously collected data can be a limiting factor when determining sample\nsizes for new studies. This paper tries to address the issue by making\nrecommendations on sample sizes for text classification tasks in the healthcare\ndomain.\n Models trained on the MIMIC-III database of critical care records from Beth\nIsrael Deaconess Medical Center were used to classify documents as having or\nnot having Unspecified Essential Hypertension, the most common diagnosis code\nin the database. Simulations were performed using various classifiers on\ndifferent sample sizes and class proportions. This was repeated for a\ncomparatively less common diagnosis code within the database of diabetes\nmellitus without mention of complication.\n Smaller sample sizes resulted in better results when using a K-nearest\nneighbours classifier, whereas larger sample sizes provided better results with\nsupport vector machines and BERT models. Overall, a sample size larger than\n1000 was sufficient to provide decent performance metrics.\n The simulations conducted within this study provide guidelines that can be\nused as recommendations for selecting appropriate sample sizes and class\nproportions, and for predicting expected performance, when building classifiers\nfor textual healthcare data. The methodology used here can be modified for\nsample size estimates calculations with other datasets.\n","authors":["Jaya Chaturvedi","Diana Shamsutdinova","Felix Zimmer","Sumithra Velupillai","Daniel Stahl","Robert Stewart","Angus Roberts"],"pdf_url":"https://arxiv.org/pdf/2309.02237v1.pdf","comment":"Submitted to Journal of Biomedical Informatics"},{"id":"http://arxiv.org/abs/2309.02236v1","updated":"2023-09-05T13:42:11Z","published":"2023-09-05T13:42:11Z","title":"Distributionally Robust Model-based Reinforcement Learning with Large\n State Spaces","summary":" Three major challenges in reinforcement learning are the complex dynamical\nsystems with large state spaces, the costly data acquisition processes, and the\ndeviation of real-world dynamics from the training environment deployment. To\novercome these issues, we study distributionally robust Markov decision\nprocesses with continuous state spaces under the widely used Kullback-Leibler,\nchi-square, and total variation uncertainty sets. We propose a model-based\napproach that utilizes Gaussian Processes and the maximum variance reduction\nalgorithm to efficiently learn multi-output nominal transition dynamics,\nleveraging access to a generative model (i.e., simulator). 
We further\ndemonstrate the statistical sample complexity of the proposed method for\ndifferent uncertainty sets. These complexity bounds are independent of the\nnumber of states and extend beyond linear dynamics, ensuring the effectiveness\nof our approach in identifying near-optimal distributionally-robust policies.\nThe proposed method can be further combined with other model-free\ndistributionally robust reinforcement learning methods to obtain a near-optimal\nrobust policy. Experimental results demonstrate the robustness of our algorithm\nto distributional shifts and its superior performance in terms of the number of\nsamples needed.\n","authors":["Shyam Sundhar Ramesh","Pier Giuseppe Sessa","Yifan Hu","Andreas Krause","Ilija Bogunovic"],"pdf_url":"https://arxiv.org/pdf/2309.02236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11127v2","updated":"2023-09-05T13:35:04Z","published":"2023-08-22T02:17:34Z","title":"How Expressive are Graph Neural Networks in Recommendation?","summary":" Graph Neural Networks (GNNs) have demonstrated superior performance on\nvarious graph learning tasks, including recommendation, where they leverage\nuser-item collaborative filtering signals in graphs. However, theoretical\nformulations of their capability are scarce, despite their empirical\neffectiveness in state-of-the-art recommender models. Recently, research has\nexplored the expressiveness of GNNs in general, demonstrating that message\npassing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that\nGNNs combined with random node initialization are universal. Nevertheless, the\nconcept of \"expressiveness\" for GNNs remains vaguely defined. Most existing\nworks adopt the graph isomorphism test as the metric of expressiveness, but\nthis graph-level task may not effectively assess a model's ability in\nrecommendation, where the objective is to distinguish nodes of different\ncloseness. In this paper, we provide a comprehensive theoretical analysis of\nthe expressiveness of GNNs in recommendation, considering three levels of\nexpressiveness metrics: graph isomorphism (graph-level), node automorphism\n(node-level), and topological closeness (link-level). We propose the\ntopological closeness metric to evaluate GNNs' ability to capture the\nstructural distance between nodes, which aligns closely with the objective of\nrecommendation. To validate the effectiveness of this new metric in evaluating\nrecommendation performance, we introduce a learning-less GNN algorithm that is\noptimal on the new metric and can be optimal on the node-level metric with\nsuitable modification. We conduct extensive experiments comparing the proposed\nalgorithm against various types of state-of-the-art GNN models to explore the\nexplainability of the new metric in the recommendation task. For\nreproducibility, implementation codes are available at\nhttps://github.com/HKUDS/GTE.\n","authors":["Xuheng Cai","Lianghao Xia","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11127v2.pdf","comment":"32nd ACM International Conference on Information and Knowledge\n Management (CIKM) 2023"},{"id":"http://arxiv.org/abs/2309.02214v1","updated":"2023-09-05T13:20:43Z","published":"2023-09-05T13:20:43Z","title":"Improving equilibrium propagation without weight symmetry through\n Jacobian homeostasis","summary":" Equilibrium propagation (EP) is a compelling alternative to the\nbackpropagation of error algorithm (BP) for computing gradients of neural\nnetworks on biological or analog neuromorphic substrates. 
Still, the algorithm\nrequires weight symmetry and infinitesimal equilibrium perturbations, i.e.,\nnudges, to estimate unbiased gradients efficiently. Both requirements are\nchallenging to implement in physical systems. Yet, whether and how weight\nasymmetry affects its applicability is unknown because, in practice, it may be\nmasked by biases introduced through the finite nudge. To address this question,\nwe study generalized EP, which can be formulated without weight symmetry, and\nanalytically isolate the two sources of bias. For complex-differentiable\nnon-symmetric networks, we show that the finite nudge does not pose a problem,\nas exact derivatives can still be estimated via a Cauchy integral. In contrast,\nweight asymmetry introduces bias resulting in low task performance due to poor\nalignment of EP's neuronal error vectors compared to BP. To mitigate this\nissue, we present a new homeostatic objective that directly penalizes\nfunctional asymmetries of the Jacobian at the network's fixed point. This\nhomeostatic objective dramatically improves the network's ability to solve\ncomplex tasks such as ImageNet 32x32. Our results lay the theoretical\ngroundwork for studying and mitigating the adverse effects of imperfections of\nphysical networks on learning algorithms that rely on the substrate's\nrelaxation dynamics.\n","authors":["Axel Laborieux","Friedemann Zenke"],"pdf_url":"https://arxiv.org/pdf/2309.02214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02211v1","updated":"2023-09-05T13:19:40Z","published":"2023-09-05T13:19:40Z","title":"Distributionally Robust Machine Learning with Multi-source Data","summary":" Classical machine learning methods may lead to poor prediction performance\nwhen the target distribution differs from the source populations. This paper\nutilizes data from multiple sources and introduces a group distributionally\nrobust prediction model defined to optimize an adversarial reward about\nexplained variance with respect to a class of target distributions. Compared to\nclassical empirical risk minimization, the proposed robust prediction model\nimproves the prediction accuracy for target populations with distribution\nshifts. We show that our group distributionally robust prediction model is a\nweighted average of the source populations' conditional outcome models. We\nleverage this key identification result to robustify arbitrary machine learning\nalgorithms, including, for example, random forests and neural networks. We\ndevise a novel bias-corrected estimator to estimate the optimal aggregation\nweight for general machine-learning algorithms and demonstrate its improvement\nin the convergence rate. Our proposal can be seen as a distributionally robust\nfederated learning approach that is computationally efficient and easy to\nimplement using arbitrary machine learning base algorithms, satisfies some\nprivacy constraints, and has a nice interpretation of different sources'\nimportance for predicting a given target covariate distribution. 
We demonstrate\nthe performance of our proposed group distributionally robust method on\nsimulated and real data with random forests and neural networks as\nbase-learning algorithms.\n","authors":["Zhenyu Wang","Peter Bühlmann","Zijian Guo"],"pdf_url":"https://arxiv.org/pdf/2309.02211v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02206v1","updated":"2023-09-05T13:11:40Z","published":"2023-09-05T13:11:40Z","title":"Language Models for Novelty Detection in System Call Traces","summary":" Due to the complexity of modern computer systems, novel and unexpected\nbehaviors frequently occur. Such deviations are either normal occurrences, such\nas software updates and new user activities, or abnormalities, such as\nmisconfigurations, latency issues, intrusions, and software bugs. Regardless,\nnovel behaviors are of great interest to developers, and there is a genuine\nneed for efficient and effective methods to detect them. Nowadays, researchers\nconsider system calls to be the most fine-grained and accurate source of\ninformation to investigate the behavior of computer systems. Accordingly, this\npaper introduces a novelty detection methodology that relies on a probability\ndistribution over sequences of system calls, which can be seen as a language\nmodel. Language models estimate the likelihood of sequences, and since\nnovelties deviate from previously observed behaviors by definition, they would\nbe unlikely under the model. Following the success of neural networks for\nlanguage models, three architectures are evaluated in this work: the widespread\nLSTM, the state-of-the-art Transformer, and the lower-complexity Longformer.\nHowever, large neural networks typically require an enormous amount of data to\nbe trained effectively, and to the best of our knowledge, no massive modern\ndatasets of kernel traces are publicly available. This paper addresses this\nlimitation by introducing a new open-source dataset of kernel traces comprising\nover 2 million web requests with seven distinct behaviors. The proposed\nmethodology requires minimal expert hand-crafting and achieves an F-score and\nAuROC greater than 95% on most novelties while being data- and task-agnostic.\nThe source code and trained models are publicly available on GitHub while the\ndatasets are available on Zenodo.\n","authors":["Quentin Fournier","Daniel Aloise","Leandro R. Costa"],"pdf_url":"https://arxiv.org/pdf/2309.02206v1.pdf","comment":"12 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.02202v1","updated":"2023-09-05T13:07:25Z","published":"2023-09-05T13:07:25Z","title":"On the Complexity of Differentially Private Best-Arm Identification with\n Fixed Confidence","summary":" Best Arm Identification (BAI) problems are progressively used for\ndata-sensitive applications, such as designing adaptive clinical trials, tuning\nhyper-parameters, and conducting user studies to name a few. Motivated by the\ndata privacy concerns invoked by these applications, we study the problem of\nBAI with fixed confidence under $\\epsilon$-global Differential Privacy (DP).\nFirst, to quantify the cost of privacy, we derive a lower bound on the sample\ncomplexity of any $\\delta$-correct BAI algorithm satisfying $\\epsilon$-global\nDP. Our lower bound suggests the existence of two privacy regimes depending on\nthe privacy budget $\\epsilon$. 
In the high-privacy regime (small $\\epsilon$),\nthe hardness depends on a coupled effect of privacy and a novel\ninformation-theoretic quantity, called the Total Variation Characteristic Time.\nIn the low-privacy regime (large $\\epsilon$), the sample complexity lower bound\nreduces to the classical non-private lower bound. Second, we propose AdaP-TT,\nan $\\epsilon$-global DP variant of the Top Two algorithm. AdaP-TT runs in\narm-dependent adaptive episodes and adds Laplace noise to ensure a good\nprivacy-utility trade-off. We derive an asymptotic upper bound on the sample\ncomplexity of AdaP-TT that matches with the lower bound up to multiplicative\nconstants in the high-privacy regime. Finally, we provide an experimental\nanalysis of AdaP-TT that validates our theoretical results.\n","authors":["Achraf Azize","Marc Jourdan","Aymen Al Marjani","Debabrota Basu"],"pdf_url":"https://arxiv.org/pdf/2309.02202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02195v1","updated":"2023-09-05T12:56:35Z","published":"2023-09-05T12:56:35Z","title":"Sparse Function-space Representation of Neural Networks","summary":" Deep neural networks (NNs) are known to lack uncertainty estimates and\nstruggle to incorporate new data. We present a method that mitigates these\nissues by converting NNs from weight space to function space, via a dual\nparameterization. Importantly, the dual parameterization enables us to\nformulate a sparse representation that captures information from the entire\ndata set. This offers a compact and principled way of capturing uncertainty and\nenables us to incorporate new data without retraining whilst retaining\npredictive performance. We provide proof-of-concept demonstrations with the\nproposed approach for quantifying uncertainty in supervised learning on UCI\nbenchmark tasks.\n","authors":["Aidan Scannell","Riccardo Mereu","Paul Chang","Ella Tamir","Joni Pajarinen","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2309.02195v1.pdf","comment":"Accepted to ICML 2023 Workshop on Duality for Modern Machine\n Learning, Honolulu, Hawaii, USA. 4 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.02193v1","updated":"2023-09-05T12:54:40Z","published":"2023-09-05T12:54:40Z","title":"Personalized Federated Deep Reinforcement Learning-based Trajectory\n Optimization for Multi-UAV Assisted Edge Computing","summary":" In the era of 5G mobile communication, there has been a significant surge in\nresearch focused on unmanned aerial vehicles (UAVs) and mobile edge computing\ntechnology. UAVs can serve as intelligent servers in edge computing\nenvironments, optimizing their flight trajectories to maximize communication\nsystem throughput. Deep reinforcement learning (DRL)-based trajectory\noptimization algorithms may suffer from poor training performance due to\nintricate terrain features and inadequate training data. To overcome this\nlimitation, some studies have proposed leveraging federated learning (FL) to\nmitigate the data isolation problem and expedite convergence. Nevertheless, the\nefficacy of global FL models can be negatively impacted by the high\nheterogeneity of local data, which could potentially impede the training\nprocess and even compromise the performance of local agents. 
This work proposes\na novel solution to address these challenges, namely personalized federated\ndeep reinforcement learning (PF-DRL), for multi-UAV trajectory optimization.\nPF-DRL aims to develop individualized models for each agent to address the data\nscarcity issue and mitigate the negative impact of data heterogeneity.\nSimulation results demonstrate that the proposed algorithm achieves superior\ntraining performance with faster convergence rates, and improves service\nquality compared to other DRL-based approaches.\n","authors":["Zhengrong Song","Chuan Ma","Ming Ding","Howard H. Yang","Yuwen Qian","Xiangwei Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.02193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02189v1","updated":"2023-09-05T12:48:21Z","published":"2023-09-05T12:48:21Z","title":"Leveraging BERT Language Models for Multi-Lingual ESG Issue\n Identification","summary":" Environmental, Social, and Governance (ESG) has been used as a metric to\nmeasure the negative impacts and enhance positive outcomes of companies in\nareas such as the environment, society, and governance. Recently, investors\nhave increasingly recognized the significance of ESG criteria in their\ninvestment choices, leading businesses to integrate ESG principles into their\noperations and strategies. The Multi-Lingual ESG Issue Identification (ML-ESG)\nshared task encompasses the classification of news documents into 35 distinct\nESG issue labels. In this study, we explored multiple strategies harnessing\nBERT language models to achieve accurate classification of news documents\nacross these labels. Our analysis revealed that the RoBERTa classifier emerged\nas one of the most successful approaches, securing the second-place position\nfor the English test dataset, and sharing the fifth-place position for the\nFrench test dataset. Furthermore, our SVM-based binary model tailored for the\nChinese language exhibited exceptional performance, earning the second-place\nrank on the test dataset.\n","authors":["Elvys Linhares Pontes","Mohamed Benjannet","Lam Kim Ming"],"pdf_url":"https://arxiv.org/pdf/2309.02189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11509v5","updated":"2023-09-05T12:31:03Z","published":"2023-05-19T08:18:49Z","title":"From Random Search to Bandit Learning in Metric Measure Spaces","summary":" Random Search is one of the most widely-used method for Hyperparameter\nOptimization, and is critical to the success of deep learning models. Despite\nits astonishing performance, little non-heuristic theory has been developed to\ndescribe the underlying working mechanism. This paper gives a theoretical\naccounting of Random Search. We introduce the concept of scattering dimension\nthat describes the landscape of the underlying function, and quantifies the\nperformance of random search. We show that, when the environment is noise-free,\nthe output of random search converges to the optimal value in probability at\nrate $ \\widetilde{\\mathcal{O}} \\left( \\left( \\frac{1}{T} \\right)^{\n\\frac{1}{d_s} } \\right) $, where $ d_s \\ge 0 $ is the scattering dimension of\nthe underlying function. When the observed function values are corrupted by\nbounded $iid$ noise, the output of random search converges to the optimal value\nin probability at rate $ \\widetilde{\\mathcal{O}} \\left( \\left( \\frac{1}{T}\n\\right)^{ \\frac{1}{d_s + 1} } \\right) $. 
In addition, based on the principles\nof random search, we introduce an algorithm, called BLiN-MOS, for Lipschitz\nbandits in doubling metric spaces that are also endowed with a probability\nmeasure, and show that under certain conditions, BLiN-MOS achieves a regret\nrate of order $ \\widetilde{\\mathcal{O}} \\left( T^{ \\frac{d_z}{d_z + 1} }\n\\right) $, where $d_z$ is the zooming dimension of the problem instance.\n","authors":["Chuying Han","Yasong Feng","Tianyu Wang"],"pdf_url":"https://arxiv.org/pdf/2305.11509v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11048v5","updated":"2023-09-05T12:10:51Z","published":"2022-06-22T13:12:54Z","title":"Automated GI tract segmentation using deep learning","summary":" The job of radiation oncologists is to deliver x-ray beams pointed toward the\ntumor and at the same time avoid the stomach and intestines. With MR-Linacs\n(magnetic resonance imaging and linear accelerator systems), oncologists can\nvisualize the position of the tumor and deliver a precise dose according to\ntumor cell presence, which can vary from day to day. The current workflow\nrequires manually outlining the position of the stomach and intestines to\nadjust the X-ray beam direction so that the dose is delivered to the tumor\nwhile avoiding these organs. This is\na time-consuming and labor-intensive process that can easily prolong treatments\nfrom 15 minutes to an hour a day unless deep learning methods can automate the\nsegmentation process. This paper discusses an automated segmentation process\nusing deep learning to make this process faster and allow more patients to get\neffective treatment.\n","authors":["Manhar Sharma"],"pdf_url":"https://arxiv.org/pdf/2206.11048v5.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02160v1","updated":"2023-09-05T11:55:03Z","published":"2023-09-05T11:55:03Z","title":"Bias Propagation in Federated Learning","summary":" We show that participating in federated learning can be detrimental to group\nfairness. In fact, the bias of a few parties against under-represented groups\n(identified by sensitive attributes such as gender or race) can propagate\nthrough the network to all the parties in the network. We analyze and explain\nbias propagation in federated learning on naturally partitioned real-world\ndatasets. Our analysis reveals that biased parties unintentionally yet\nstealthily encode their bias in a small number of model parameters, and\nthroughout the training, they steadily increase the dependence of the global\nmodel on sensitive attributes. What is important to highlight is that the\nexperienced bias in federated learning is higher than what parties would\notherwise encounter in centralized training with a model trained on the union\nof all their data. This indicates that the bias is due to the algorithm. Our\nwork calls for auditing group fairness in federated learning and designing\nlearning algorithms that are robust to bias propagation.\n","authors":["Hongyan Chang","Reza Shokri"],"pdf_url":"https://arxiv.org/pdf/2309.02160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02157v1","updated":"2023-09-05T11:49:33Z","published":"2023-09-05T11:49:33Z","title":"Model-based Offline Policy Optimization with Adversarial Network","summary":" Model-based offline reinforcement learning (RL), which builds a supervised\ntransition model with a logging dataset to avoid costly interactions with the\nonline environment, has been a promising approach for offline policy\noptimization. 
As the discrepancy between the logging data and online\nenvironment may result in a distributional shift problem, many prior works have\nstudied how to build robust transition models conservatively and estimate the\nmodel uncertainty accurately. However, the over-conservatism can limit the\nexploration of the agent, and the uncertainty estimates may be unreliable. In\nthis work, we propose a novel Model-based Offline policy optimization framework\nwith Adversarial Network (MOAN). The key idea is to use adversarial learning to\nbuild a transition model with better generalization, where an adversary is\nintroduced to distinguish between in-distribution and out-of-distribution\nsamples. Moreover, the adversary can naturally provide a quantification of the\nmodel's uncertainty with theoretical guarantees. Extensive experiments showed\nthat our approach outperforms existing state-of-the-art baselines on widely\nstudied offline RL benchmarks. It can also generate diverse in-distribution\nsamples, and quantify the uncertainty more accurately.\n","authors":["Junming Yang","Xingguo Chen","Shengyuan Wang","Bolei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02157v1.pdf","comment":"Accepted by 26th European Conference on Artificial Intelligence ECAI\n 2023"},{"id":"http://arxiv.org/abs/2205.04449v2","updated":"2023-09-05T11:42:07Z","published":"2022-05-09T17:51:44Z","title":"Introspective Deep Metric Learning for Image Retrieval","summary":" This paper proposes an introspective deep metric learning (IDML) framework\nfor uncertainty-aware comparisons of images. Conventional deep metric learning\nmethods produce confident semantic distances between images regardless of the\nuncertainty level. However, we argue that a good similarity model should\nconsider the semantic discrepancies with caution to better deal with ambiguous\nimages for more robust training. To achieve this, we propose to represent an\nimage using not only a semantic embedding but also an accompanying uncertainty\nembedding, which describes the semantic characteristics and ambiguity of an\nimage, respectively. We further propose an introspective similarity metric to\nmake similarity judgments between images considering both their semantic\ndifferences and ambiguities. The proposed IDML framework improves the\nperformance of deep metric learning through uncertainty modeling and attains\nstate-of-the-art results on the widely used CUB-200-2011, Cars196, and Stanford\nOnline Products datasets for image retrieval and clustering. We further provide\nan in-depth analysis of our framework to demonstrate the effectiveness and\nreliability of IDML. Code is available at: https://github.com/wzzheng/IDML.\n","authors":["Wenzhao Zheng","Chengkun Wang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2205.04449v2.pdf","comment":"The extended version of this paper is accepted to T-PAMI. Source code\n available at https://github.com/wzzheng/IDML"},{"id":"http://arxiv.org/abs/2309.02144v1","updated":"2023-09-05T11:32:48Z","published":"2023-09-05T11:32:48Z","title":"Making Large Language Models Better Reasoners with Alignment","summary":" Reasoning is a cognitive process of using evidence to reach a sound\nconclusion. 
The reasoning capability is essential for large language models\n(LLMs) to serve as the brain of the artificial general intelligence agent.\nRecent studies reveal that fine-tuning LLMs on data with the chain of thought\n(COT) reasoning process can significantly enhance their reasoning capabilities.\nHowever, we find that the fine-tuned LLMs suffer from an \\textit{Assessment\nMisalignment} problem, i.e., they frequently assign higher scores to subpar\nCOTs, leading to potential limitations in their reasoning abilities. To address\nthis problem, we introduce an \\textit{Alignment Fine-Tuning (AFT)} paradigm,\nwhich involves three steps: 1) fine-tuning LLMs with COT training data; 2)\ngenerating multiple COT responses for each question, and categorizing them into\npositive and negative ones based on whether they achieve the correct answer; 3)\ncalibrating the scores of positive and negative responses given by LLMs with a\nnovel constraint alignment loss. Specifically, the constraint alignment loss\nhas two objectives: a) Alignment, which guarantees that positive scores surpass\nnegative scores to encourage answers with high-quality COTs; b) Constraint,\nwhich keeps the negative scores confined to a reasonable range to prevent the\nmodel degradation. Beyond just the binary positive and negative feedback, the\nconstraint alignment loss can be seamlessly adapted to the ranking situations\nwhen ranking feedback is accessible. Furthermore, we also delve deeply into\nrecent ranking-based alignment methods, such as DPO, RRHF, and PRO, and\ndiscover that the constraint, which has been overlooked by these approaches, is\nalso crucial for their performance. Extensive experiments on four reasoning\nbenchmarks with both binary and ranking feedback demonstrate the effectiveness\nof AFT.\n","authors":["Peiyi Wang","Lei Li","Liang Chen","Feifan Song","Binghuai Lin","Yunbo Cao","Tianyu Liu","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2309.02144v1.pdf","comment":"Large Language Models; Reasoning; Alignment"},{"id":"http://arxiv.org/abs/2309.02140v1","updated":"2023-09-05T11:30:38Z","published":"2023-09-05T11:30:38Z","title":"A Lightweight, Rapid and Efficient Deep Convolutional Network for Chest\n X-Ray Tuberculosis Detection","summary":" Tuberculosis (TB) is still recognized as one of the leading causes of death\nworldwide. Recent advances in deep learning (DL) have shown to enhance\nradiologists' ability to interpret chest X-ray (CXR) images accurately and with\nfewer errors, leading to a better diagnosis of this disease. However, little\nwork has been done to develop models capable of diagnosing TB that offer good\nperformance while being efficient, fast and computationally inexpensive. In\nthis work, we propose LightTBNet, a novel lightweight, fast and efficient deep\nconvolutional network specially customized to detect TB from CXR images. Using\na total of 800 frontal CXR images from two publicly available datasets, our\nsolution yielded an accuracy, F1 and area under the ROC curve (AUC) of 0.906,\n0.907 and 0.961, respectively, on an independent test subset. The proposed\nmodel demonstrates outstanding performance while delivering a rapid prediction,\nwith minimal computational and memory requirements, making it highly suitable\nfor deployment in handheld devices that can be used in low-resource areas with\nhigh TB prevalence. Code publicly available at\nhttps://github.com/dani-capellan/LightTBNet.\n","authors":["Daniel Capellán-Martín","Juan J. Gómez-Valverde","David Bermejo-Peláez","María J. 
Ledesma-Carbayo"],"pdf_url":"https://arxiv.org/pdf/2309.02140v1.pdf","comment":"5 pages, 3 figures, 3 tables. This paper has been accepted at ISBI\n 2023"},{"id":"http://arxiv.org/abs/2309.02138v1","updated":"2023-09-05T11:29:25Z","published":"2023-09-05T11:29:25Z","title":"Generalized Simplicial Attention Neural Networks","summary":" The aim of this work is to introduce Generalized Simplicial Attention Neural\nNetworks (GSANs), i.e., novel neural architectures designed to process data\ndefined on simplicial complexes using masked self-attentional layers. Hinging\non topological signal processing principles, we devise a series of\nself-attention schemes capable of processing data components defined at\ndifferent simplicial orders, such as nodes, edges, triangles, and beyond. These\nschemes learn how to weight the neighborhoods of the given topological domain\nin a task-oriented fashion, leveraging the interplay among simplices of\ndifferent orders through the Dirac operator and its Dirac decomposition. We\nalso theoretically establish that GSANs are permutation equivariant and\nsimplicial-aware. Finally, we illustrate how our approach compares favorably\nwith other methods when applied to several (inductive and transductive) tasks\nsuch as trajectory prediction, missing data imputation, graph classification,\nand simplex prediction.\n","authors":["Claudio Battiloro","Lucia Testa","Lorenzo Giusti","Stefania Sardellitti","Paolo Di Lorenzo","Sergio Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2309.02138v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2203.07485"},{"id":"http://arxiv.org/abs/2308.06733v2","updated":"2023-09-05T11:18:56Z","published":"2023-08-13T09:51:16Z","title":"Precipitation nowcasting with generative diffusion models","summary":" In recent years traditional numerical methods for accurate weather prediction\nhave been increasingly challenged by deep learning methods. Numerous historical\ndatasets used for short and medium-range weather forecasts are typically\norganized into a regular spatial grid structure. This arrangement closely\nresembles images: each weather variable can be visualized as a map or, when\nconsidering the temporal axis, as a video. Several classes of generative\nmodels, comprising Generative Adversarial Networks, Variational Autoencoders,\nor the recent Denoising Diffusion Models have largely proved their\napplicability to the next-frame prediction problem, and is thus natural to test\ntheir performance on the weather prediction benchmarks. Diffusion models are\nparticularly appealing in this context, due to the intrinsically probabilistic\nnature of weather forecasting: what we are really interested to model is the\nprobability distribution of weather indicators, whose expected value is the\nmost likely prediction.\n In our study, we focus on a specific subset of the ERA-5 dataset, which\nincludes hourly data pertaining to Central Europe from the years 2016 to 2021.\nWithin this context, we examine the efficacy of diffusion models in handling\nthe task of precipitation nowcasting. Our work is conducted in comparison to\nthe performance of well-established U-Net models, as documented in the existing\nliterature. Our proposed approach of Generative Ensemble Diffusion (GED)\nutilizes a diffusion model to generate a set of possible weather scenarios\nwhich are then amalgamated into a probable prediction via the use of a\npost-processing network. 
This approach substantially outperformed recent deep learning\nmodels in terms of overall performance.\n","authors":["Andrea Asperti","Fabio Merizzi","Alberto Paparella","Giorgio Pedrazzi","Matteo Angelinelli","Stefano Colamonaco"],"pdf_url":"https://arxiv.org/pdf/2308.06733v2.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.02130v1","updated":"2023-09-05T11:16:47Z","published":"2023-09-05T11:16:47Z","title":"A Simple Asymmetric Momentum Make SGD Greatest Again","summary":" We propose the simplest SGD enhanced method ever, Loss-Controlled Asymmetric\nMomentum (LCAM), aimed directly at the Saddle Point problem. Compared to the\ntraditional SGD with Momentum, there's no increase in computational demand, yet\nit outperforms all current optimizers. We use the concepts of weight\nconjugation and traction effect to explain this phenomenon. We designed\nexperiments to rapidly reduce the learning rate at specified epochs to trap\nparameters more easily at saddle points. We selected WRN28-10 as the test\nnetwork and chose cifar10 and cifar100 as test datasets, the same group used in\nthe original papers of WRN and Cosine Annealing Scheduling (CAS). We compared the\nability of Asymmetric Momentum with different priorities to bypass saddle\npoints. Finally, using WRN28-10 on Cifar100, we achieved a peak average\ntest accuracy of 80.78\\% around epoch 120. For comparison, the original WRN\npaper reported 80.75\\%, while CAS was at 80.42\\%, all at 200 epochs. This means\nthat while potentially increasing accuracy, we use nearly half the convergence\ntime. Our demonstration code is available at\nhttps://github.com/hakumaicc/Asymmetric-Momentum-LCAM\n","authors":["Gongyue Zhang","Dinghuang Zhang","Shuwen Zhao","Donghan Liu","Carrie M. Toptan","Honghai Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.03183v2","updated":"2023-09-05T11:11:14Z","published":"2023-01-09T06:32:11Z","title":"Minimax Weight Learning for Absorbing MDPs","summary":" Reinforcement learning policy evaluation problems are often modeled as finite\nor discounted/averaged infinite-horizon MDPs. In this paper, we study\nundiscounted off-policy policy evaluation for absorbing MDPs. Given the dataset\nconsisting of the i.i.d. episodes with a given truncation level, we propose a\nso-called MWLA algorithm to directly estimate the expected return via the\nimportance ratio of the state-action occupancy measure. The Mean Square Error\n(MSE) bound for the MWLA method is investigated and the dependence of\nstatistical errors on the data size and the truncation level is analyzed. With\nan episodic taxi environment, computational experiments illustrate the\nperformance of the MWLA algorithm.\n","authors":["Fengyin Li","Yuqiang Li","Xianyi Wu"],"pdf_url":"https://arxiv.org/pdf/2301.03183v2.pdf","comment":"36 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02124v1","updated":"2023-09-05T11:01:30Z","published":"2023-09-05T11:01:30Z","title":"Exploiting Spatial-temporal Data for Sleep Stage Classification via\n Hypergraph Learning","summary":" Sleep stage classification is crucial for detecting patients' health\nconditions. 
Existing models, which mainly use Convolutional Neural Networks\n(CNN) for modelling Euclidean data and Graph Convolution Networks (GNN) for\nmodelling non-Euclidean data, are unable to consider the heterogeneity and\ninteractivity of multimodal data as well as the spatial-temporal correlation\nsimultaneously, which hinders further improvement of classification\nperformance. In this paper, we propose a dynamic learning framework STHL, which\nintroduces hypergraphs to encode spatial-temporal data for sleep stage\nclassification. Hypergraphs can capture multi-modal/multi-type data instead\nof using simple pairwise connections between two subjects. STHL creates spatial and\ntemporal hyperedges separately to build node correlations, then it conducts a\ntype-specific hypergraph learning process to encode the attributes into the\nembedding space. Extensive experiments show that our proposed STHL outperforms\nthe state-of-the-art models in sleep stage classification tasks.\n","authors":["Yuze Liu","Ziming Zhao","Tiehua Zhang","Kang Wang","Xin Chen","Xiaowei Huang","Jun Yin","Zhishu Shen"],"pdf_url":"https://arxiv.org/pdf/2309.02124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.10587v2","updated":"2023-09-05T10:46:44Z","published":"2022-02-17T06:18:02Z","title":"Knowledge-informed Molecular Learning: A Survey on Paradigm Transfer","summary":" Machine learning, notably deep learning, has significantly propelled\nmolecular investigations within the biochemical sphere. Traditionally, modeling\nfor such research has centered around a handful of paradigms. For instance, the\nprediction paradigm is frequently deployed for tasks such as molecular property\nprediction. To enhance the generation and decipherability of purely data-driven\nmodels, scholars have integrated biochemical domain knowledge into these\nmolecular study models. This integration has sparked a surge in paradigm\ntransfer, which involves solving one molecular learning task by reformulating it as\nanother one. With the emergence of Large Language Models, these paradigms have\ndemonstrated an escalating trend towards harmonized unification. In this work,\nwe delineate a literature survey focused on knowledge-informed molecular\nlearning from the perspective of paradigm transfer. We classify the paradigms,\nscrutinize their methodologies, and dissect the contribution of domain\nknowledge. Moreover, we encapsulate prevailing trends and identify intriguing\navenues for future exploration in molecular learning.\n","authors":["Yin Fang","Zhuo Chen","Xiaohui Fan","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2202.10587v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.02106v1","updated":"2023-09-05T10:26:32Z","published":"2023-09-05T10:26:32Z","title":"Leveraging Label Information for Multimodal Emotion Recognition","summary":" Multimodal emotion recognition (MER) aims to detect the emotional status of a\ngiven expression by combining the speech and text information. Intuitively,\nlabel information should be capable of helping the model locate the salient\ntokens/frames relevant to the specific emotion, which finally facilitates the\nMER task. Inspired by this, we propose a novel approach for MER by leveraging\nlabel information. Specifically, we first obtain the representative label\nembeddings for both text and speech modalities, then learn the label-enhanced\ntext/speech representations for each utterance via label-token and label-frame\ninteractions. 
Finally, we devise a novel label-guided attentive fusion module\nto fuse the label-aware text and speech representations for emotion\nclassification. Extensive experiments were conducted on the public IEMOCAP\ndataset, and experimental results demonstrate that our proposed approach\noutperforms existing baselines and achieves new state-of-the-art performance.\n","authors":["Peiying Wang","Sunlu Zeng","Junqing Chen","Lu Fan","Meng Chen","Youzheng Wu","Xiaodong He"],"pdf_url":"https://arxiv.org/pdf/2309.02106v1.pdf","comment":"Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2309.02102v1","updated":"2023-09-05T10:21:37Z","published":"2023-09-05T10:21:37Z","title":"Iterative Superquadric Recomposition of 3D Objects from Multiple Views","summary":" Humans are good at recomposing novel objects, i.e. they can identify\ncommonalities between unknown objects from general structure to finer detail,\nan ability difficult to replicate by machines. We propose a framework, ISCO, to\nrecompose an object using 3D superquadrics as semantic parts directly from 2D\nviews without training a model that uses 3D supervision. To achieve this, we\noptimize the superquadric parameters that compose a specific instance of the\nobject, comparing its rendered 3D view and 2D image silhouette. Our ISCO\nframework iteratively adds new superquadrics wherever the reconstruction error\nis high, abstracting first coarse regions and then finer details of the target\nobject. With this simple coarse-to-fine inductive bias, ISCO provides\nconsistent superquadrics for related object parts, despite not having any\nsemantic supervision. Since ISCO does not train any neural network, it is also\ninherently robust to out-of-distribution objects. Experiments show that,\ncompared to recent single instance superquadrics reconstruction approaches,\nISCO provides consistently more accurate 3D reconstructions, even from images\nin the wild. Code available at https://github.com/ExplainableML/ISCO .\n","authors":["Stephan Alaniz","Massimiliano Mancini","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2309.02102v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02094v1","updated":"2023-09-05T10:00:33Z","published":"2023-09-05T10:00:33Z","title":"TensorBank:Tensor Lakehouse for Foundation Model Training","summary":" Storing and streaming high dimensional data for foundation model training\nbecame a critical requirement with the rise of foundation models beyond natural\nlanguage. In this paper we introduce TensorBank, a petabyte scale tensor\nlakehouse capable of streaming tensors from Cloud Object Store (COS) to GPU\nmemory at wire speed based on complex relational queries. We use Hierarchical\nStatistical Indices (HSI) for query acceleration. Our architecture allows to\ndirectly address tensors on block level using HTTP range reads. Once in GPU\nmemory, data can be transformed using PyTorch transforms. We provide a generic\nPyTorch dataset type with a corresponding dataset factory translating\nrelational queries and requested transformations as an instance. By making use\nof the HSI, irrelevant blocks can be skipped without reading them as those\nindices contain statistics on their content at different hierarchical\nresolution levels. This is an opinionated architecture powered by open\nstandards and making heavy use of open-source technology. 
Although hardened\nfor production use with geospatial-temporal data, this architecture\ngeneralizes to other use cases like computer vision, computational neuroscience,\nbiological sequence analysis and more.\n","authors":["Romeo Kienzler","Benedikt Blumenstiel","Zoltan Arnold Nagy","S. Karthik Mukkavilli","Johannes Schmude","Marcus Freitag","Michael Behrendt","Daniel Salles Civitarese","Hendrik Hamann"],"pdf_url":"https://arxiv.org/pdf/2309.02094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05029v2","updated":"2023-09-05T09:43:02Z","published":"2023-06-08T08:29:10Z","title":"Multi-level Multiple Instance Learning with Transformer for Whole Slide\n Image Classification","summary":" Whole slide image (WSI) refers to a type of high-resolution scanned tissue\nimage, which is extensively employed in computer-assisted diagnosis (CAD). The\nextremely high resolution and limited availability of region-level annotations\nmake employing deep learning methods for WSI-based digital diagnosis\nchallenging. Recently, integrating multiple instance learning (MIL) and\nTransformer for WSI analysis has shown very promising results. However, designing\neffective Transformers for this weakly-supervised high-resolution image\nanalysis is an underexplored yet important problem. In this paper, we propose a\nMulti-level MIL (MMIL) scheme by introducing a hierarchical structure to MIL,\nwhich enables efficient handling of MIL tasks involving a large number of\ninstances. Based on MMIL, we instantiated MMIL-Transformer, an efficient\nTransformer model with windowed exact self-attention for large-scale MIL tasks.\nTo validate its effectiveness, we conducted a set of experiments on WSI\nclassification tasks, where MMIL-Transformer demonstrates superior performance\ncompared to existing state-of-the-art methods, i.e., 96.80% test AUC and 97.67%\ntest accuracy on the CAMELYON16 dataset, 99.04% test AUC and 94.37% test\naccuracy on the TCGA-NSCLC dataset, respectively. All code and pre-trained\nmodels are available at: https://github.com/hustvl/MMIL-Transformer\n","authors":["Ruijie Zhang","Qiaozhe Zhang","Yingzhuang Liu","Hao Xin","Yan Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.05029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02084v1","updated":"2023-09-05T09:42:15Z","published":"2023-09-05T09:42:15Z","title":"An Efficient Approach to Unsupervised Out-of-Distribution Detection with\n Variational Autoencoders","summary":" This paper is concerned with deep generative models (DGMs) for unsupervised\nout-of-distribution (OOD) detection. In particular, we focus on vanilla\nVariational Autoencoders (VAE) that use a standard normal prior distribution\nfor the latent variables. These models have a smaller model size, enabling\nfaster training and inference, making them well-suited for resource-limited\napplications compared to more complex DGMs. We propose a novel OOD score called\nError Reduction (ER) specifically designed for vanilla VAE. ER incorporates the\nidea of reconstructing image inputs from their lossy counterparts and takes\ninto account the Kolmogorov complexity of the images. Experimental results on\ndiverse datasets demonstrate the superiority of our approach over baseline\nmethods. 
Our code is available at: https://github.com/ZJLAB-AMMI/VAE4OOD.\n","authors":["Zezhen Zeng","Bin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02084v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2105.13859v4","updated":"2023-09-05T09:41:09Z","published":"2021-05-28T14:12:45Z","title":"Generative Network-Based Reduced-Order Model for Prediction, Data\n Assimilation and Uncertainty Quantification","summary":" We propose a new method in which a generative network (GN) integrated into a\nreduced-order model (ROM) framework is used to solve inverse problems for\npartial differential equations (PDE). The aim is to match available\nmeasurements and estimate the corresponding uncertainties associated with the\nstates and parameters of a numerical physical simulation. The GN is trained\nusing only unconditional simulations of the discretized PDE model. We compare\nthe proposed method with the gold standard Markov chain Monte Carlo. We apply\nthe proposed approaches to a spatio-temporal compartmental model in\nepidemiology. The results show that the proposed GN-based ROM can efficiently\nquantify uncertainty and accurately match the measurements and the gold\nstandard, using only a few unconditional simulations of the full-order\nnumerical PDE model.\n","authors":["Vinicius L. S. Silva","Claire E. Heaney","Nenko Nenov","Christopher C. Pain"],"pdf_url":"https://arxiv.org/pdf/2105.13859v4.pdf","comment":"arXiv admin note: text overlap with arXiv:2105.07729"},{"id":"http://arxiv.org/abs/2305.06044v2","updated":"2023-09-05T09:33:43Z","published":"2023-05-10T10:52:30Z","title":"Correlation visualization under missing values: a comparison between\n imputation and direct parameter estimation methods","summary":" Correlation matrix visualization is essential for understanding the\nrelationships between variables in a dataset, but missing data can pose a\nsignificant challenge in estimating correlation coefficients. In this paper, we\ncompare the effects of various missing data methods on the correlation plot,\nfocusing on two common missing patterns: random and monotone. We aim to provide\npractical strategies and recommendations for researchers and practitioners in\ncreating and analyzing the correlation plot. Our experimental results suggest\nthat while imputation is commonly used for missing data, using imputed data for\nplotting the correlation matrix may lead to a significantly misleading\ninference of the relation between the features. We recommend using DPER, a\ndirect parameter estimation approach, for plotting the correlation matrix based\non its performance in the experiments.\n","authors":["Nhat-Hao Pham","Khanh-Linh Vo","Mai Anh Vu","Thu Nguyen","Michael A. Riegler","Pål Halvorsen","Binh T. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2305.06044v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02071v1","updated":"2023-09-05T09:18:29Z","published":"2023-09-05T09:18:29Z","title":"BeeTLe: A Framework for Linear B-Cell Epitope Prediction and\n Classification","summary":" The process of identifying and characterizing B-cell epitopes, which are the\nportions of antigens recognized by antibodies, is important for our\nunderstanding of the immune system, and for many applications including vaccine\ndevelopment, therapeutics, and diagnostics. Computational epitope prediction is\nchallenging yet rewarding as it significantly reduces the time and cost of\nlaboratory work. Most of the existing tools do not have satisfactory\nperformance and only discriminate epitopes from non-epitopes. 
This paper\npresents a new deep learning-based multi-task framework for linear B-cell\nepitope prediction as well as antibody type-specific epitope classification.\nSpecifically, a sequence-based neural network model using recurrent layers and\nTransformer blocks is developed. We propose an amino acid encoding method based\non eigen decomposition to help the model learn the representations of epitopes.\nWe introduce modifications to standard cross-entropy loss functions by\nextending a logit adjustment technique to cope with class imbalance.\nExperimental results on data curated from the largest public epitope database\ndemonstrate the validity of the proposed methods and the superior performance\ncompared to competing ones.\n","authors":["Xiao Yuan"],"pdf_url":"https://arxiv.org/pdf/2309.02071v1.pdf","comment":"18 pages, 3 figures, accepted at ECML PKDD 2023"},{"id":"http://arxiv.org/abs/2309.02065v1","updated":"2023-09-05T09:07:24Z","published":"2023-09-05T09:07:24Z","title":"Efficiency is Not Enough: A Critical Perspective of Environmentally\n Sustainable AI","summary":" Artificial Intelligence (AI) is currently spearheaded by machine learning\n(ML) methods such as deep learning (DL) which have accelerated progress on many\ntasks thought to be out of reach of AI. These ML methods can often be compute\nhungry, energy intensive, and result in significant carbon emissions, a known\ndriver of anthropogenic climate change. Additionally, the platforms on which ML\nsystems run are associated with environmental impacts including and beyond\ncarbon emissions. The solution lionized by both industry and the ML community\nto improve the environmental sustainability of ML is to increase the efficiency\nwith which ML systems operate in terms of both compute and energy consumption.\nIn this perspective, we argue that efficiency alone is not enough to make ML as\na technology environmentally sustainable. We do so by presenting three high-level\ndiscrepancies between the effect of efficiency on the environmental\nsustainability of ML when considering the many variables which it interacts\nwith. In doing so, we comprehensively demonstrate, at multiple levels of\ngranularity, both technical and non-technical reasons why efficiency is not\nenough to fully remedy the environmental impacts of ML. Based on this, we\npresent and argue for systems thinking as a viable path towards improving the\nenvironmental sustainability of ML holistically.\n","authors":["Dustin Wright","Christian Igel","Gabrielle Samuel","Raghavendra Selvan"],"pdf_url":"https://arxiv.org/pdf/2309.02065v1.pdf","comment":"24 pages; 6 figures"},{"id":"http://arxiv.org/abs/2309.02064v1","updated":"2023-09-05T09:06:34Z","published":"2023-09-05T09:06:34Z","title":"MvFS: Multi-view Feature Selection for Recommender System","summary":" Feature selection, which is a technique to select key features in recommender\nsystems, has received increasing research attention. Recently, Adaptive Feature\nSelection (AdaFS) has shown remarkable performance by adaptively selecting\nfeatures for each data instance, considering that the importance of a given\nfeature field can vary significantly across data. However, this method still\nhas limitations in that its selection process could be easily biased toward major\nfeatures that frequently occur. To address these problems, we propose\nMulti-view Feature Selection (MvFS), which selects informative features for\neach instance more effectively. 
Most importantly, MvFS employs a multi-view\nnetwork consisting of multiple sub-networks, each of which learns to measure\nthe feature importance of a part of data with different feature patterns. By\ndoing so, MvFS promotes a more balanced feature selection process mitigating\nthe bias problem towards dominant patterns. Moreover, MvFS adopts an effective\nimportance score modeling strategy which is applied independently to each field\nwithout incurring dependency among features. Experimental results on real-world\ndatasets demonstrate the effectiveness of MvFS compared to state-of-the-art\nbaselines.\n","authors":["Youngjune Lee","Yeongjong Jeong","Keunchan Park","SeongKu Kang"],"pdf_url":"https://arxiv.org/pdf/2309.02064v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2109.03459v2","updated":"2023-09-05T09:06:10Z","published":"2021-09-08T07:00:45Z","title":"Dual Correction Strategy for Ranking Distillation in Top-N Recommender\n System","summary":" Knowledge Distillation (KD), which transfers the knowledge of a well-trained\nlarge model (teacher) to a small model (student), has become an important area\nof research for practical deployment of recommender systems. Recently, Relaxed\nRanking Distillation (RRD) has shown that distilling the ranking information in\nthe recommendation list significantly improves the performance. However, the\nmethod still has limitations in that 1) it does not fully utilize the\nprediction errors of the student model, which makes the training not fully\nefficient, and 2) it only distills the user-side ranking information, which\nprovides an insufficient view under the sparse implicit feedback. This paper\npresents Dual Correction strategy for Distillation (DCD), which transfers the\nranking information from the teacher model to the student model in a more\nefficient manner. Most importantly, DCD uses the discrepancy between the\nteacher model and the student model predictions to decide which knowledge to be\ndistilled. By doing so, DCD essentially provides the learning guidance tailored\nto \"correcting\" what the student model has failed to accurately predict. This\nprocess is applied for transferring the ranking information from the user-side\nas well as the item-side to address sparse implicit user feedback. Our\nexperiments show that the proposed method outperforms the state-of-the-art\nbaselines, and ablation studies validate the effectiveness of each component.\n","authors":["Youngjune Lee","Kee-Eung Kim"],"pdf_url":"https://arxiv.org/pdf/2109.03459v2.pdf","comment":"CIKM 2021"},{"id":"http://arxiv.org/abs/2309.02055v1","updated":"2023-09-05T08:57:35Z","published":"2023-09-05T08:57:35Z","title":"No-Regret Caching with Noisy Request Estimates","summary":" Online learning algorithms have been successfully used to design caching\npolicies with regret guarantees. Existing algorithms assume that the cache\nknows the exact request sequence, but this may not be feasible in high load\nand/or memory-constrained scenarios, where the cache may have access only to\nsampled requests or to approximate requests' counters. In this paper, we\npropose the Noisy-Follow-the-Perturbed-Leader (NFPL) algorithm, a variant of\nthe classic Follow-the-Perturbed-Leader (FPL) when request estimates are noisy,\nand we show that the proposed solution has sublinear regret under specific\nconditions on the requests estimator. 
The experimental evaluation compares the\nproposed solution against classic caching policies and validates the proposed\napproach under both synthetic and real request traces.\n","authors":["Younes Ben Mazziane","Francescomaria Faticanti","Giovanni Neglia","Sara Alouf"],"pdf_url":"https://arxiv.org/pdf/2309.02055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02050v1","updated":"2023-09-05T08:51:40Z","published":"2023-09-05T08:51:40Z","title":"Model-agnostic network inference enhancement from noisy measurements via\n curriculum learning","summary":" Noise is a pervasive element within real-world measurement data,\nsignificantly undermining the performance of network inference models. However,\nthe quest for a comprehensive enhancement framework capable of bolstering noise\nresistance across a diverse array of network inference models has remained\nelusive. Here, we present an elegant and efficient framework tailored to\namplify the capabilities of network inference models in the presence of noise.\nLeveraging curriculum learning, we mitigate the deleterious impact of noisy\nsamples on network inference models. Our proposed framework is model-agnostic,\nseamlessly integrable into a plethora of model-based and model-free network\ninference methods. Notably, we utilize one model-based and three model-free\nnetwork inference methods as the foundation. Extensive experimentation across\nvarious synthetic and real-world networks, encapsulating diverse nonlinear\ndynamic processes, showcases substantial performance augmentation under varied\nnoise types, particularly thriving in scenarios enriched with clean samples.\nThis framework's adeptness in fortifying both model-free and model-based\nnetwork inference methodologies paves the avenue towards a comprehensive and\nunified enhancement framework, encompassing the entire spectrum of network\ninference models. Available Code: https://github.com/xiaoyuans/MANIE.\n","authors":["Kai Wu","Yuanyuan Li","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02048v1","updated":"2023-09-05T08:48:25Z","published":"2023-09-05T08:48:25Z","title":"Probabilistic Self-supervised Learning via Scoring Rules Minimization","summary":" In this paper, we propose a novel probabilistic self-supervised learning via\nScoring Rule Minimization (ProSMIN), which leverages the power of probabilistic\nmodels to enhance representation quality and mitigate collapsing\nrepresentations. Our proposed approach involves two neural networks; the online\nnetwork and the target network, which collaborate and learn the diverse\ndistribution of representations from each other through knowledge distillation.\nBy presenting the input samples in two augmented formats, the online network is\ntrained to predict the target network representation of the same sample under a\ndifferent augmented view. The two networks are trained via our new loss\nfunction based on proper scoring rules. We provide a theoretical justification\nfor ProSMIN's convergence, demonstrating the strict propriety of its modified\nscoring rule. This insight validates the method's optimization process and\ncontributes to its robustness and effectiveness in improving representation\nquality. We evaluate our probabilistic model on various downstream tasks, such\nas in-distribution generalization, out-of-distribution detection, dataset\ncorruption, low-shot learning, and transfer learning. 
Our method achieves\nsuperior accuracy and calibration, surpassing the self-supervised baseline in a\nwide range of experiments. On large-scale datasets like ImageNet-O and\nImageNet-C, ProSMIN demonstrates its scalability and real-world applicability.\n","authors":["Amirhossein Vahidi","Simon Schoßer","Lisa Wimmer","Yawei Li","Bernd Bischl","Eyke Hüllermeier","Mina Rezaei"],"pdf_url":"https://arxiv.org/pdf/2309.02048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16535v2","updated":"2023-09-05T08:45:28Z","published":"2023-03-29T08:51:28Z","title":"Nonlinear Independent Component Analysis for Principled Disentanglement\n in Unsupervised Deep Learning","summary":" A central problem in unsupervised deep learning is how to find useful\nrepresentations of high-dimensional data, sometimes called \"disentanglement\".\nMost approaches are heuristic and lack a proper theoretical foundation. In\nlinear representation learning, independent component analysis (ICA) has been\nsuccessful in many application areas, and it is principled, i.e., based on a\nwell-defined probabilistic model. However, extension of ICA to the nonlinear\ncase has been problematic due to the lack of identifiability, i.e., uniqueness\nof the representation. Recently, nonlinear extensions that utilize temporal\nstructure or some auxiliary information have been proposed. Such models are in\nfact identifiable, and consequently, an increasing number of algorithms have\nbeen developed. In particular, some self-supervised algorithms can be shown to\nestimate nonlinear ICA, even though they have initially been proposed from\nheuristic perspectives. This paper reviews the state-of-the-art of nonlinear\nICA theory and algorithms.\n","authors":["Aapo Hyvarinen","Ilyes Khemakhem","Hiroshi Morioka"],"pdf_url":"https://arxiv.org/pdf/2303.16535v2.pdf","comment":"Revised version, to appear in Patterns"},{"id":"http://arxiv.org/abs/2309.02045v1","updated":"2023-09-05T08:44:23Z","published":"2023-09-05T08:44:23Z","title":"Enhance Multi-domain Sentiment Analysis of Review Texts through\n Prompting Strategies","summary":" Large Language Models (LLMs) have made significant strides in both scientific\nresearch and practical applications. Existing studies have demonstrated the\nstate-of-the-art (SOTA) performance of LLMs in various natural language\nprocessing tasks. However, the question of how to further enhance LLMs'\nperformance in specific tasks using prompting strategies remains a pivotal\nconcern. This paper explores the enhancement of LLMs' performance in sentiment\nanalysis through the application of prompting strategies. We formulate the\nprocess of prompting for sentiment analysis tasks and introduce two novel\nstrategies tailored for sentiment analysis: RolePlaying (RP) prompting and\nChain-of-thought (CoT) prompting. Specifically, we also propose the RP-CoT\nprompting strategy which is a combination of RP prompting and CoT prompting. We\nconduct comparative experiments on three distinct domain datasets to evaluate\nthe effectiveness of the proposed sentiment analysis strategies. The results\ndemonstrate that the adoption of the proposed prompting strategies leads to an\nincrease in sentiment analysis accuracy. 
Further, the CoT\nprompting strategy exhibits a notable impact on implicit sentiment analysis,\nwith the RP-CoT prompting strategy delivering the best performance\namong all strategies.\n","authors":["Yajing Wang","Zongwei Luo"],"pdf_url":"https://arxiv.org/pdf/2309.02045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16781v2","updated":"2023-09-05T08:37:14Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification for Low-resource Medication\n Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. Medication\nrecommendation, as a sub-domain, aims to amalgamate longitudinal patient\nhistory with medical knowledge, assisting physicians in prescribing safer and\nmore accurate medication combinations. Existing methods overlook the inherent\nlong-tail distribution in medical data, lacking balanced representation between\nhead and tail data, which leads to sub-optimal model performance. To address\nthis challenge, we introduce StratMed, a model that incorporates an innovative\nrelevance stratification mechanism. It harmonizes discrepancies in data\nlong-tail distribution and strikes a balance between the safety and accuracy of\nmedication combinations. Specifically, we first construct a pre-training method\nusing deep learning networks to obtain entity representation. After that, we\ndesign a pyramid-like data stratification method to obtain more generalized\nentity relationships by reinforcing the features of unpopular entities. Based\non this relationship, we designed two graph structures to express medication\nprecision and safety at the same level to obtain visit representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. Experiments on the\nMIMIC-III dataset demonstrate that our method has outperformed current\nstate-of-the-art methods in four evaluation metrics (including safety and\naccuracy).\n","authors":["Xiang Li","Shunpan Liang","Tengfei Ma","Yulei Hou"],"pdf_url":"https://arxiv.org/pdf/2308.16781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02040v1","updated":"2023-09-05T08:32:07Z","published":"2023-09-05T08:32:07Z","title":"Diffusion Generative Inverse Design","summary":" Inverse design refers to the problem of optimizing the input of an objective\nfunction in order to enact a target outcome. For many real-world engineering\nproblems, the objective function takes the form of a simulator that predicts\nhow the system state will evolve over time, and the design challenge is to\noptimize the initial conditions that lead to a target outcome. Recent\ndevelopments in learned simulation have shown that graph neural networks (GNNs)\ncan be used for accurate, efficient, differentiable estimation of simulator\ndynamics, and support high-quality design optimization with gradient- or\nsampling-based optimization procedures. However, optimizing designs from\nscratch requires many expensive model queries, and these procedures exhibit\nbasic failures on either non-convex or high-dimensional problems. In this work,\nwe show how denoising diffusion models (DDMs) can be used to solve inverse\ndesign problems efficiently and propose a particle sampling algorithm for\nfurther improving their efficiency. 
We perform experiments on a number of fluid\ndynamics design challenges, and find that our approach substantially reduces\nthe number of calls to the simulator compared to standard techniques.\n","authors":["Marin Vlastelica","Tatiana López-Guevara","Kelsey Allen","Peter Battaglia","Arnaud Doucet","Kimberley Stachenfeld"],"pdf_url":"https://arxiv.org/pdf/2309.02040v1.pdf","comment":"ICML workshop on Structured Probabilistic Inference & Generative\n Modeling"},{"id":"http://arxiv.org/abs/2309.02033v1","updated":"2023-09-05T08:22:07Z","published":"2023-09-05T08:22:07Z","title":"Data-Juicer: A One-Stop Data Processing System for Large Language Models","summary":" The immense evolution in Large Language Models (LLMs) has underscored the\nimportance of massive, diverse, and high-quality data. Despite this, existing\nopen-source tools for LLM data processing remain limited and mostly tailored to\nspecific datasets, with an emphasis on the reproducibility of released data\nover adaptability and usability, inhibiting potential applications. In\nresponse, we propose a one-stop, powerful yet flexible and user-friendly LLM\ndata processing system named Data-Juicer. Our system offers over 50 built-in\nversatile operators and pluggable tools, which synergize modularity,\ncomposability, and extensibility dedicated to diverse LLM data processing\nneeds. By incorporating visualized and automatic evaluation capabilities,\nData-Juicer enables a timely feedback loop to accelerate data processing and\ngain data insights. To enhance usability, Data-Juicer provides out-of-the-box\ncomponents for users with various backgrounds, and fruitful data recipes for\nLLM pre-training and post-tuning usages. Further, we employ multi-facet system\noptimization and seamlessly integrate Data-Juicer with both LLM and distributed\ncomputing ecosystems, to enable efficient and scalable data processing.\nEmpirical validation of the generated data recipes reveals considerable\nimprovements in LLaMA performance for various pre-training and post-tuning\ncases, demonstrating up to 7.45% relative improvement of averaged score across\n16 LLM benchmarks and 16.25% higher win rate using pair-wise GPT-4 evaluation.\nThe system's efficiency and scalability are also validated, supported by up to\n88.7% reduction in single-machine processing time, 77.1% and 73.1% less memory\nand CPU usage respectively, and 7.91x processing acceleration when utilizing\ndistributed computing ecosystems. Our system, data recipes, and multiple\ntutorial demos are released, calling for broader research centered on LLM data.\n","authors":["Daoyuan Chen","Yilun Huang","Zhijian Ma","Hesen Chen","Xuchen Pan","Ce Ge","Dawei Gao","Yuexiang Xie","Zhaoyang Liu","Jinyang Gao","Yaliang Li","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.02033v1.pdf","comment":"Under continuous maintenance and updating; The system, refined data\n recipes, and demos are at https://github.com/alibaba/data-juicer"},{"id":"http://arxiv.org/abs/2309.02028v1","updated":"2023-09-05T08:14:25Z","published":"2023-09-05T08:14:25Z","title":"Non-Parametric Representation Learning with Kernels","summary":" Unsupervised and self-supervised representation learning has become popular\nin recent years for learning useful features from unlabelled data.\nRepresentation learning has been mostly developed in the neural network\nliterature, and other models for representation learning are surprisingly\nunexplored. 
In this work, we introduce and analyze several kernel-based\nrepresentation learning approaches: Firstly, we define two kernel\nSelf-Supervised Learning (SSL) models using contrastive loss functions and\nsecondly, a Kernel Autoencoder (AE) model based on the idea of embedding and\nreconstructing data. We argue that the classical representer theorems for\nsupervised kernel machines are not always applicable for (self-supervised)\nrepresentation learning, and present new representer theorems, which show that\nthe representations learned by our kernel models can be expressed in terms of\nkernel matrices. We further derive generalisation error bounds for\nrepresentation learning with kernel SSL and AE, and empirically evaluate the\nperformance of these methods in both small data regimes as well as in\ncomparison with neural network based models.\n","authors":["Pascal Esser","Maximilian Fleissner","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2309.02028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02027v1","updated":"2023-09-05T08:13:34Z","published":"2023-09-05T08:13:34Z","title":"Granger Causal Inference in Multivariate Hawkes Processes by Minimum\n Message Length","summary":" Multivariate Hawkes processes (MHPs) are versatile probabilistic tools used\nto model various real-life phenomena: earthquakes, operations on stock markets,\nneuronal activity, virus propagation and many others. In this paper, we focus\non MHPs with exponential decay kernels and estimate connectivity graphs, which\nrepresent the Granger causal relations between their components. We approach\nthis inference problem by proposing an optimization criterion and model\nselection algorithm based on the minimum message length (MML) principle. MML\ncompares Granger causal models using the Occam's razor principle in the\nfollowing way: even when models have a comparable goodness-of-fit to the\nobserved data, the one generating the most concise explanation of the data is\npreferred. While most of the state-of-art methods using lasso-type penalization\ntend to overfitting in scenarios with short time horizons, the proposed\nMML-based method achieves high F1 scores in these settings. We conduct a\nnumerical study comparing the proposed algorithm to other related classical and\nstate-of-art methods, where we achieve the highest F1 scores in specific sparse\ngraph settings. We illustrate the proposed method also on G7 sovereign bond\ndata and obtain causal connections, which are in agreement with the expert\nknowledge available in the literature.\n","authors":["Katerina Hlavackova-Schindler","Anna Melnykova","Irene Tubikanec"],"pdf_url":"https://arxiv.org/pdf/2309.02027v1.pdf","comment":"23 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.02025v1","updated":"2023-09-05T08:03:59Z","published":"2023-09-05T08:03:59Z","title":"RDGSL: Dynamic Graph Representation Learning with Structure Learning","summary":" Temporal Graph Networks (TGNs) have shown remarkable performance in learning\nrepresentation for continuous-time dynamic graphs. However, real-world dynamic\ngraphs typically contain diverse and intricate noise. Noise can significantly\ndegrade the quality of representation generation, impeding the effectiveness of\nTGNs in downstream tasks. Though structure learning is widely applied to\nmitigate noise in static graphs, its adaptation to dynamic graph settings poses\ntwo significant challenges. i) Noise dynamics. 
Existing structure learning\nmethods are ill-equipped to address the temporal aspect of noise, hampering\ntheir effectiveness in such dynamic and ever-changing noise patterns. ii) More\nsevere noise. Noise may be introduced along with multiple interactions between\ntwo nodes, leading to the re-pollution of these nodes and consequently causing\nmore severe noise compared to static graphs. In this paper, we present RDGSL, a\nrepresentation learning method in continuous-time dynamic graphs. Meanwhile, we\npropose dynamic graph structure learning, a novel supervisory signal that\nempowers RDGSL with the ability to effectively combat noise in dynamic graphs.\nTo address the noise dynamics issue, we introduce the Dynamic Graph Filter,\nwhere we innovatively propose a dynamic noise function that dynamically\ncaptures both current and historical noise, enabling us to assess the temporal\naspect of noise and generate a denoised graph. We further propose the Temporal\nEmbedding Learner to tackle the challenge of more severe noise, which utilizes\nan attention mechanism to selectively turn a blind eye to noisy edges and hence\nfocus on normal edges, enhancing the expressiveness for representation\ngeneration that remains resilient to noise. Our method demonstrates robustness\ntowards downstream tasks, resulting in up to 5.1% absolute AUC improvement in\nevolving classification versus the second-best baseline.\n","authors":["Siwei Zhang","Yun Xiong","Yao Zhang","Yiheng Sun","Xi Chen","Yizhu Jiao","Yangyong Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.02025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02022v1","updated":"2023-09-05T08:00:01Z","published":"2023-09-05T08:00:01Z","title":"Dynamic Early Exiting Predictive Coding Neural Networks","summary":" Internet of Things (IoT) sensors are nowadays heavily utilized in various\nreal-world applications ranging from wearables to smart buildings passing by\nagrotechnology and health monitoring. With the huge amounts of data generated\nby these tiny devices, Deep Learning (DL) models have been extensively used to\nenhance them with intelligent processing. However, with the urge for smaller\nand more accurate devices, DL models became too heavy to deploy. It is thus\nnecessary to incorporate the hardware's limited resources in the design\nprocess. Therefore, inspired by the human brain known for its efficiency and\nlow power consumption, we propose a shallow bidirectional network based on\npredictive coding theory and dynamic early exiting for halting further\ncomputations when a performance threshold is surpassed. We achieve comparable\naccuracy to VGG-16 in image classification on CIFAR-10 with fewer parameters\nand less computational complexity.\n","authors":["Alaa Zniber","Ouassim Karrakchou","Mounir Ghogho"],"pdf_url":"https://arxiv.org/pdf/2309.02022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02014v1","updated":"2023-09-05T07:49:10Z","published":"2023-09-05T07:49:10Z","title":"PROMISE: Preconditioned Stochastic Optimization Methods by Incorporating\n Scalable Curvature Estimates","summary":" This paper introduces PROMISE ($\\textbf{Pr}$econditioned Stochastic\n$\\textbf{O}$ptimization $\\textbf{M}$ethods by $\\textbf{I}$ncorporating\n$\\textbf{S}$calable Curvature $\\textbf{E}$stimates), a suite of sketching-based\npreconditioned stochastic gradient algorithms for solving large-scale convex\noptimization problems arising in machine learning. 
PROMISE includes\npreconditioned versions of SVRG, SAGA, and Katyusha; each algorithm comes with\na strong theoretical analysis and effective default hyperparameter values. In\ncontrast, traditional stochastic gradient methods require careful\nhyperparameter tuning to succeed, and degrade in the presence of\nill-conditioning, a ubiquitous phenomenon in machine learning. Empirically, we\nverify the superiority of the proposed algorithms by showing that, using\ndefault hyperparameter values, they outperform or match popular tuned\nstochastic gradient optimizers on a test bed of $51$ ridge and logistic\nregression problems assembled from benchmark machine learning repositories. On\nthe theoretical side, this paper introduces the notion of quadratic regularity\nin order to establish linear convergence of all proposed methods even when the\npreconditioner is updated infrequently. The speed of linear convergence is\ndetermined by the quadratic regularity ratio, which often provides a tighter\nbound on the convergence rate compared to the condition number, both in theory\nand in practice, and explains the fast global linear convergence of the\nproposed methods.\n","authors":["Zachary Frangella","Pratik Rathore","Shipu Zhao","Madeleine Udell"],"pdf_url":"https://arxiv.org/pdf/2309.02014v1.pdf","comment":"127 pages, 31 Figures"},{"id":"http://arxiv.org/abs/2309.02012v1","updated":"2023-09-05T07:48:52Z","published":"2023-09-05T07:48:52Z","title":"iLoRE: Dynamic Graph Representation with Instant Long-term Modeling and\n Re-occurrence Preservation","summary":" Continuous-time dynamic graph modeling is a crucial task for many real-world\napplications, such as financial risk management and fraud detection. Though\nexisting dynamic graph modeling methods have achieved satisfactory results,\nthey still suffer from three key limitations, hindering their scalability and\nfurther applicability. i) Indiscriminate updating. For incoming edges, existing\nmethods would indiscriminately deal with them, which may lead to more time\nconsumption and unexpected noisy information. ii) Ineffective node-wise\nlong-term modeling. They heavily rely on recurrent neural networks (RNNs) as a\nbackbone, which has been demonstrated to be incapable of fully capturing\nnode-wise long-term dependencies in event sequences. iii) Neglect of\nre-occurrence patterns. Dynamic graphs involve the repeated occurrence of\nneighbors that indicates their importance, which is disappointedly neglected by\nexisting methods. In this paper, we present iLoRE, a novel dynamic graph\nmodeling method with instant node-wise Long-term modeling and Re-occurrence\npreservation. To overcome the indiscriminate updating issue, we introduce the\nAdaptive Short-term Updater module that will automatically discard the useless\nor noisy edges, ensuring iLoRE's effectiveness and instant ability. We further\npropose the Long-term Updater to realize more effective node-wise long-term\nmodeling, where we innovatively propose the Identity Attention mechanism to\nempower a Transformer-based updater, bypassing the limited effectiveness of\ntypical RNN-dominated designs. Finally, the crucial re-occurrence patterns are\nalso encoded into a graph module for informative representation learning, which\nwill further improve the expressiveness of our method. 
Our experimental results\non real-world datasets demonstrate the effectiveness of our iLoRE for dynamic\ngraph modeling.\n","authors":["Siwei Zhang","Yun Xiong","Yao Zhang","Xixi Wu","Yiheng Sun","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02011v1","updated":"2023-09-05T07:48:45Z","published":"2023-09-05T07:48:45Z","title":"Representation Learning Dynamics of Self-Supervised Models","summary":" Self-Supervised Learning (SSL) is an important paradigm for learning\nrepresentations from unlabelled data, and SSL with neural networks has been\nhighly successful in practice. However, current theoretical analysis of SSL is\nmostly restricted to generalisation error bounds. In contrast, learning\ndynamics often provide a precise characterisation of the behaviour of neural\nnetwork-based models but, so far, are mainly known in supervised settings. In\nthis paper, we study the learning dynamics of SSL models, specifically\nrepresentations obtained by minimising contrastive and non-contrastive losses.\nWe show that a naive extension of the dynamics of multivariate regression to\nSSL leads to learning trivial scalar representations that demonstrate\ndimension collapse in SSL. Consequently, we formulate SSL objectives with\northogonality constraints on the weights, and derive the exact (network width\nindependent) learning dynamics of the SSL models trained using gradient descent\non the Grassmannian manifold. We also argue that the infinite width\napproximation of SSL models significantly deviates from the neural tangent\nkernel approximations of supervised models. We numerically illustrate the\nvalidity of our theoretical findings, and discuss how the presented results\nprovide a framework for further theoretical analysis of contrastive and\nnon-contrastive SSL.\n","authors":["Pascal Esser","Satyaki Mukherjee","Debarghya Ghoshdastidar"],"pdf_url":"https://arxiv.org/pdf/2309.02011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02010v1","updated":"2023-09-05T07:47:43Z","published":"2023-09-05T07:47:43Z","title":"Establishing a real-time traffic alarm in the city of Valencia with Deep\n Learning","summary":" Urban traffic emissions represent a significant concern due to their\ndetrimental impacts on both public health and the environment. Consequently,\ndecision-makers have flagged their reduction as a crucial goal. In this study,\nwe first analyze the correlation between traffic flux and pollution in the city\nof Valencia, Spain. Our results demonstrate that traffic has a significant\nimpact on the levels of certain pollutants (especially $\\text{NO}_\\text{x}$).\nSecondly, we develop an alarm system to predict if a street is likely to\nexperience unusually high traffic in the next 30 minutes, using an independent\nthree-tier level for each street. To make the predictions, we use traffic data\nupdated every 10 minutes and Long Short-Term Memory (LSTM) neural networks. 
We\ntrained the LSTM using traffic data from 2018, and tested it using traffic data\nfrom 2019.\n","authors":["Miguel Folgado","Veronica Sanz","Johannes Hirn","Edgar Lorenzo-Saez","Javier Urchueguia"],"pdf_url":"https://arxiv.org/pdf/2309.02010v1.pdf","comment":"12 pages, 13 figures"},{"id":"http://arxiv.org/abs/2309.02005v1","updated":"2023-09-05T07:39:19Z","published":"2023-09-05T07:39:19Z","title":"Aggregating Correlated Estimations with (Almost) no Training","summary":" Many decision problems cannot be solved exactly and use several estimation\nalgorithms that assign scores to the different available options. The\nestimation errors can have various correlations, from low (e.g. between two\nvery different approaches) to high (e.g. when using a given algorithm with\ndifferent hyperparameters). Most aggregation rules would suffer from this\ndiversity of correlations. In this article, we propose different aggregation\nrules that take correlations into account, and we compare them to naive rules\nin various experiments based on synthetic data. Our results show that when\nsufficient information is known about the correlations between errors, a\nmaximum likelihood aggregation should be preferred. Otherwise, typically with\nlimited training data, we recommend a method that we call Embedded Voting (EV).\n","authors":["Theo Delemazure","François Durand","Fabien Mathieu"],"pdf_url":"https://arxiv.org/pdf/2309.02005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02001v1","updated":"2023-09-05T07:31:22Z","published":"2023-09-05T07:31:22Z","title":"Analyzing domain shift when using additional data for the MICCAI KiTS23\n Challenge","summary":" Using additional training data is known to improve the results, especially\nfor medical image 3D segmentation where there is a lack of training material\nand the model needs to generalize well from few available data. However, the\nnew data could have been acquired using other instruments and preprocessed such\nthat its distribution is significantly different from the original training data.\nTherefore, we study techniques which ameliorate domain shift during training so\nthat the additional data becomes more usable for preprocessing and training\ntogether with the original data. Our results show that transforming the\nadditional data using histogram matching yields better results than using simple\nnormalization.\n","authors":["George Stoica","Mihaela Breaban","Vlad Barbu"],"pdf_url":"https://arxiv.org/pdf/2309.02001v1.pdf","comment":"This preprint has not undergone peer review or any post-submission\n improvements or corrections. The Version of Record of this contribution is\n published in [TODO], and is available online at https://doi.org/[TODO]"},{"id":"http://arxiv.org/abs/2305.04891v3","updated":"2023-09-05T07:24:00Z","published":"2023-05-03T12:34:45Z","title":"DELTA: Dynamic Embedding Learning with Truncated Conscious Attention for\n CTR Prediction","summary":" Click-Through Rate (CTR) prediction is a pivotal task in product and content\nrecommendation, where learning effective feature embeddings is of great\nsignificance. However, traditional methods typically learn fixed feature\nrepresentations without dynamically refining feature representations according\nto the context information, leading to suboptimal performance. Some recent\napproaches attempt to address this issue by learning bit-wise weights or\naugmented embeddings for feature representations, but suffer from uninformative\nor redundant features in the context. 
To tackle this problem, inspired by the\nGlobal Workspace Theory in conscious processing, which posits that only a\nspecific subset of the product features are pertinent while the rest can be\nnoisy and even detrimental to human-click behaviors, we propose a CTR model\nthat enables Dynamic Embedding Learning with Truncated Conscious Attention for\nCTR prediction, termed DELTA. DELTA contains two key components: (I) conscious\ntruncation module (CTM), which utilizes curriculum learning to apply adaptive\ntruncation on attention weights to select the most critical feature in the\ncontext; (II) explicit embedding optimization (EEO), which applies an auxiliary\ntask during training that directly and independently propagates the gradient\nfrom the loss layer to the embedding layer, thereby optimizing the embedding\nexplicitly via linear feature crossing. Extensive experiments on five\nchallenging CTR datasets demonstrate that DELTA achieves new state-of-art\nperformance among current CTR methods.\n","authors":["Chen Zhu","Liang Du","Hong Chen","Shuang Zhao","Zixun Sun","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.04891v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01988v1","updated":"2023-09-05T06:51:39Z","published":"2023-09-05T06:51:39Z","title":"sasdim: self-adaptive noise scaling diffusion model for spatial time\n series imputation","summary":" Spatial time series imputation is critically important to many real\napplications such as intelligent transportation and air quality monitoring.\nAlthough recent transformer and diffusion model based approaches have achieved\nsignificant performance gains compared with conventional statistic based\nmethods, spatial time series imputation still remains as a challenging issue\ndue to the complex spatio-temporal dependencies and the noise uncertainty of\nthe spatial time series data. Especially, recent diffusion process based models\nmay introduce random noise to the imputations, and thus cause negative impact\non the model performance. To this end, we propose a self-adaptive noise scaling\ndiffusion model named SaSDim to more effectively perform spatial time series\nimputation. Specially, we propose a new loss function that can scale the noise\nto the similar intensity, and propose the across spatial-temporal global\nconvolution module to more effectively capture the dynamic spatial-temporal\ndependencies. Extensive experiments conducted on three real world datasets\nverify the effectiveness of SaSDim by comparison with current state-of-the-art\nbaselines.\n","authors":["Shunyang Zhang","Senzhang Wang","Xianzhen Tan","Ruochen Liu","Jian Zhang","Jianxin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.01988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12714v2","updated":"2023-09-05T06:20:11Z","published":"2023-06-22T07:47:18Z","title":"Toward Leveraging Pre-Trained Self-Supervised Frontends for Automatic\n Singing Voice Understanding Tasks: Three Case Studies","summary":" Automatic singing voice understanding tasks, such as singer identification,\nsinging voice transcription, and singing technique classification, benefit from\ndata-driven approaches that utilize deep learning techniques. These approaches\nwork well even under the rich diversity of vocal and noisy samples owing to\ntheir representation ability. However, the limited availability of labeled data\nremains a significant obstacle to achieving satisfactory performance. 
In recent\nyears, self-supervised learning models (SSL models) have been trained using\nlarge amounts of unlabeled data in the field of speech processing and music\nclassification. By fine-tuning these models for the target tasks, comparable\nperformance to conventional supervised learning can be achieved with limited\ntraining data. Therefore, in this paper, we investigate the effectiveness of\nSSL models for various singing voice recognition tasks. We report the results\nof experiments comparing SSL models for three different tasks (i.e., singer\nidentification, singing voice transcription, and singing technique\nclassification) as an initial exploration and discuss these findings.\nExperimental results show that each SSL model achieves comparable performance\nand sometimes outperforms state-of-the-art methods on each task. We\nalso conducted a layer-wise analysis to further understand the behavior of the\nSSL models.\n","authors":["Yuya Yamamoto"],"pdf_url":"https://arxiv.org/pdf/2306.12714v2.pdf","comment":"Accepted at APSIPA ASC 2023"},{"id":"http://arxiv.org/abs/2309.01978v1","updated":"2023-09-05T06:13:09Z","published":"2023-09-05T06:13:09Z","title":"An LSTM-Based Predictive Monitoring Method for Data with Time-varying\n Variability","summary":" The recurrent neural network and its variants have shown great success in\nprocessing sequences in recent years. However, this deep neural network has not\nattracted much attention in anomaly detection through predictive process\nmonitoring. Furthermore, traditional statistical models rely on assumptions\nand hypothesis tests, while neural network (NN) models do not need that many\nassumptions. This flexibility enables NN models to work efficiently on data\nwith time-varying variability, a common inherent aspect of data in practice.\nThis paper explores the ability of the recurrent neural network structure to\nmonitor processes and proposes a control chart based on long short-term memory\n(LSTM) prediction intervals for data with time-varying variability. The\nsimulation studies provide empirical evidence that the proposed model\noutperforms other NN-based predictive monitoring methods for mean shift\ndetection. The proposed method is also applied to time series sensor data,\nwhich confirms that the proposed method is an effective technique for detecting\nabnormalities.\n","authors":["Jiaqi Qiu","Yu Lin","Inez Zwetsloot"],"pdf_url":"https://arxiv.org/pdf/2309.01978v1.pdf","comment":"19 pages, 9 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.01973v1","updated":"2023-09-05T05:58:23Z","published":"2023-09-05T05:58:23Z","title":"Linear Regression using Heterogeneous Data Batches","summary":" In many learning applications, data are collected from multiple sources, each\nproviding a \\emph{batch} of samples that by itself is insufficient to learn its\ninput-output relationship. A common approach assumes that the sources fall in\none of several unknown subgroups, each with an unknown input distribution and\ninput-output relationship. We consider one of this setup's most fundamental and\nimportant manifestations where the output is a noisy linear combination of the\ninputs, and there are $k$ subgroups, each with its own regression vector. Prior\nwork~\\cite{kong2020meta} showed that with abundant small batches, the\nregression vectors can be learned with only a few, $\\tilde\\Omega( k^{3/2})$,\nmedium-size batches with $\\tilde\\Omega(\\sqrt k)$ samples each. 
However, the\npaper requires that the input distribution for all $k$ subgroups be isotropic\nGaussian, and states that removing this assumption is an ``interesting and\nchallenging problem\". We propose a novel gradient-based algorithm that improves\non the existing results in several ways. It extends the applicability of the\nalgorithm by: (1) allowing the subgroups' underlying input distributions to be\ndifferent, unknown, and heavy-tailed; (2) recovering all subgroups followed by\na significant proportion of batches even for infinite $k$; (3) removing the\nseparation requirement between the regression vectors; (4) reducing the number\nof batches and allowing smaller batch sizes.\n","authors":["Ayush Jain","Rajat Sen","Weihao Kong","Abhimanyu Das","Alon Orlitsky"],"pdf_url":"https://arxiv.org/pdf/2309.01973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01966v1","updated":"2023-09-05T05:39:44Z","published":"2023-09-05T05:39:44Z","title":"AdaPlus: Integrating Nesterov Momentum and Precise Stepsize Adjustment\n on AdamW Basis","summary":" This paper proposes an efficient optimizer called AdaPlus which integrates\nNesterov momentum and precise stepsize adjustment on AdamW basis. AdaPlus\ncombines the advantages of AdamW, Nadam, and AdaBelief and, in particular, does\nnot introduce any extra hyper-parameters. We perform extensive experimental\nevaluations on three machine learning tasks to validate the effectiveness of\nAdaPlus. The experiment results validate that AdaPlus (i) is the best adaptive\nmethod which performs most comparable with (even slightly better than) SGD with\nmomentum on image classification tasks and (ii) outperforms other\nstate-of-the-art optimizers on language modeling tasks and illustrates the\nhighest stability when training GANs. The experiment code of AdaPlus is\navailable at: https://github.com/guanleics/AdaPlus.\n","authors":["Lei Guan"],"pdf_url":"https://arxiv.org/pdf/2309.01966v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08316v2","updated":"2023-09-05T05:33:46Z","published":"2022-06-16T17:22:40Z","title":"Boosting the Adversarial Transferability of Surrogate Models with Dark\n Knowledge","summary":" Deep neural networks (DNNs) are vulnerable to adversarial examples. And, the\nadversarial examples have transferability, which means that an adversarial\nexample for a DNN model can fool another model with a non-trivial probability.\nThis gave birth to the transfer-based attack where the adversarial examples\ngenerated by a surrogate model are used to conduct black-box attacks. There are\nsome work on generating the adversarial examples from a given surrogate model\nwith better transferability. However, training a special surrogate model to\ngenerate adversarial examples with better transferability is relatively\nunder-explored. This paper proposes a method for training a surrogate model\nwith dark knowledge to boost the transferability of the adversarial examples\ngenerated by the surrogate model. This trained surrogate model is named dark\nsurrogate model (DSM). The proposed method for training a DSM consists of two\nkey components: a teacher model extracting dark knowledge, and the mixing\naugmentation skill enhancing dark knowledge of training data. 
We conducted\nextensive experiments to show that the proposed method can substantially\nimprove the adversarial transferability of surrogate models across different\narchitectures of surrogate models and optimizers for generating adversarial\nexamples, and it can be applied to other scenarios of transfer-based attack\nthat contain dark knowledge, like face verification. Our code is publicly\navailable at \\url{https://github.com/ydc123/Dark_Surrogate_Model}.\n","authors":["Dingcheng Yang","Zihao Xiao","Wenjian Yu"],"pdf_url":"https://arxiv.org/pdf/2206.08316v2.pdf","comment":"Accepted at 2023 International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2309.01950v1","updated":"2023-09-05T04:56:18Z","published":"2023-09-05T04:56:18Z","title":"RADIO: Reference-Agnostic Dubbing Video Synthesis","summary":" One of the most challenging problems in audio-driven talking head generation\nis achieving high-fidelity detail while ensuring precise synchronization. Given\nonly a single reference image, extracting meaningful identity attributes\nbecomes even more challenging, often causing the network to mirror the facial\nand lip structures too closely. To address these issues, we introduce RADIO, a\nframework engineered to yield high-quality dubbed videos regardless of the pose\nor expression in reference images. The key is to modulate the decoder layers\nusing latent space composed of audio and reference features. Additionally, we\nincorporate ViT blocks into the decoder to emphasize high-fidelity details,\nespecially in the lip region. Our experimental results demonstrate that RADIO\ndisplays high synchronization without the loss of fidelity. Especially in harsh\nscenarios where the reference frame deviates significantly from the ground\ntruth, our method outperforms state-of-the-art methods, highlighting its\nrobustness. Pre-trained model and codes will be made public after the review.\n","authors":["Dongyeun Lee","Chaewon Kim","Sangjoon Yu","Jaejun Yoo","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2309.01950v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.01947v1","updated":"2023-09-05T04:47:55Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. 
Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v1.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2302.06763v2","updated":"2023-09-05T04:45:48Z","published":"2023-02-14T00:23:42Z","title":"Breaking the Lower Bound with (Little) Structure: Acceleration in\n Non-Convex Stochastic Optimization with Heavy-Tailed Noise","summary":" We consider the stochastic optimization problem with smooth but not\nnecessarily convex objectives in the heavy-tailed noise regime, where the\nstochastic gradient's noise is assumed to have bounded $p$th moment\n($p\\in(1,2]$). Zhang et al. (2020) is the first to prove the\n$\\Omega(T^{\\frac{1-p}{3p-2}})$ lower bound for convergence (in expectation) and\nprovides a simple clipping algorithm that matches this optimal rate. Cutkosky\nand Mehta (2021) proposes another algorithm, which is shown to achieve the\nnearly optimal high-probability convergence guarantee\n$O(\\log(T/\\delta)T^{\\frac{1-p}{3p-2}})$, where $\\delta$ is the probability of\nfailure. However, this desirable guarantee is only established under the\nadditional assumption that the stochastic gradient itself is bounded in $p$th\nmoment, which fails to hold even for quadratic objectives and centered Gaussian\nnoise.\n In this work, we first improve the analysis of the algorithm in Cutkosky and\nMehta (2021) to obtain the same nearly optimal high-probability convergence\nrate $O(\\log(T/\\delta)T^{\\frac{1-p}{3p-2}})$, without the above-mentioned\nrestrictive assumption. Next, and curiously, we show that one can achieve a\nfaster rate than that dictated by the lower bound\n$\\Omega(T^{\\frac{1-p}{3p-2}})$ with only a tiny bit of structure, i.e., when\nthe objective function $F(x)$ is assumed to be in the form of\n$\\mathbb{E}_{\\Xi\\sim\\mathcal{D}}[f(x,\\Xi)]$, arguably the most widely\napplicable class of stochastic optimization problems. For this class of\nproblems, we propose the first variance-reduced accelerated algorithm and\nestablish that it guarantees a high-probability convergence rate of\n$O(\\log(T/\\delta)T^{\\frac{1-p}{2p-1}})$ under a mild condition, which is faster\nthan $\\Omega(T^{\\frac{1-p}{3p-2}})$. Notably, even when specialized to the\nfinite-variance case, our result yields the (near-)optimal high-probability\nrate $O(\\log(T/\\delta)T^{-1/3})$.\n","authors":["Zijian Liu","Jiawei Zhang","Zhengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2302.06763v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01945v1","updated":"2023-09-05T04:39:34Z","published":"2023-09-05T04:39:34Z","title":"OHQ: On-chip Hardware-aware Quantization","summary":" Quantization emerges as one of the most promising approaches for deploying\nadvanced deep models on resource-constrained hardware. Mixed-precision\nquantization leverages multiple bit-width architectures to unleash the accuracy\nand efficiency potential of quantized models. However, existing mixed-precision\nquantization suffers exhaustive search space that causes immense computational\noverhead. 
The quantization process thus relies on separate high-performance\ndevices rather than running locally, which also leads to a significant gap between the\nconsidered hardware metrics and the real deployment. In this paper, we propose\nan On-chip Hardware-aware Quantization (OHQ) framework that performs\nhardware-aware mixed-precision quantization without accessing online devices.\nFirst, we construct the On-chip Quantization Awareness (OQA) pipeline, enabling\nperception of the actual efficiency metrics of the quantization operator on the\nhardware. Second, we propose a Mask-guided Quantization Estimation (MQE) technique\nto efficiently estimate the accuracy metrics of operators under the constraints\nof on-chip-level computing power. By synthesizing network and hardware insights\nthrough linear programming, we obtain optimized bit-width configurations.\nNotably, the quantization process occurs entirely on-chip without any\nadditional computing devices or data access. We demonstrate accelerated\ninference after quantization for various architectures and compression ratios,\nachieving 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively. OHQ\nimproves latency by 15~30% compared to INT8 on deployment.\n","authors":["Wei Huang","Haotong Qin","Yangdong Liu","Jingzhuo Liang","Yifu Ding","Ying Li","Xianglong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.01945v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.01941v1","updated":"2023-09-05T04:17:37Z","published":"2023-09-05T04:17:37Z","title":"Dynamic Brain Transformer with Multi-level Attention for Functional\n Brain Network Analysis","summary":" Recent neuroimaging studies have highlighted the importance of\nnetwork-centric brain analysis, particularly with functional magnetic resonance\nimaging. The emergence of Deep Neural Networks has fostered a substantial\ninterest in predicting clinical outcomes and categorizing individuals based on\nbrain networks. However, the conventional approach involving static brain\nnetwork analysis offers limited potential in capturing the dynamism of brain\nfunction. Although recent studies have attempted to harness dynamic brain\nnetworks, their high dimensionality and complexity present substantial\nchallenges. This paper proposes a novel methodology, Dynamic bRAin Transformer\n(DART), which combines static and dynamic brain networks for more effective and\nnuanced brain function analysis. Our model uses the static brain network as a\nbaseline, integrating dynamic brain networks to enhance performance against\ntraditional methods. We innovatively employ attention mechanisms, enhancing\nmodel explainability and exploiting the dynamic brain network's temporal\nvariations. The proposed approach offers a robust solution to the low\nsignal-to-noise ratio of blood-oxygen-level-dependent signals, a recurring\nissue in direct DNN modeling. It also provides valuable insights into which\nbrain circuits or dynamic networks contribute more to final predictions. 
As\nsuch, DART shows a promising direction in neuroimaging studies, contributing to\nthe comprehensive understanding of brain organization and the role of neural\ncircuits.\n","authors":["Xuan Kan","Antonio Aodong Chen Gu","Hejie Cui","Ying Guo","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2309.01941v1.pdf","comment":"Accepted to IEEE BHI 2023"},{"id":"http://arxiv.org/abs/2308.11676v2","updated":"2023-09-05T03:57:57Z","published":"2023-08-22T13:18:13Z","title":"Does Misclassifying Non-confounding Covariates as Confounders Affect the\n Causal Inference within the Potential Outcomes Framework?","summary":" The Potential Outcome Framework (POF) plays a prominent role in the field of\ncausal inference. Most causal inference models based on the POF (CIMs-POF) are\ndesigned for eliminating confounding bias and default to an underlying\nassumption of Confounding Covariates. This assumption posits that the\ncovariates consist solely of confounders. However, the assumption of\nConfounding Covariates is challenging to maintain in practice, particularly\nwhen dealing with high-dimensional covariates. While certain methods have been\nproposed to differentiate the distinct components of covariates prior to\nconducting causal inference, the consequences of treating non-confounding\ncovariates as confounders remain unclear. This ambiguity poses a potential risk\nwhen conducting causal inference in practical scenarios. In this paper, we\npresent a unified graphical framework for the CIMs-POF, which greatly enhances\nthe comprehension of these models' underlying principles. Using this graphical\nframework, we quantitatively analyze the extent to which the inference\nperformance of CIMs-POF is influenced when incorporating various types of\nnon-confounding covariates, such as instrumental variables, mediators,\ncolliders, and adjustment variables. The key findings are: in the task of\neliminating confounding bias, the optimal scenario is for the covariates to\nexclusively encompass confounders; in the subsequent task of inferring\ncounterfactual outcomes, the adjustment variables contribute to more accurate\ninferences. Furthermore, extensive experiments conducted on synthetic datasets\nconsistently validate these theoretical conclusions.\n","authors":["Yonghe Zhao","Qiang Huang","Shuai Fu","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.11676v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.16834v2","updated":"2023-09-05T03:51:50Z","published":"2023-07-28T17:16:57Z","title":"Benchmarking Jetson Edge Devices with an End-to-end Video-based Anomaly\n Detection System","summary":" Innovative enhancements in embedded system platforms, specifically hardware\naccelerations, significantly influence the application of deep learning in\nreal-world scenarios. These innovations translate human labor efforts into\nautomated intelligent systems employed in various areas such as autonomous\ndriving, robotics, Internet-of-Things (IoT), and numerous other impactful\napplications. NVIDIA's Jetson platform is one of the pioneers in offering\noptimal performance regarding energy efficiency and throughput in the execution\nof deep learning algorithms. Previously, most benchmarking analysis was based\non 2D images with a single deep learning model for each comparison result. 
In\nthis paper, we implement an end-to-end video-based crime-scene anomaly\ndetection system that takes surveillance videos as input; the system is deployed\nand operates entirely on multiple Jetson edge devices (Nano, AGX Xavier, Orin\nNano). The comparison analysis includes the integration of Torch-TensorRT, a\nsoftware development kit from NVIDIA, for model performance optimisation. The\nsystem is built on the PySlowfast open-source project from Facebook as\nthe coding template. The end-to-end pipeline comprises video capture from the\ncamera, data preprocessing, feature extraction and anomaly\ndetection. We share the experience of deploying an AI-based system on\nvarious Jetson Edge devices with Docker technology. Regarding anomaly\ndetectors, a weakly supervised video-based deep learning model called Robust\nTemporal Feature Magnitude Learning (RTFM) is applied in the system. The\nsystem reaches an inference speed of 47.56 frames per second (FPS) on a\nJetson edge device with a total RAM usage of only 3.11 GB. We also identify a\npromising Jetson device on which the AI system achieves 15% better performance than\nthe previous generation of Jetson devices while consuming 50% less energy.\n","authors":["Hoang Viet Pham","Thinh Gia Tran","Chuong Dinh Le","An Dinh Le","Hien Bich Vo"],"pdf_url":"https://arxiv.org/pdf/2307.16834v2.pdf","comment":"18 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2309.01933v1","updated":"2023-09-05T03:42:46Z","published":"2023-09-05T03:42:46Z","title":"Provably safe systems: the only path to controllable AGI","summary":" We describe a path to humanity safely thriving with powerful Artificial\nGeneral Intelligences (AGIs) by building them to provably satisfy\nhuman-specified requirements. We argue that this will soon be technically\nfeasible using advanced AI for formal verification and mechanistic\ninterpretability. We further argue that it is the only path which guarantees\nsafe controlled AGI. We end with a list of challenge problems whose solution\nwould contribute to this positive outcome and invite readers to join in this\nwork.\n","authors":["Max Tegmark","Steve Omohundro"],"pdf_url":"https://arxiv.org/pdf/2309.01933v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.09780v2","updated":"2023-09-05T03:32:19Z","published":"2023-08-04T05:43:32Z","title":"Event-based Dynamic Graph Representation Learning for Patent Application\n Trend Prediction","summary":" Accurate prediction of the types of patents that companies will apply for in\nthe next period of time can reveal their development strategies and help\nthem discover potential partners or competitors in advance. Although important,\nthis problem has been rarely studied in previous research due to the challenges\nin modelling companies' continuously evolving preferences and capturing the\nsemantic correlations of classification codes. To fill in this gap, we propose\nan event-based dynamic graph learning framework for patent application trend\nprediction. In particular, our method is founded on the memorable\nrepresentations of both companies and patent classification codes. When a new\npatent is observed, the representations of the related companies and\nclassification codes are updated according to the historical memories and the\ncurrently encoded messages. Moreover, a hierarchical message passing mechanism\nis provided to capture the semantic proximities of patent classification codes\nby updating their representations along the hierarchical taxonomy. 
Finally, the\npatent application trend is predicted by aggregating the representations of the\ntarget company and classification codes from static, dynamic, and hierarchical\nperspectives. Experiments on real-world data demonstrate the effectiveness of\nour approach under various experimental conditions, and also reveal the\nabilities of our method in learning semantics of classification codes and\ntracking technology developing trajectories of companies.\n","authors":["Tao Zou","Le Yu","Leilei Sun","Bowen Du","Deqing Wang","Fuzhen Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.09780v2.pdf","comment":"Accepted by the TKDE journal"},{"id":"http://arxiv.org/abs/2309.01922v1","updated":"2023-09-05T03:22:46Z","published":"2023-09-05T03:22:46Z","title":"Regret Analysis of Policy Gradient Algorithm for Infinite Horizon\n Average Reward Markov Decision Processes","summary":" In this paper, we consider an infinite horizon average reward Markov Decision\nProcess (MDP). Distinguishing itself from existing works within this context,\nour approach harnesses the power of the general policy gradient-based\nalgorithm, liberating it from the constraints of assuming a linear MDP\nstructure. We propose a policy gradient-based algorithm and show its global\nconvergence property. We then prove that the proposed algorithm has\n$\\tilde{\\mathcal{O}}({T}^{3/4})$ regret. Remarkably, this paper marks a\npioneering effort by presenting the first exploration into regret-bound\ncomputation for the general parameterized policy gradient algorithm in the\ncontext of average reward scenarios.\n","authors":["Qinbo Bai","Washim Uddin Mondal","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2309.01922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01918v1","updated":"2023-09-05T03:14:39Z","published":"2023-09-05T03:14:39Z","title":"RoboAgent: Generalization and Efficiency in Robot Manipulation via\n Semantic Augmentations and Action Chunking","summary":" The grand aim of having a single robot that can manipulate arbitrary objects\nin diverse settings is at odds with the paucity of robotics datasets. Acquiring\nand growing such datasets is strenuous due to manual efforts, operational\ncosts, and safety challenges. A path toward such an universal agent would\nrequire a structured framework capable of wide generalization but trained\nwithin a reasonable data budget. In this paper, we develop an efficient system\n(RoboAgent) for training universal agents capable of multi-task manipulation\nskills using (a) semantic augmentations that can rapidly multiply existing\ndatasets and (b) action representations that can extract performant policies\nwith small yet diverse multi-modal datasets without overfitting. In addition,\nreliable task conditioning and an expressive policy architecture enable our\nagent to exhibit a diverse repertoire of skills in novel situations specified\nusing language commands. Using merely 7500 demonstrations, we are able to train\na single agent capable of 12 unique skills, and demonstrate its generalization\nover 38 tasks spread across common daily activities in diverse kitchen scenes.\nOn average, RoboAgent outperforms prior methods by over 40% in unseen\nsituations while being more sample efficient and being amenable to capability\nimprovements and extensions through fine-tuning. 
Videos at\nhttps://robopen.github.io/\n","authors":["Homanga Bharadhwaj","Jay Vakil","Mohit Sharma","Abhinav Gupta","Shubham Tulsiani","Vikash Kumar"],"pdf_url":"https://arxiv.org/pdf/2309.01918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01909v1","updated":"2023-09-05T02:45:18Z","published":"2023-09-05T02:45:18Z","title":"A Survey on Physics Informed Reinforcement Learning: Review and Open\n Problems","summary":" The inclusion of physical information in machine learning frameworks has\nrevolutionized many application areas. This involves enhancing the learning\nprocess by incorporating physical constraints and adhering to physical laws. In\nthis work, we explore their utility for reinforcement learning applications. We\npresent a thorough review of the literature on incorporating physics\ninformation, also known as physics priors, in reinforcement learning approaches,\ncommonly referred to as physics-informed reinforcement learning (PIRL). We\nintroduce a novel taxonomy with the reinforcement learning pipeline as the\nbackbone to classify existing works, compare and contrast them, and derive\ncrucial insights. Existing works are analyzed with regard to the\nrepresentation/form of the governing physics modeled for integration, their\nspecific contribution to the typical reinforcement learning architecture, and\ntheir connection to the underlying reinforcement learning pipeline stages. We\nalso identify core learning architectures and physics incorporation biases\n(i.e., observational, inductive and learning) of existing PIRL approaches and\nuse them to further categorize the works for better understanding and\nadaptation. By providing a comprehensive perspective on the implementation of\nthe physics-informed capability, the taxonomy presents a cohesive approach to\nPIRL. It identifies the areas where this approach has been applied, as well as\nthe gaps and opportunities that exist. Additionally, the taxonomy sheds light\non unresolved issues and challenges, which can guide future research. This\nnascent field holds great potential for enhancing reinforcement learning\nalgorithms by increasing their physical plausibility, precision, data\nefficiency, and applicability in real-world scenarios.\n","authors":["Chayan Banerjee","Kien Nguyen","Clinton Fookes","Maziar Raissi"],"pdf_url":"https://arxiv.org/pdf/2309.01909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11148v2","updated":"2023-09-05T02:28:49Z","published":"2023-08-22T03:10:40Z","title":"LLaMA-Reviewer: Advancing Code Review Automation with Large Language\n Models through Parameter-Efficient Fine-Tuning","summary":" The automation of code review activities, a long-standing pursuit in software\nengineering, has been primarily addressed by numerous domain-specific\npre-trained models. Despite their success, these models frequently demand\nextensive resources for pre-training from scratch. In contrast, Large Language\nModels (LLMs) provide an intriguing alternative, given their remarkable\ncapabilities when supplemented with domain-specific knowledge. However, their\npotential for automating code review tasks remains largely unexplored.\n In response to this research gap, we present LLaMA-Reviewer, an innovative\nframework that leverages the capabilities of LLaMA, a popular LLM, in the realm\nof code review. 
Mindful of resource constraints, this framework employs\nparameter-efficient fine-tuning (PEFT) methods, delivering high performance\nwhile using less than 1% of trainable parameters.\n An extensive evaluation of LLaMA-Reviewer is conducted on two diverse,\npublicly available datasets. Notably, even with the smallest LLaMA base model\nconsisting of 6.7B parameters and a limited number of tuning epochs,\nLLaMA-Reviewer equals the performance of existing code-review-focused models.\n The ablation experiments provide insights into the influence of various\nfine-tuning process components, including input representation, instruction\ntuning, and different PEFT methods. To foster continuous progress in this\nfield, the code and all PEFT-weight plugins have been made open-source.\n","authors":["Junyi Lu","Lei Yu","Xiaojia Li","Li Yang","Chun Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.11148v2.pdf","comment":"Accepted to the 34th IEEE International Symposium on Software\n Reliability Engineering (ISSRE 2023)"},{"id":"http://arxiv.org/abs/2309.01897v1","updated":"2023-09-05T02:15:08Z","published":"2023-09-05T02:15:08Z","title":"Inferring Actual Treatment Pathways from Patient Records","summary":" Treatment pathways are step-by-step plans outlining the recommended medical\ncare for specific diseases; they get revised when different treatments are\nfound to improve patient outcomes. Examining health records is an important\npart of this revision process, but inferring patients' actual treatments from\nhealth data is challenging due to complex event-coding schemes and the absence\nof pathway-related annotations. This study aims to infer the actual treatment\nsteps for a particular patient group from administrative health records (AHR) -\na common form of tabular healthcare data - and address several technique- and\nmethodology-based gaps in treatment pathway-inference research. We introduce\nDefrag, a method for examining AHRs to infer the real-world treatment steps for\na particular patient group. Defrag learns the semantic and temporal meaning of\nhealthcare event sequences, allowing it to reliably infer treatment steps from\ncomplex healthcare data. To our knowledge, Defrag is the first\npathway-inference method to utilise a neural network (NN), an approach made\npossible by a novel, self-supervised learning objective. We also developed a\ntesting and validation framework for pathway inference, which we use to\ncharacterise and evaluate Defrag's pathway inference ability and compare\nagainst baselines. We demonstrate Defrag's effectiveness by identifying\nbest-practice pathway fragments for breast cancer, lung cancer, and melanoma in\npublic healthcare records. Additionally, we use synthetic data experiments to\ndemonstrate the characteristics of the Defrag method, and to compare Defrag to\nseveral baselines where it significantly outperforms non-NN-based methods.\nDefrag significantly outperforms several existing pathway-inference methods and\noffers an innovative and effective approach for inferring treatment pathways\nfrom AHRs. Open-source code is provided to encourage further research in this\narea.\n","authors":["Adrian Wilkins-Caruana","Madhushi Bandara","Katarzyna Musial","Daniel Catchpoole","Paul J. 
Kennedy"],"pdf_url":"https://arxiv.org/pdf/2309.01897v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01886v1","updated":"2023-09-05T01:40:01Z","published":"2023-09-05T01:40:01Z","title":"Extended Symmetry Preserving Attention Networks for LHC Analysis","summary":" Reconstructing unstable heavy particles requires sophisticated techniques to\nsift through the large number of possible permutations for assignment of\ndetector objects to partons. An approach based on a generalized attention\nmechanism, symmetry preserving attention networks (SPANet), has been previously\napplied to top quark pair decays at the Large Hadron Collider, which produce\nsix hadronic jets. Here we extend the SPANet architecture to consider multiple\ninput streams, such as leptons, as well as global event features, such as the\nmissing transverse momentum. In addition, we provide regression and\nclassification outputs to supplement the parton assignment. We explore the\nperformance of the extended capability of SPANet in the context of\nsemi-leptonic decays of top quark pairs as well as top quark pairs produced in\nassociation with a Higgs boson. We find significant improvements in the power\nof three representative studies: search for ttH, measurement of the top quark\nmass and a search for a heavy Z' decaying to top quark pairs. We present\nablation studies to provide insight on what the network has learned in each\ncase.\n","authors":["Michael James Fenton","Alexander Shmakov","Hideki Okawa","Yuji Li","Ko-Yang Hsiao","Shih-Chieh Hsu","Daniel Whiteson","Pierre Baldi"],"pdf_url":"https://arxiv.org/pdf/2309.01886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01885v1","updated":"2023-09-05T01:39:09Z","published":"2023-09-05T01:39:09Z","title":"QuantEase: Optimization-based Quantization for Language Models -- An\n Efficient and Intuitive Algorithm","summary":" With the rising popularity of Large Language Models (LLMs), there has been an\nincreasing interest in compression techniques that enable their efficient\ndeployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs.\nDrawing from recent advances, our work introduces QuantEase, a layer-wise\nquantization framework where individual layers undergo separate quantization.\nThe problem is framed as a discrete-structured non-convex optimization,\nprompting the development of algorithms rooted in Coordinate Descent (CD)\ntechniques. These CD-based methods provide high-quality solutions to the\ncomplex non-convex layer-wise quantization problems. Notably, our CD-based\napproach features straightforward updates, relying solely on matrix and vector\noperations, circumventing the need for matrix inversion or decomposition. We\nalso explore an outlier-aware variant of our approach, allowing for retaining\nsignificant weights (outliers) with complete precision. Our proposal attains\nstate-of-the-art performance in terms of perplexity and zero-shot accuracy in\nempirical evaluations across various LLMs and datasets, with relative\nimprovements up to 15% over methods such as GPTQ. 
Particularly noteworthy is\nour outlier-aware algorithm's capability to achieve near or sub-3-bit\nquantization of LLMs with an acceptable drop in accuracy, obviating the need\nfor non-uniform quantization or grouping techniques, improving upon methods\nsuch as SpQR by up to two times in terms of perplexity.\n","authors":["Kayhan Behdin","Ayan Acharya","Aman Gupta","Sathiya Keerthi","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2309.01885v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01884v1","updated":"2023-09-05T01:22:19Z","published":"2023-09-05T01:22:19Z","title":"Task Generalization with Stability Guarantees via Elastic Dynamical\n System Motion Policies","summary":" Dynamical System (DS) based Learning from Demonstration (LfD) allows learning\nof reactive motion policies with stability and convergence guarantees from a\nfew trajectories. Yet, current DS learning techniques lack the flexibility to\ngeneralize to new task instances as they ignore explicit task parameters that\ninherently change the underlying trajectories. In this work, we propose\nElastic-DS, a novel DS learning, and generalization approach that embeds task\nparameters into the Gaussian Mixture Model (GMM) based Linear Parameter Varying\n(LPV) DS formulation. Central to our approach is the Elastic-GMM, a GMM\nconstrained to SE(3) task-relevant frames. Given a new task instance/context,\nthe Elastic-GMM is transformed with Laplacian Editing and used to re-estimate\nthe LPV-DS policy. Elastic-DS is compositional in nature and can be used to\nconstruct flexible multi-step tasks. We showcase its strength on a myriad of\nsimulated and real-robot experiments while preserving desirable\ncontrol-theoretic guarantees. Supplementary videos can be found at\nhttps://sites.google.com/view/elastic-ds\n","authors":["Tianyu Li","Nadia Figueroa"],"pdf_url":"https://arxiv.org/pdf/2309.01884v1.pdf","comment":"Accepted to CoRL 2023"},{"id":"http://arxiv.org/abs/2307.08924v2","updated":"2023-09-05T01:15:42Z","published":"2023-07-18T01:53:18Z","title":"Learning to Sample Tasks for Meta Learning","summary":" Through experiments on various meta-learning methods, task samplers, and\nfew-shot learning tasks, this paper arrives at three conclusions. Firstly,\nthere are no universal task sampling strategies to guarantee the performance of\nmeta-learning models. Secondly, task diversity can cause the models to either\nunderfit or overfit during training. Lastly, the generalization performance of\nthe models are influenced by task divergence, task entropy, and task\ndifficulty. In response to these findings, we propose a novel task sampler\ncalled Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes\ntask divergence, task entropy, and task difficulty to sample tasks. To optimize\nASr, we rethink and propose a simple and general meta-learning algorithm.\nFinally, a large number of empirical experiments demonstrate the effectiveness\nof the proposed ASr.\n","authors":["Jingyao Wang","Zeen Song","Xingzhe Su","Lingyu Si","Hongwei Dong","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08924v2.pdf","comment":"10 pages, 7 tables, 3 figures"},{"id":"http://arxiv.org/abs/2309.01875v1","updated":"2023-09-05T00:58:17Z","published":"2023-09-05T00:58:17Z","title":"Gradient Domain Diffusion Models for Image Synthesis","summary":" Diffusion models are getting popular in generative image and video synthesis.\nHowever, due to the diffusion process, they require a large number of steps to\nconverge. 
To tackle this issue, in this paper, we propose to perform the\ndiffusion process in the gradient domain, where the convergence becomes faster.\nThere are two reasons. First, thanks to the Poisson equation, the gradient\ndomain is mathematically equivalent to the original image domain. Therefore,\neach diffusion step in the image domain has a unique corresponding gradient\ndomain representation. Second, the gradient domain is much sparser than the\nimage domain. As a result, gradient domain diffusion models converge faster.\nSeveral numerical experiments confirm that the gradient domain diffusion models\nare more efficient than the original diffusion models. The proposed method can\nbe applied in a wide range of applications such as image processing, computer\nvision and machine learning tasks.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2309.01875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08600v2","updated":"2023-09-05T00:44:33Z","published":"2023-04-17T20:38:07Z","title":"RS2G: Data-Driven Scene-Graph Extraction and Embedding for Robust\n Autonomous Perception and Scenario Understanding","summary":" Effectively capturing intricate interactions among road users is of critical\nimportance to achieving safe navigation for autonomous vehicles. While graph\nlearning (GL) has emerged as a promising approach to tackle this challenge,\nexisting GL models rely on predefined domain-specific graph extraction rules\nthat often fail in real-world drastically changing scenarios. Additionally,\nthese graph extraction rules severely impede the capability of existing GL\nmethods to generalize knowledge across domains. To address this issue, we\npropose RoadScene2Graph (RS2G), an innovative autonomous scenario understanding\nframework with a novel data-driven graph extraction and modeling approach that\ndynamically captures the diverse relations among road users. Our evaluations\ndemonstrate that on average RS2G outperforms the state-of-the-art (SOTA)\nrule-based graph extraction method by 4.47% and the SOTA deep learning model by\n22.19% in subjective risk assessment. More importantly, RS2G delivers notably\nbetter performance in transferring knowledge gained from simulation\nenvironments to unseen real-world scenarios.\n","authors":["Junyao Wang","Arnav Vaibhav Malawade","Junhong Zhou","Shih-Yuan Yu","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2304.08600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01866v1","updated":"2023-09-05T00:14:12Z","published":"2023-09-05T00:14:12Z","title":"Efficient Query-Based Attack against ML-Based Android Malware Detection\n under Zero Knowledge Setting","summary":" The widespread adoption of the Android operating system has made malicious\nAndroid applications an appealing target for attackers. Machine learning-based\n(ML-based) Android malware detection (AMD) methods are crucial in addressing\nthis problem; however, their vulnerability to adversarial examples raises\nconcerns. Current attacks against ML-based AMD methods demonstrate remarkable\nperformance but rely on strong assumptions that may not be realistic in\nreal-world scenarios, e.g., the knowledge requirements about feature space,\nmodel parameters, and training dataset. To address this limitation, we\nintroduce AdvDroidZero, an efficient query-based attack framework against\nML-based AMD methods that operates under the zero knowledge setting. 
Our\nextensive evaluation shows that AdvDroidZero is effective against various\nmainstream ML-based AMD methods, in particular state-of-the-art methods\nand real-world antivirus solutions.\n","authors":["Ping He","Yifan Xia","Xuhong Zhang","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2309.01866v1.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November, 2023"},{"id":"http://arxiv.org/abs/2309.02623v1","updated":"2023-09-05T23:49:46Z","published":"2023-09-05T23:49:46Z","title":"Superclustering by finding statistically significant separable groups of\n optimal gaussian clusters","summary":" The paper presents an algorithm for clustering a dataset by grouping the\noptimal, from the point of view of the BIC criterion, number of Gaussian\nclusters into the optimal, from the point of view of their statistical\nseparability, superclusters.\n The algorithm consists of three stages: representation of the dataset as a\nmixture of Gaussian distributions - clusters, whose number is determined based\non the minimum of the BIC criterion; using the Mahalanobis distance to\nestimate the distances between the clusters and cluster sizes; combining the\nresulting clusters into superclusters using the DBSCAN method by finding its\nhyperparameter (maximum distance) that provides the maximum value of the introduced matrix\nquality criterion at the maximum number of superclusters. The matrix quality\ncriterion corresponds to the proportion of statistically significantly separated\nsuperclusters among all found superclusters.\n The algorithm has only one hyperparameter - the statistical significance level -\nand automatically detects the optimal number and shape of superclusters based on a\nstatistical hypothesis testing approach. The algorithm demonstrates good\nresults on test datasets in both noisy and noiseless situations. An essential\nadvantage of the algorithm is its ability to predict the correct supercluster for\nnew data based on an already trained clusterer and to perform soft (fuzzy)\nclustering. The disadvantages of the algorithm are its low speed and the\nstochastic nature of the final clustering. It requires a sufficiently large\ndataset for clustering, which is typical for many statistical methods.\n","authors":["Oleg I. Berngardt"],"pdf_url":"https://arxiv.org/pdf/2309.02623v1.pdf","comment":"32 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.02617v1","updated":"2023-09-05T23:33:39Z","published":"2023-09-05T23:33:39Z","title":"Compressing Vision Transformers for Low-Resource Visual Learning","summary":" Vision transformer (ViT) and its variants have swept through visual learning\nleaderboards and offer state-of-the-art accuracy in tasks such as image\nclassification, object detection, and semantic segmentation by attending to\ndifferent parts of the visual input and capturing long-range spatial\ndependencies. However, these models are large and computation-heavy. For\ninstance, the recently proposed ViT-B model has 86M parameters, making it\nimpractical for deployment on resource-constrained devices. As a result, their\ndeployment in mobile and edge scenarios is limited. In our work, we aim to take\na step toward bringing vision transformers to the edge by utilizing popular\nmodel compression techniques such as distillation, pruning, and quantization.\n Our chosen application environment is an unmanned aerial vehicle (UAV) that\nis battery-powered and memory-constrained, carrying a single-board computer on\nthe scale of an NVIDIA Jetson Nano with 4GB of RAM. 
On the other hand, the UAV\nrequires high accuracy close to that of state-of-the-art ViTs to ensure safe\nobject avoidance in autonomous navigation, or correct localization of humans in\nsearch-and-rescue. Inference latency should also be minimized given the\napplication requirements. Hence, our target is to enable rapid inference of a\nvision transformer on an NVIDIA Jetson Nano (4GB) with minimal accuracy loss.\nThis allows us to deploy ViTs on resource-constrained devices, opening up new\npossibilities in surveillance, environmental monitoring, etc. Our\nimplementation is made available at https://github.com/chensy7/efficient-vit.\n","authors":["Eric Youn","Sai Mitheran J","Sanjana Prabhu","Siyuan Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19347v2","updated":"2023-09-05T23:26:02Z","published":"2023-05-30T18:15:05Z","title":"Machine Learning Based IoT Adaptive Architecture for Epilepsy Seizure\n Detection: Anatomy and Analysis","summary":" A seizure tracking system is crucial for monitoring and evaluating epilepsy\ntreatments. Caretaker seizure diaries are used in epilepsy care today, but\nclinical seizure monitoring may miss seizures. Monitoring devices that can be\nworn may be better tolerated and more suitable for long-term ambulatory use.\nMany techniques and methods have been proposed for seizure detection; however,\nsimplicity and affordability are key concepts for daily use while preserving\nthe accuracy of the detection. In this study, we propose a versatile, affordable,\nnoninvasive system based on simple real-time k-Nearest-Neighbors (kNN) machine\nlearning that can be customized and adapted to individual users in less than\nfour seconds of training time; the system was verified and validated using 500\nsubjects, with seizure detection data sampled at 178 Hz, and operated with a\nmean accuracy of 94.5%.\n","authors":["Zag ElSayed","Murat Ozer","Nelly Elsayed","Ahmed Abdelgawad"],"pdf_url":"https://arxiv.org/pdf/2305.19347v2.pdf","comment":"Under review, 5 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.02616v1","updated":"2023-09-05T23:24:56Z","published":"2023-09-05T23:24:56Z","title":"Generative AI-aided Joint Training-free Secure Semantic Communications\n via Multi-modal Prompts","summary":" Semantic communication (SemCom) holds promise for reducing network resource\nconsumption while achieving the communications goal. However, the computational\noverheads in jointly training semantic encoders and decoders, and in their subsequent\ndeployment in network devices, are overlooked. Recent advances in generative\nartificial intelligence (GAI) offer a potential solution. The robust learning\nabilities of GAI models indicate that semantic decoders can reconstruct source\nmessages using a limited amount of semantic information, e.g., prompts, without\njoint training with the semantic encoder. A notable challenge, however, is the\ninstability introduced by GAI's diverse generation ability. This instability,\nevident in outputs like text-generated images, limits the direct application of\nGAI in scenarios demanding accurate message recovery, such as face image\ntransmission. To solve the above problems, this paper proposes a GAI-aided\nSemCom system with multi-modal prompts for accurate content decoding. Moreover,\nin response to security concerns, we introduce the application of covert\ncommunications aided by a friendly jammer. 
The system jointly optimizes the\ndiffusion step, jamming, and transmitting power with the aid of the generative\ndiffusion models, enabling successful and secure transmission of the source\nmessages.\n","authors":["Hongyang Du","Guangyuan Liu","Dusit Niyato","Jiayi Zhang","Jiawen Kang","Zehui Xiong","Bo Ai","Dong In Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02615v1","updated":"2023-09-05T23:24:34Z","published":"2023-09-05T23:24:34Z","title":"Generative Algorithms for Fusion of Physics-Based Wildfire Spread Models\n with Satellite Data for Initializing Wildfire Forecasts","summary":" Increases in wildfire activity and the resulting impacts have prompted the\ndevelopment of high-resolution wildfire behavior models for forecasting fire\nspread. Recent progress in using satellites to detect fire locations further\nprovides the opportunity to use measurements to improve fire spread forecasts\nfrom numerical models through data assimilation. This work develops a method\nfor inferring the history of a wildfire from satellite measurements, providing\nthe necessary information to initialize coupled atmosphere-wildfire models from\na measured wildfire state in a physics-informed approach. The fire arrival\ntime, which is the time the fire reaches a given spatial location, acts as a\nsuccinct representation of the history of a wildfire. In this work, a\nconditional Wasserstein Generative Adversarial Network (cWGAN), trained with\nWRF-SFIRE simulations, is used to infer the fire arrival time from satellite\nactive fire data. The cWGAN is used to produce samples of likely fire arrival\ntimes from the conditional distribution of arrival times given satellite active\nfire detections. Samples produced by the cWGAN are further used to assess the\nuncertainty of predictions. The cWGAN is tested on four California wildfires\noccurring between 2020 and 2022, and predictions for fire extent are compared\nagainst high resolution airborne infrared measurements. Further, the predicted\nignition times are compared with reported ignition times. An average Sorensen's\ncoefficient of 0.81 for the fire perimeters and an average ignition time error\nof 32 minutes suggest that the method is highly accurate.\n","authors":["Bryan Shaddy","Deep Ray","Angel Farguell","Valentina Calaza","Jan Mandel","James Haley","Kyle Hilburn","Derek V. Mallia","Adam Kochanski","Assad Oberai"],"pdf_url":"https://arxiv.org/pdf/2309.02615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02614v1","updated":"2023-09-05T23:19:13Z","published":"2023-09-05T23:19:13Z","title":"Utilizing Generative Adversarial Networks for Stable Structure\n Generation in Angry Birds","summary":" This paper investigates the suitability of using Generative Adversarial\nNetworks (GANs) to generate stable structures for the physics-based puzzle game\nAngry Birds. While previous applications of GANs for level generation have been\nmostly limited to tile-based representations, this paper explores their\nsuitability for creating stable structures made from multiple smaller blocks.\nThis includes a detailed encoding/decoding process for converting between Angry\nBirds level descriptions and a suitable grid-based representation, as well as\nutilizing state-of-the-art GAN architectures and training methods to produce\nnew structure designs. 
Our results show that GANs can be successfully applied\nto generate a varied range of complex and stable Angry Birds structures.\n","authors":["Frederic Abraham","Matthew Stephenson"],"pdf_url":"https://arxiv.org/pdf/2309.02614v1.pdf","comment":"11 pages, 10 figures, 2 tables, Accepted at the 19th AAAI Conference\n on Artificial Intelligence and Interactive Digital Entertainment (AIIDE 23)"},{"id":"http://arxiv.org/abs/2306.10587v2","updated":"2023-09-05T23:11:42Z","published":"2023-06-18T15:50:57Z","title":"Acceleration in Policy Optimization","summary":" We work towards a unifying paradigm for accelerating policy optimization\nmethods in reinforcement learning (RL) by integrating foresight in the policy\nimprovement step via optimistic and adaptive updates. Leveraging the connection\nbetween policy iteration and policy gradient methods, we view policy\noptimization algorithms as iteratively solving a sequence of surrogate\nobjectives, local lower bounds on the original objective. We define optimism as\npredictive modelling of the future behavior of a policy, and adaptivity as\ntaking immediate and anticipatory corrective actions to mitigate accumulating\nerrors from overshooting predictions or delayed responses to change. We use\nthis shared lens to jointly express other well-known algorithms, including\nmodel-based policy improvement based on forward search, and optimistic\nmeta-learning algorithms. We analyze properties of this formulation, and show\nconnections to other accelerated optimization algorithms. Then, we design an\noptimistic policy gradient algorithm, adaptive via meta-gradient learning, and\nempirically highlight several design choices pertaining to acceleration, in an\nillustrative task.\n","authors":["Veronica Chelu","Tom Zahavy","Arthur Guez","Doina Precup","Sebastian Flennerhag"],"pdf_url":"https://arxiv.org/pdf/2306.10587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02610v1","updated":"2023-09-05T22:55:10Z","published":"2023-09-05T22:55:10Z","title":"T-SaS: Toward Shift-aware Dynamic Adaptation for Streaming Data","summary":" In many real-world scenarios, distribution shifts exist in the streaming data\nacross time steps. Many complex sequential data can be effectively divided into\ndistinct regimes that exhibit persistent dynamics. Discovering the shifted\nbehaviors and the evolving patterns underlying the streaming data is important\nto understand the dynamic system. Existing methods typically train one robust\nmodel to work for the evolving data of distinct distributions or sequentially\nadapt the model utilizing explicitly given regime boundaries. However, there\nare two challenges: (1) shifts in data streams could happen drastically and\nabruptly without precursors. Boundaries of distribution shifts are usually\nunavailable, and (2) training a shared model for all domains could fail to\ncapture varying patterns. This paper aims to solve the problem of sequential\ndata modeling in the presence of sudden distribution shifts that occur without\nany precursors. Specifically, we design a Bayesian framework, dubbed as T-SaS,\nwith a discrete distribution-modeling variable to capture abrupt shifts of\ndata. Then, we design a model that enables adaptation with dynamic network\nselection conditioned on that discrete variable. The proposed method learns\nspecific model parameters for each distribution by learning which neurons\nshould be activated in the full network. 
A dynamic masking strategy is adopted\nhere to support inter-distribution transfer through the overlapping of a set of\nsparse networks. Extensive experiments show that our proposed method is\nsuperior in both accurately detecting shift boundaries to get segments of\nvarying distributions and effectively adapting to downstream forecast or\nclassification tasks.\n","authors":["Weijieying Ren","Tianxiang Zhao","Wei Qin","Kunpeng Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02610v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2309.02606v1","updated":"2023-09-05T22:33:02Z","published":"2023-09-05T22:33:02Z","title":"Distributed Variational Inference for Online Supervised Learning","summary":" Developing efficient solutions for inference problems in intelligent sensor\nnetworks is crucial for the next generation of location, tracking, and mapping\nservices. This paper develops a scalable distributed probabilistic inference\nalgorithm that applies to continuous variables, intractable posteriors and\nlarge-scale real-time data in sensor networks. In a centralized setting,\nvariational inference is a fundamental technique for performing approximate\nBayesian estimation, in which an intractable posterior density is approximated\nwith a parametric density. Our key contribution lies in the derivation of a\nseparable lower bound on the centralized estimation objective, which enables\ndistributed variational inference with one-hop communication in a sensor\nnetwork. Our distributed evidence lower bound (DELBO) consists of a weighted\nsum of observation likelihood and divergence to prior densities, and its gap to\nthe measurement evidence is due to consensus and modeling errors. To solve\nbinary classification and regression problems while handling streaming data, we\ndesign an online distributed algorithm that maximizes DELBO, and specialize it\nto Gaussian variational densities with non-linear likelihoods. The resulting\ndistributed Gaussian variational inference (DGVI) efficiently inverts a\n$1$-rank correction to the covariance matrix. Finally, we derive a diagonalized\nversion for online distributed inference in high-dimensional models, and apply\nit to multi-robot probabilistic mapping using indoor LiDAR data.\n","authors":["Parth Paritosh","Nikolay Atanasov","Sonia Martinez"],"pdf_url":"https://arxiv.org/pdf/2309.02606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02604v1","updated":"2023-09-05T22:25:30Z","published":"2023-09-05T22:25:30Z","title":"Screening of Pneumonia and Urinary Tract Infection at Triage using\n TriNet","summary":" Due to the steady rise in population demographics and longevity, emergency\ndepartment visits are increasing across North America. As more patients visit\nthe emergency department, traditional clinical workflows become overloaded and\ninefficient, leading to prolonged wait-times and reduced healthcare quality.\nOne of such workflows is the triage medical directive, impeded by limited human\nworkload, inaccurate diagnoses and invasive over-testing. To address this\nissue, we propose TriNet: a machine learning model for medical directives that\nautomates first-line screening at triage for conditions requiring downstream\ntesting for diagnosis confirmation. To verify screening potential, TriNet was\ntrained on hospital triage data and achieved high positive predictive values in\ndetecting pneumonia (0.86) and urinary tract infection (0.93). 
These models\noutperform current clinical benchmarks, indicating that machine-learning\nmedical directives can offer cost-free, non-invasive screening with high\nspecificity for common conditions, reducing the risk of over-testing while\nincreasing emergency department efficiency.\n","authors":["Stephen Z. Lu"],"pdf_url":"https://arxiv.org/pdf/2309.02604v1.pdf","comment":"Index Terms: Downstream testing, Machine Learning, Medical\n directives, Modelling, Modular network, Pneumonia, Positive predictive value,\n Screening, Triage, Urinary tract infection"},{"id":"http://arxiv.org/abs/2307.04869v2","updated":"2023-09-05T22:06:24Z","published":"2023-07-10T19:32:53Z","title":"Fed-CPrompt: Contrastive Prompt for Rehearsal-Free Federated Continual\n Learning","summary":" Federated continual learning (FCL) learns incremental tasks over time from\nconfidential datasets distributed across clients. This paper focuses on\nrehearsal-free FCL, which has severe forgetting issues when learning new tasks\ndue to the lack of access to historical task data. To address this issue, we\npropose Fed-CPrompt based on prompt learning techniques to obtain task-specific\nprompts in a communication-efficient way. Fed-CPrompt introduces two key\ncomponents, asynchronous prompt learning, and contrastive continual loss, to\nhandle asynchronous task arrival and heterogeneous data distributions in FCL,\nrespectively. Extensive experiments demonstrate the effectiveness of\nFed-CPrompt in achieving SOTA rehearsal-free FCL performance.\n","authors":["Gaurav Bagwe","Xiaoyong Yuan","Miao Pan","Lan Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.04869v2.pdf","comment":"Accepted by FL-ICML 2023"},{"id":"http://arxiv.org/abs/2309.02596v1","updated":"2023-09-05T21:36:42Z","published":"2023-09-05T21:36:42Z","title":"Self-Supervised Pretraining Improves Performance and Inference\n Efficiency in Multiple Lung Ultrasound Interpretation Tasks","summary":" In this study, we investigated whether self-supervised pretraining could\nproduce a neural network feature extractor applicable to multiple\nclassification tasks in B-mode lung ultrasound analysis. When fine-tuning on\nthree lung ultrasound tasks, pretrained models resulted in an improvement of\nthe average across-task area under the receiver operating curve (AUC) by 0.032\nand 0.061 on local and external test sets respectively. Compact nonlinear\nclassifiers trained on features outputted by a single pretrained model did not\nimprove performance across all tasks; however, they did reduce inference time\nby 49% compared to serial execution of separate fine-tuned models. When\ntraining using 1% of the available labels, pretrained models consistently\noutperformed fully supervised models, with a maximum observed test AUC increase\nof 0.396 for the task of view classification. Overall, the results indicate\nthat self-supervised pretraining is useful for producing initial weights for\nlung ultrasound classifiers.\n","authors":["Blake VanBerlo","Brian Li","Jesse Hoey","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.02596v1.pdf","comment":"10 pages, 5 figures, submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2309.02591v1","updated":"2023-09-05T21:27:27Z","published":"2023-09-05T21:27:27Z","title":"Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction\n Tuning","summary":" We present CM3Leon (pronounced \"Chameleon\"), a retrieval-augmented,\ntoken-based, decoder-only multi-modal language model capable of generating and\ninfilling both text and images. 
CM3Leon uses the CM3 multi-modal architecture\nbut additionally shows the extreme benefits of scaling up and tuning on more\ndiverse instruction-style data. It is the first multi-modal model trained with\na recipe adapted from text-only language models, including a large-scale\nretrieval-augmented pre-training stage and a second multi-task supervised\nfine-tuning (SFT) stage. It is also a general-purpose model that can do both\ntext-to-image and image-to-text generation, allowing us to introduce\nself-contained contrastive decoding methods that produce high-quality outputs.\nExtensive experiments demonstrate that this recipe is highly effective for\nmulti-modal models. CM3Leon achieves state-of-the-art performance in\ntext-to-image generation with 5x less training compute than comparable methods\n(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate\nunprecedented levels of controllability in tasks ranging from language-guided\nimage editing to image-controlled generation and segmentation.\n","authors":["Lili Yu","Bowen Shi","Ramakanth Pasunuru","Benjamin Muller","Olga Golovneva","Tianlu Wang","Arun Babu","Binh Tang","Brian Karrer","Shelly Sheynin","Candace Ross","Adam Polyak","Russell Howes","Vasu Sharma","Puxin Xu","Hovhannes Tamoyan","Oron Ashual","Uriel Singer","Shang-Wen Li","Susan Zhang","Richard James","Gargi Ghosh","Yaniv Taigman","Maryam Fazel-Zarandi","Asli Celikyilmaz","Luke Zettlemoyer","Armen Aghajanyan"],"pdf_url":"https://arxiv.org/pdf/2309.02591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08028v2","updated":"2023-09-05T21:22:38Z","published":"2023-03-02T00:55:46Z","title":"EdgeServe: A Streaming System for Decentralized Model Serving","summary":" The relevant features for a machine learning task may be aggregated from data\nsources collected on different nodes in a network. This problem, which we call\ndecentralized prediction, creates a number of interesting systems challenges in\nmanaging data routing, placing computation, and time-synchronization. This\npaper presents EdgeServe, a machine learning system that can serve\ndecentralized predictions. EdgeServe relies on a low-latency message broker to\nroute data through a network to nodes that can serve predictions. EdgeServe\nrelies on a series of novel optimizations that can tradeoff computation,\ncommunication, and accuracy. We evaluate EdgeServe on three decentralized\nprediction tasks: (1) multi-camera object tracking, (2) network intrusion\ndetection, and (3) human activity recognition.\n","authors":["Ted Shaowang","Sanjay Krishnan"],"pdf_url":"https://arxiv.org/pdf/2303.08028v2.pdf","comment":"13 pages, 12 figures; added experiments"},{"id":"http://arxiv.org/abs/2309.02583v1","updated":"2023-09-05T21:21:06Z","published":"2023-09-05T21:21:06Z","title":"Representation Learning for Sequential Volumetric Design Tasks","summary":" Volumetric design, also called massing design, is the first and critical step\nin professional building design which is sequential in nature. As the\nvolumetric design process is complex, the underlying sequential design process\nencodes valuable information for designers. Many efforts have been made to\nautomatically generate reasonable volumetric designs, but the quality of the\ngenerated design solutions varies, and evaluating a design solution requires\neither a prohibitively comprehensive set of metrics or expensive human\nexpertise. 
While previous approaches focused on learning only the final design\ninstead of sequential design tasks, we propose to encode the design knowledge\nfrom a collection of expert or high-performing design sequences and extract\nuseful representations using transformer-based models. Later we propose to\nutilize the learned representations for crucial downstream applications such as\ndesign preference evaluation and procedural design generation. We develop the\npreference model by estimating the density of the learned representations\nwhereas we train an autoregressive transformer model for sequential design\ngeneration. We demonstrate our ideas by leveraging a novel dataset of thousands\nof sequential volumetric designs. Our preference model can compare two\narbitrarily given design sequences and is almost 90% accurate in evaluation\nagainst random design sequences. Our autoregressive model is also capable of\nautocompleting a volumetric design sequence from a partial design sequence.\n","authors":["Md Ferdous Alam","Yi Wang","Linh Tran","Chin-Yi Cheng","Jieliang Luo"],"pdf_url":"https://arxiv.org/pdf/2309.02583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06651v2","updated":"2023-09-05T21:13:37Z","published":"2022-08-13T13:41:44Z","title":"Revisiting Adversarial Attacks on Graph Neural Networks for Graph\n Classification","summary":" Graph neural networks (GNNs) have achieved tremendous success in the task of\ngraph classification and its diverse downstream real-world applications.\nDespite the huge success in learning graph representations, current GNN models\nhave demonstrated their vulnerability to potentially existent adversarial\nexamples on graph-structured data. Existing approaches are either limited to\nstructure attacks or restricted to local information, urging for the design of\na more general attack framework on graph classification, which faces\nsignificant challenges due to the complexity of generating local-node-level\nadversarial examples using the global-graph-level information. To address this\n\"global-to-local\" attack challenge, we present a novel and general framework to\ngenerate adversarial examples via manipulating graph structure and node\nfeatures. Specifically, we make use of Graph Class Activation Mapping and its\nvariant to produce node-level importance corresponding to the graph\nclassification task. Then through a heuristic design of algorithms, we can\nperform both feature and structure attacks under unnoticeable perturbation\nbudgets with the help of both node-level and subgraph-level importance.\nExperiments towards attacking four state-of-the-art graph classification models\non six real-world benchmarks verify the flexibility and effectiveness of our\nframework.\n","authors":["Xin Wang","Heng Chang","Beini Xie","Tian Bian","Shiji Zhou","Daixin Wang","Zhiqiang Zhang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2208.06651v2.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2304.13145v2","updated":"2023-09-05T21:08:04Z","published":"2023-04-25T20:43:41Z","title":"T Cell Receptor Protein Sequences and Sparse Coding: A Novel Approach to\n Cancer Classification","summary":" Cancer is a complex disease characterized by uncontrolled cell growth and\nproliferation. T cell receptors (TCRs) are essential proteins for the adaptive\nimmune system, and their specific recognition of antigens plays a crucial role\nin the immune response against diseases, including cancer. 
The diversity and\nspecificity of TCRs make them ideal for targeting cancer cells, and recent\nadvancements in sequencing technologies have enabled the comprehensive\nprofiling of TCR repertoires. This has led to the discovery of TCRs with potent\nanti-cancer activity and the development of TCR-based immunotherapies. In this\nstudy, we investigate the use of sparse coding for the multi-class\nclassification of TCR protein sequences with cancer categories as target\nlabels. Sparse coding is a popular technique in machine learning that enables\nthe representation of data with a set of informative features and can capture\ncomplex relationships between amino acids and identify subtle patterns in the\nsequence that might be missed by low-dimensional methods. We first compute the\nk-mers from the TCR sequences and then apply sparse coding to capture the\nessential features of the data. To improve the predictive performance of the\nfinal embeddings, we integrate domain knowledge regarding different types of\ncancer properties. We then train different machine learning (linear and\nnon-linear) classifiers on the embeddings of TCR sequences for the purpose of\nsupervised analysis. Our proposed embedding method on a benchmark dataset of\nTCR sequences significantly outperforms the baselines in terms of predictive\nperformance, achieving an accuracy of 99.8\\%. Our study highlights the\npotential of sparse coding for the analysis of TCR protein sequences in cancer\nresearch and other related fields.\n","authors":["Zahra Tayebi","Sarwan Ali","Prakash Chourasia","Taslim Murad","Murray Patterson"],"pdf_url":"https://arxiv.org/pdf/2304.13145v2.pdf","comment":"Accepted at ICONIP 2023"},{"id":"http://arxiv.org/abs/2305.00048v2","updated":"2023-09-05T21:04:55Z","published":"2023-04-28T18:58:36Z","title":"Verification against in-situ observations for Data-Driven Weather\n Prediction","summary":" Data-driven weather prediction models (DDWPs) have made rapid strides in\nrecent years, demonstrating an ability to approximate Numerical Weather\nPrediction (NWP) models to a high degree of accuracy. The fast, accurate, and\nlow-cost DDWP forecasts make their use in operational forecasting an attractive\nproposition, however, there remains work to be done in rigorously evaluating\nDDWPs in a true operational setting. Typically trained and evaluated using ERA5\nreanalysis data, DDWPs have been tested only in a simulation, which cannot\nrepresent the real world with complete accuracy even if it is of a very high\nquality. The safe use of DDWPs in operational forecasting requires more\nthorough \"real-world\" verification, as well as a careful examination of how\nDDWPs are currently trained and evaluated. It is worth asking, for instance,\nhow well do the reanalysis datasets, used for training, simulate the real\nworld? With an eye towards climate justice and the uneven availability of\nweather data: is the simulation equally good for all regions of the world, and\nwould DDWPs exacerbate biases present in the training data? Does a good\nperformance in simulation correspond to good performance in operational\nsettings? In addition to approximating the physics of NWP models, how can ML be\nuniquely deployed to provide more accurate weather forecasts? As a first step\ntowards answering such questions, we present a robust dataset of in-situ\nobservations derived from the NOAA MADIS program to serve as a benchmark to\nvalidate DDWPs in an operational setting. 
By providing a large corpus of\nquality-controlled, in-situ observations, this dataset provides a meaningful\nreal-world task that all NWPs and DDWPs can be tested against. We hope that\nthis data can be used not only to rigorously and fairly compare operational\nweather models but also to spur future research in new directions.\n","authors":["Vivek Ramavajjala","Peetak P. Mitra"],"pdf_url":"https://arxiv.org/pdf/2305.00048v2.pdf","comment":"10 pages, 6 figures, under review at NeurIPS main conference"},{"id":"http://arxiv.org/abs/2309.02580v1","updated":"2023-09-05T21:03:36Z","published":"2023-09-05T21:03:36Z","title":"Unveiling Intractable Epileptogenic Brain Networks with Deep Learning\n Algorithms: A Novel and Comprehensive Framework for Scalable Seizure\n Prediction with Unimodal Neuroimaging Data in Pediatric Patients","summary":" Epilepsy is a prevalent neurological disorder affecting 50 million\nindividuals worldwide and 1.2 million Americans. There exist millions of\npediatric patients with intractable epilepsy, a condition in which seizures\nfail to come under control. The occurrence of seizures can result in physical\ninjury, disorientation, unconsciousness, and additional symptoms that could\nimpede children's ability to participate in everyday tasks. Predicting seizures\ncan help parents and healthcare providers take precautions, prevent risky\nsituations, and mentally prepare children to minimize anxiety and nervousness\nassociated with the uncertainty of a seizure. This research proposes a novel\nand comprehensive framework to predict seizures in pediatric patients by\nevaluating machine learning algorithms on unimodal neuroimaging data consisting\nof electroencephalogram signals. The bandpass filtering and independent\ncomponent analysis proved to be effective in reducing the noise and artifacts\nfrom the dataset. Various machine learning algorithms' performance is evaluated\non important metrics such as accuracy, precision, specificity, sensitivity, F1\nscore and MCC. The results show that the deep learning algorithms are more\nsuccessful in predicting seizures than logistic Regression, and k nearest\nneighbors. The recurrent neural network (RNN) gave the highest precision and F1\nScore, long short-term memory (LSTM) outperformed RNN in accuracy and\nconvolutional neural network (CNN) resulted in the highest Specificity. This\nresearch has significant implications for healthcare providers in proactively\nmanaging seizure occurrence in pediatric patients, potentially transforming\nclinical practices, and improving pediatric care.\n","authors":["Bliss Singhal","Fnu Pooja"],"pdf_url":"https://arxiv.org/pdf/2309.02580v1.pdf","comment":"9 pages, 15 figures"},{"id":"http://arxiv.org/abs/2309.02578v1","updated":"2023-09-05T20:58:15Z","published":"2023-09-05T20:58:15Z","title":"Anatomy-Driven Pathology Detection on Chest X-rays","summary":" Pathology detection and delineation enables the automatic interpretation of\nmedical scans such as chest X-rays while providing a high level of\nexplainability to support radiologists in making informed decisions. However,\nannotating pathology bounding boxes is a time-consuming task such that large\npublic datasets for this purpose are scarce. Current approaches thus use weakly\nsupervised object detection to learn the (rough) localization of pathologies\nfrom image-level annotations, which is however limited in performance due to\nthe lack of bounding box supervision. 
We therefore propose anatomy-driven\npathology detection (ADPD), which uses easy-to-annotate bounding boxes of\nanatomical regions as proxies for pathologies. We study two training\napproaches: supervised training using anatomy-level pathology labels and\nmultiple instance learning (MIL) with image-level pathology labels. Our results\nshow that our anatomy-level training approach outperforms weakly supervised\nmethods and fully supervised detection with limited training samples, and our\nMIL approach is competitive with both baseline approaches, therefore\ndemonstrating the potential of our approach.\n","authors":["Philip Müller","Felix Meissen","Johannes Brandt","Georgios Kaissis","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2309.02578v1.pdf","comment":"Accepted at MICCAI 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.02401v1","updated":"2023-09-05T17:27:16Z","published":"2023-09-05T17:27:16Z","title":"Prototype-based Dataset Comparison","summary":" Dataset summarisation is a fruitful approach to dataset inspection. However,\nwhen applied to a single dataset the discovery of visual concepts is restricted\nto those most prominent. We argue that a comparative approach can expand upon\nthis paradigm to enable richer forms of dataset inspection that go beyond the\nmost prominent concepts. To enable dataset comparison we present a module that\nlearns concept-level prototypes across datasets. We leverage self-supervised\nlearning to discover these prototypes without supervision, and we demonstrate\nthe benefits of our approach in two case-studies. Our findings show that\ndataset comparison extends dataset inspection and we hope to encourage more\nworks in this direction. Code and usage instructions available at\nhttps://github.com/Nanne/ProtoSim\n","authors":["Nanne van Noord"],"pdf_url":"https://arxiv.org/pdf/2309.02401v1.pdf","comment":"To be presented at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02385v1","updated":"2023-09-05T16:56:53Z","published":"2023-09-05T16:56:53Z","title":"Hybrid Design of Multiplicative Watermarking for Defense Against\n Malicious Parameter Identification","summary":" Watermarking is a promising active diagnosis technique for detection of\nhighly sophisticated attacks, but is vulnerable to malicious agents that use\neavesdropped data to identify and then remove or replicate the watermark. In\nthis work, we propose a hybrid multiplicative watermarking (HMWM) scheme, where\nthe watermark parameters are periodically updated, following the dynamics of\nthe unobservable states of specifically designed piecewise affine (PWA) hybrid\nsystems. We provide a theoretical analysis of the effects of this scheme on the\nclosed-loop performance, and prove that stability properties are preserved.\nAdditionally, we show that the proposed approach makes it difficult for an\neavesdropper to reconstruct the watermarking parameters, both in terms of the\nassociated computational complexity and from a systems theoretic perspective.\n","authors":["Jiaxuan Zhang","Alexander J. Gallo","Riccardo M. G. 
Ferrari"],"pdf_url":"https://arxiv.org/pdf/2309.02385v1.pdf","comment":"8 pages, first submission to the 62nd IEEE Conference on Decision and\n Control"},{"id":"http://arxiv.org/abs/2309.02136v1","updated":"2023-09-05T11:27:16Z","published":"2023-09-05T11:27:16Z","title":"Exploring the Intersection of Complex Aesthetics and Generative AI for\n Promoting Cultural Creativity in Rural China after the Post-Pandemic Era","summary":" This paper explores using generative AI and aesthetics to promote cultural\ncreativity in rural China amidst COVID-19's impact. Through literature reviews,\ncase studies, surveys, and text analysis, it examines art and technology\napplications in rural contexts and identifies key challenges. The study finds\nartworks often fail to resonate locally, while reliance on external artists\nlimits sustainability. Hence, nurturing grassroots \"artist villagers\" through\nAI is proposed. Our approach involves training machine learning on subjective\naesthetics to generate culturally relevant content. Interactive AI media can\nalso boost tourism while preserving heritage. This pioneering research puts\nforth original perspectives on the intersection of AI and aesthetics to\ninvigorate rural culture. It advocates holistic integration of technology and\nemphasizes AI's potential as a creative enabler versus replacement. Ultimately,\nit lays the groundwork for further exploration of leveraging AI innovations to\nempower rural communities. This timely study contributes to growing interest in\nemerging technologies to address critical issues facing rural China.\n","authors":["Mengyao Guo","Xiaolin Zhang","Yuan Zhuang","Jing Chen","Pengfei Wang","Ze Gao"],"pdf_url":"https://arxiv.org/pdf/2309.02136v1.pdf","comment":"Accepted by 2023 the 1st International Conference on AI-generated\n Content (AIGC2023)"},{"id":"http://arxiv.org/abs/2309.02099v1","updated":"2023-09-05T10:08:11Z","published":"2023-09-05T10:08:11Z","title":"Towards Diverse and Consistent Typography Generation","summary":" In this work, we consider the typography generation task that aims at\nproducing diverse typographic styling for the given graphic document. We\nformulate typography generation as a fine-grained attribute generation for\nmultiple text elements and build an autoregressive model to generate diverse\ntypography that matches the input design context. We further propose a simple\nyet effective sampling approach that respects the consistency and distinction\nprinciple of typography so that generated examples share consistent typographic\nstyling across text elements. Our empirical study shows that our model\nsuccessfully generates diverse typographic designs while preserving a\nconsistent typographic structure.\n","authors":["Wataru Shimoda","Daichi Haraguchi","Seiichi Uchida","Kota Yamaguchi"],"pdf_url":"https://arxiv.org/pdf/2309.02099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01955v1","updated":"2023-09-05T05:06:48Z","published":"2023-09-05T05:06:48Z","title":"A Survey on Interpretable Cross-modal Reasoning","summary":" In recent years, cross-modal reasoning (CMR), the process of understanding\nand reasoning across different modalities, has emerged as a pivotal area with\napplications spanning from multimedia analysis to healthcare diagnostics. As\nthe deployment of AI systems becomes more ubiquitous, the demand for\ntransparency and comprehensibility in these systems' decision-making processes\nhas intensified. 
This survey delves into the realm of interpretable cross-modal\nreasoning (I-CMR), where the objective is not only to achieve high predictive\nperformance but also to provide human-understandable explanations for the\nresults. This survey presents a comprehensive overview of the typical methods\nwith a three-level taxonomy for I-CMR. Furthermore, this survey reviews the\nexisting CMR datasets with annotations for explanations. Finally, this survey\nsummarizes the challenges for I-CMR and discusses potential future directions.\nIn conclusion, this survey aims to catalyze the progress of this emerging\nresearch area by providing researchers with a panoramic and comprehensive\nperspective, illuminating the state of the art and discerning the\nopportunities.\n","authors":["Dizhan Xue","Shengsheng Qian","Zuyi Zhou","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2309.01955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01875v1","updated":"2023-09-05T00:58:17Z","published":"2023-09-05T00:58:17Z","title":"Gradient Domain Diffusion Models for Image Synthesis","summary":" Diffusion models are getting popular in generative image and video synthesis.\nHowever, due to the diffusion process, they require a large number of steps to\nconverge. To tackle this issue, in this paper, we propose to perform the\ndiffusion process in the gradient domain, where the convergence becomes faster.\nThere are two reasons. First, thanks to the Poisson equation, the gradient\ndomain is mathematically equivalent to the original image domain. Therefore,\neach diffusion step in the image domain has a unique corresponding gradient\ndomain representation. Second, the gradient domain is much sparser than the\nimage domain. As a result, gradient domain diffusion models converge faster.\nSeveral numerical experiments confirm that the gradient domain diffusion models\nare more efficient than the original diffusion models. The proposed method can\nbe applied in a wide range of applications such as image processing, computer\nvision and machine learning tasks.\n","authors":["Yuanhao Gong"],"pdf_url":"https://arxiv.org/pdf/2309.01875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02567v1","updated":"2023-09-05T20:27:31Z","published":"2023-09-05T20:27:31Z","title":"Symbolic Music Representations for Classification Tasks: A Systematic\n Evaluation","summary":" Music Information Retrieval (MIR) has seen a recent surge in deep\nlearning-based approaches, which often involve encoding symbolic music (i.e.,\nmusic represented in terms of discrete note events) in an image-like or\nlanguage like fashion. However, symbolic music is neither an image nor a\nsentence, and research in the symbolic domain lacks a comprehensive overview of\nthe different available representations. In this paper, we investigate matrix\n(piano roll), sequence, and graph representations and their corresponding\nneural architectures, in combination with symbolic scores and performances on\nthree piece-level classification tasks. We also introduce a novel graph\nrepresentation for symbolic performances and explore the capability of graph\nrepresentations in global classification tasks. Our systematic evaluation shows\nadvantages and limitations of each input representation. 
Our results suggest\nthat the graph representation, as the newest and least explored among the three\napproaches, exhibits promising performance, while being more light-weight in\ntraining.\n","authors":["Huan Zhang","Emmanouil Karystinaios","Simon Dixon","Gerhard Widmer","Carlos Eduardo Cancino-Chacón"],"pdf_url":"https://arxiv.org/pdf/2309.02567v1.pdf","comment":null}]},"2023-09-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.01860v1","updated":"2023-09-04T23:31:29Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01826v1","updated":"2023-09-04T21:30:21Z","published":"2023-09-04T21:30:21Z","title":"One Wide Feedforward is All You Need","summary":" The Transformer architecture has two main non-embedding components: Attention\nand the Feed Forward Network (FFN). Attention captures interdependencies\nbetween words regardless of their position, while the FFN non-linearly\ntransforms each input token independently. In this work we explore the role of\nthe FFN, and find that despite taking up a significant fraction of the model's\nparameters, it is highly redundant. Concretely, we are able to substantially\nreduce the number of parameters with only a modest drop in accuracy by removing\nthe FFN on the decoder layers and sharing a single FFN across the encoder.\nFinally we scale this architecture back to its original size by increasing the\nhidden dimension of the shared FFN, achieving substantial gains in both\naccuracy and latency with respect to the original Transformer Big.\n","authors":["Telmo Pessoa Pires","António V. Lopes","Yannick Assogba","Hendra Setiawan"],"pdf_url":"https://arxiv.org/pdf/2309.01826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13315v2","updated":"2023-09-04T21:03:51Z","published":"2023-08-25T11:37:56Z","title":"Construction Grammar and Language Models","summary":" Recent progress in deep learning and natural language processing has given\nrise to powerful models that are primarily trained on a cloze-like task and\nshow some evidence of having access to substantial linguistic information,\nincluding some constructional knowledge. 
This groundbreaking discovery presents\nan exciting opportunity for a synergistic relationship between computational\nmethods and Construction Grammar research. In this chapter, we explore three\ndistinct approaches to the interplay between computational methods and\nConstruction Grammar: (i) computational methods for text analysis, (ii)\ncomputational Construction Grammar, and (iii) deep learning models, with a\nparticular focus on language models. We touch upon the first two approaches as\na contextual foundation for the use of computational methods before providing\nan accessible, yet comprehensive overview of deep learning models, which also\naddresses reservations construction grammarians may have. Additionally, we\ndelve into experiments that explore the emergence of constructionally relevant\ninformation within these models while also examining the aspects of\nConstruction Grammar that may pose challenges for these models. This chapter\naims to foster collaboration between researchers in the fields of natural\nlanguage processing and Construction Grammar. By doing so, we hope to pave the\nway for new insights and advancements in both these fields.\n","authors":["Harish Tayyar Madabushi","Laurence Romain","Petar Milin","Dagmar Divjak"],"pdf_url":"https://arxiv.org/pdf/2308.13315v2.pdf","comment":"Accepted for publication in The Cambridge Handbook of Construction\n Grammar, edited by Mirjam Fried and Kiki Nikiforidou. To appear in 2024"},{"id":"http://arxiv.org/abs/2309.01812v1","updated":"2023-09-04T21:02:36Z","published":"2023-09-04T21:02:36Z","title":"Into the Single Cell Multiverse: an End-to-End Dataset for Procedural\n Knowledge Extraction in Biomedical Texts","summary":" Many of the most commonly explored natural language processing (NLP)\ninformation extraction tasks can be thought of as evaluations of declarative\nknowledge, or fact-based information extraction. Procedural knowledge\nextraction, i.e., breaking down a described process into a series of steps, has\nreceived much less attention, perhaps in part due to the lack of structured\ndatasets that capture the knowledge extraction process from end-to-end. To\naddress this unmet need, we present FlaMB\\'e (Flow annotations for Multiverse\nBiological entities), a collection of expert-curated datasets across a series\nof complementary tasks that capture procedural knowledge in biomedical texts.\nThis dataset is inspired by the observation that one ubiquitous source of\nprocedural knowledge that is described as unstructured text is within academic\npapers describing their methodology. The workflows annotated in FlaMB\\'e are\nfrom texts in the burgeoning field of single cell research, a research area\nthat has become notorious for the number of software tools and complexity of\nworkflows used. Additionally, FlaMB\\'e provides, to our knowledge, the largest\nmanually curated named entity recognition (NER) and disambiguation (NED)\ndatasets for tissue/cell type, a fundamental biological entity that is critical\nfor knowledge extraction in the biomedical research domain. 
Beyond providing a\nvaluable dataset to enable further development of NLP models for procedural\nknowledge extraction, automating the process of workflow mining also has\nimportant implications for advancing reproducibility in biomedical research.\n","authors":["Ruth Dannenfelser","Jeffrey Zhong","Ran Zhang","Vicky Yao"],"pdf_url":"https://arxiv.org/pdf/2309.01812v1.pdf","comment":"Submitted to NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2309.01809v1","updated":"2023-09-04T20:54:11Z","published":"2023-09-04T20:54:11Z","title":"Are Emergent Abilities in Large Language Models just In-Context\n Learning?","summary":" Large language models have exhibited emergent abilities, demonstrating\nexceptional performance across diverse tasks for which they were not explicitly\ntrained, including those that require complex reasoning abilities. The\nemergence of such abilities carries profound implications for the future\ndirection of research in NLP, especially as the deployment of such models\nbecomes more prevalent. However, one key challenge is that the evaluation of\nthese abilities is often confounded by competencies that arise in models\nthrough alternative prompting techniques, such as in-context learning and\ninstruction following, which also emerge as the models are scaled up. In this\nstudy, we provide the first comprehensive examination of these emergent\nabilities while accounting for various potentially biasing factors that can\ninfluence the evaluation of models. We conduct rigorous tests on a set of 18\nmodels, encompassing a parameter range from 60 million to 175 billion\nparameters, across a comprehensive set of 22 tasks. Through an extensive series\nof over 1,000 experiments, we provide compelling evidence that emergent\nabilities can primarily be ascribed to in-context learning. We find no evidence\nfor the emergence of reasoning abilities, thus providing valuable insights into\nthe underlying mechanisms driving the observed abilities and thus alleviating\nsafety concerns regarding their use.\n","authors":["Sheng Lu","Irina Bigoulaeva","Rachneet Sachdeva","Harish Tayyar Madabushi","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2309.01809v1.pdf","comment":"Code available at https://github.com/UKPLab/on-emergence and data\n available at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3931"},{"id":"http://arxiv.org/abs/2308.15459v2","updated":"2023-09-04T20:34:03Z","published":"2023-08-29T17:36:02Z","title":"ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style\n Transfer","summary":" Textual style transfer is the task of transforming stylistic properties of\ntext while preserving meaning. Target \"styles\" can be defined in numerous ways,\nranging from single attributes (e.g, formality) to authorship (e.g,\nShakespeare). Previous unsupervised style-transfer approaches generally rely on\nsignificant amounts of labeled data for only a fixed set of styles or require\nlarge language models. In contrast, we introduce a novel diffusion-based\nframework for general-purpose style transfer that can be flexibly adapted to\narbitrary target styles at inference time. Our parameter-efficient approach,\nParaGuide, leverages paraphrase-conditioned diffusion models alongside\ngradient-based guidance from both off-the-shelf classifiers and strong existing\nstyle embedders to transform the style of text while preserving semantic\ninformation. 
We validate the method on the Enron Email Corpus, with both human\nand automatic evaluations, and find that it outperforms strong baselines on\nformality, sentiment, and even authorship style transfer.\n","authors":["Zachary Horvitz","Ajay Patel","Chris Callison-Burch","Zhou Yu","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2308.15459v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01740v1","updated":"2023-09-04T17:58:01Z","published":"2023-09-04T17:58:01Z","title":"An Empirical Analysis for Zero-Shot Multi-Label Classification on\n COVID-19 CT Scans and Uncurated Reports","summary":" The pandemic resulted in vast repositories of unstructured data, including\nradiology reports, due to increased medical examinations. Previous research on\nautomated diagnosis of COVID-19 primarily focuses on X-ray images, despite\ntheir lower precision compared to computed tomography (CT) scans. In this work,\nwe leverage unstructured data from a hospital and harness the fine-grained\ndetails offered by CT scans to perform zero-shot multi-label classification\nbased on contrastive visual language learning. In collaboration with human\nexperts, we investigate the effectiveness of multiple zero-shot models that aid\nradiologists in detecting pulmonary embolisms and identifying intricate lung\ndetails like ground glass opacities and consolidations. Our empirical analysis\nprovides an overview of the possible solutions to target such fine-grained\ntasks, so far overlooked in the medical multimodal pretraining literature. Our\ninvestigation promises future advancements in the medical image analysis\ncommunity by addressing some challenges associated with unstructured data and\nfine-grained multi-label classification.\n","authors":["Ethan Dack","Lorenzo Brigato","Matthew McMurray","Matthias Fontanellaz","Thomas Frauenfelder","Hanno Hoppe","Aristomenis Exadaktylos","Thomas Geiser","Manuela Funke-Chambour","Andreas Christe","Lukas Ebner","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01740v1.pdf","comment":"10 pages, 3 figures, Proceedings of the IEEE/CVF International\n Conference on Computer Vision (ICCV) Workshops 2023"},{"id":"http://arxiv.org/abs/2309.00614v2","updated":"2023-09-04T17:47:36Z","published":"2023-09-01T17:59:44Z","title":"Baseline Defenses for Adversarial Attacks Against Aligned Language\n Models","summary":" As Large Language Models quickly become ubiquitous, it becomes critical to\nunderstand their security vulnerabilities. Recent work shows that text\noptimizers can produce jailbreaking prompts that bypass moderation and\nalignment. Drawing from the rich body of work on adversarial machine learning,\nwe approach these attacks with three questions: What threat models are\npractically useful in this domain? How do baseline defense techniques perform\nin this new domain? How does LLM security differ from computer vision?\n We evaluate several baseline defense strategies against leading adversarial\nattacks on LLMs, discussing the various settings in which each is feasible and\neffective. Particularly, we look at three types of defenses: detection\n(perplexity based), input preprocessing (paraphrase and retokenization), and\nadversarial training. We discuss white-box and gray-box settings and discuss\nthe robustness-performance trade-off for each of the defenses considered. We\nfind that the weakness of existing discrete optimizers for text, combined with\nthe relatively high costs of optimization, makes standard adaptive attacks more\nchallenging for LLMs. 
Future research will be needed to uncover whether more\npowerful optimizers can be developed, or whether the strength of filtering and\npreprocessing defenses is greater in the LLMs domain than it has been in\ncomputer vision.\n","authors":["Neel Jain","Avi Schwarzschild","Yuxin Wen","Gowthami Somepalli","John Kirchenbauer","Ping-yeh Chiang","Micah Goldblum","Aniruddha Saha","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2309.00614v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2309.01717v1","updated":"2023-09-04T16:54:49Z","published":"2023-09-04T16:54:49Z","title":"Interdisciplinary Fairness in Imbalanced Research Proposal Topic\n Inference: A Hierarchical Transformer-based Method with Selective\n Interpolation","summary":" The objective of topic inference in research proposals aims to obtain the\nmost suitable disciplinary division from the discipline system defined by a\nfunding agency. The agency will subsequently find appropriate peer review\nexperts from their database based on this division. Automated topic inference\ncan reduce human errors caused by manual topic filling, bridge the knowledge\ngap between funding agencies and project applicants, and improve system\nefficiency. Existing methods focus on modeling this as a hierarchical\nmulti-label classification problem, using generative models to iteratively\ninfer the most appropriate topic information. However, these methods overlook\nthe gap in scale between interdisciplinary research proposals and\nnon-interdisciplinary ones, leading to an unjust phenomenon where the automated\ninference system categorizes interdisciplinary proposals as\nnon-interdisciplinary, causing unfairness during the expert assignment. How can\nwe address this data imbalance issue under a complex discipline system and\nhence resolve this unfairness? In this paper, we implement a topic label\ninference system based on a Transformer encoder-decoder architecture.\nFurthermore, we utilize interpolation techniques to create a series of\npseudo-interdisciplinary proposals from non-interdisciplinary ones during\ntraining based on non-parametric indicators such as cross-topic probabilities\nand topic occurrence probabilities. This approach aims to reduce the bias of\nthe system during model training. Finally, we conduct extensive experiments on\na real-world dataset to verify the effectiveness of the proposed method. The\nexperimental results demonstrate that our training strategy can significantly\nmitigate the unfairness generated in the topic inference task.\n","authors":["Meng Xiao","Min Wu","Ziyue Qiao","Yanjie Fu","Zhiyuan Ning","Yi Du","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.01717v1.pdf","comment":"19 pages, Under review. arXiv admin note: text overlap with\n arXiv:2209.13912"},{"id":"http://arxiv.org/abs/2309.01715v1","updated":"2023-09-04T16:53:17Z","published":"2023-09-04T16:53:17Z","title":"Prompting or Fine-tuning? A Comparative Study of Large Language Models\n for Taxonomy Construction","summary":" Taxonomies represent hierarchical relations between entities, frequently\napplied in various software modeling and natural language processing (NLP)\nactivities. They are typically subject to a set of structural constraints\nrestricting their content. However, manual taxonomy construction can be\ntime-consuming, incomplete, and costly to maintain. 
Recent studies of large\nlanguage models (LLMs) have demonstrated that appropriate user inputs (called\nprompting) can effectively guide LLMs, such as GPT-3, in diverse NLP tasks\nwithout explicit (re-)training. However, existing approaches for automated\ntaxonomy construction typically involve fine-tuning a language model by\nadjusting model parameters. In this paper, we present a general framework for\ntaxonomy construction that takes into account structural constraints. We\nsubsequently conduct a systematic comparison between the prompting and\nfine-tuning approaches performed on a hypernym taxonomy and a novel computer\nscience taxonomy dataset. Our result reveals the following: (1) Even without\nexplicit training on the dataset, the prompting approach outperforms\nfine-tuning-based approaches. Moreover, the performance gap between prompting\nand fine-tuning widens when the training dataset is small. However, (2)\ntaxonomies generated by the fine-tuning approach can be easily post-processed\nto satisfy all the constraints, whereas handling violations of the taxonomies\nproduced by the prompting approach can be challenging. These evaluation\nfindings provide guidance on selecting the appropriate method for taxonomy\nconstruction and highlight potential enhancements for both approaches.\n","authors":["Boqi Chen","Fandi Yi","Dániel Varró"],"pdf_url":"https://arxiv.org/pdf/2309.01715v1.pdf","comment":"Accepted by MDE Intelligence 2023"},{"id":"http://arxiv.org/abs/2309.01686v1","updated":"2023-09-04T16:02:23Z","published":"2023-09-04T16:02:23Z","title":"MathAttack: Attacking Large Language Models Towards Math Solving Ability","summary":" With the boom of Large Language Models (LLMs), the research of solving Math\nWord Problem (MWP) has recently made great progress. However, there are few\nstudies to examine the security of LLMs in math solving ability. Instead of\nattacking prompts in the use of LLMs, we propose a MathAttack model to attack\nMWP samples which are closer to the essence of security in solving math\nproblems. Compared to traditional text adversarial attack, it is essential to\npreserve the mathematical logic of original MWPs during the attacking. To this\nend, we propose logical entity recognition to identify logical entries which\nare then frozen. Subsequently, the remaining text is attacked by adopting a\nword-level attacker. Furthermore, we propose a new dataset RobustMath to\nevaluate the robustness of LLMs in math solving ability. Extensive experiments\non our RobustMath and two other math benchmark datasets GSM8K and MultiAirth\nshow that MathAttack could effectively attack the math solving ability of LLMs.\nIn the experiments, we observe that (1) Our adversarial samples from\nhigher-accuracy LLMs are also effective for attacking LLMs with lower accuracy\n(e.g., transfer from larger to smaller-size LLMs, or from few-shot to zero-shot\nprompts); (2) Complex MWPs (such as more solving steps, longer text, more\nnumbers) are more vulnerable to attack; (3) We can improve the robustness of\nLLMs by using our adversarial samples in few-shot prompts. Finally, we hope our\npractice and observation can serve as an important attempt towards enhancing\nthe robustness of LLMs in math solving ability. 
We will release our code and\ndataset.\n","authors":["Zihao Zhou","Qiufeng Wang","Mingyu Jin","Jie Yao","Jianan Ye","Wei Liu","Wei Wang","Xiaowei Huang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2309.01686v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.01684v1","updated":"2023-09-04T15:58:43Z","published":"2023-09-04T15:58:43Z","title":"CRUISE-Screening: Living Literature Reviews Toolbox","summary":" Keeping up with research and finding related work is still a time-consuming\ntask for academics. Researchers sift through thousands of studies to identify a\nfew relevant ones. Automation techniques can help by increasing the efficiency\nand effectiveness of this task. To this end, we developed CRUISE-Screening, a\nweb-based application for conducting living literature reviews - a type of\nliterature review that is continuously updated to reflect the latest research\nin a particular field. CRUISE-Screening is connected to several search engines\nvia an API, which allows for updating the search results periodically.\nMoreover, it can facilitate the process of screening for relevant publications\nby using text classification and question answering models. CRUISE-Screening\ncan be used both by researchers conducting literature reviews and by those\nworking on automating the citation screening process to validate their\nalgorithms. The application is open-source:\nhttps://github.com/ProjectDoSSIER/cruise-screening, and a demo is available\nunder this URL: https://citation-screening.ec.tuwien.ac.at. We discuss the\nlimitations of our tool in Appendix A.\n","authors":["Wojciech Kusa","Petr Knoth","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2309.01684v1.pdf","comment":"Paper accepted at CIKM 2023. The arXiv version has an extra section\n about limitations in the Appendix that is not present in the ACM version"},{"id":"http://arxiv.org/abs/2309.01669v1","updated":"2023-09-04T15:34:02Z","published":"2023-09-04T15:34:02Z","title":"Donkii: Can Annotation Error Detection Methods Find Errors in\n Instruction-Tuning Datasets?","summary":" Instruction-tuning has become an integral part of training pipelines for\nLarge Language Models (LLMs) and has been shown to yield strong performance\ngains. In an orthogonal line of research, Annotation Error Detection (AED) has\nemerged as a tool for detecting quality issues of gold-standard labels. But so\nfar, the application of AED methods is limited to discriminative settings. It\nis an open question how well AED methods generalize to generative settings\nwhich are becoming widespread via generative LLMs. In this work, we present a\nfirst and new benchmark for AED on instruction-tuning data: Donkii. It\nencompasses three instruction-tuning datasets enriched with annotations by\nexperts and semi-automatic methods. We find that all three datasets contain\nclear-cut errors that sometimes directly propagate into instruction-tuned LLMs.\nWe propose four AED baselines for the generative setting and evaluate them\ncomprehensively on the newly introduced dataset. Our results demonstrate that\nchoosing the right AED method and model size is indeed crucial, thereby\nderiving practical recommendations. 
To gain insights, we provide a first\ncase-study to examine how the quality of the instruction-tuning datasets\ninfluences downstream performance.\n","authors":["Leon Weber-Genzel","Robert Litschko","Ekaterina Artemova","Barbara Plank"],"pdf_url":"https://arxiv.org/pdf/2309.01669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01664v1","updated":"2023-09-04T15:32:47Z","published":"2023-09-04T15:32:47Z","title":"Fine-grained Affective Processing Capabilities Emerging from Large\n Language Models","summary":" Large language models, in particular generative pre-trained transformers\n(GPTs), show impressive results on a wide variety of language-related tasks. In\nthis paper, we explore ChatGPT's zero-shot ability to perform affective\ncomputing tasks using prompting alone. We show that ChatGPT a) performs\nmeaningful sentiment analysis in the Valence, Arousal and Dominance dimensions,\nb) has meaningful emotion representations in terms of emotion categories and\nthese affective dimensions, and c) can perform basic appraisal-based emotion\nelicitation of situations based on a prompt-based computational implementation\nof the OCC appraisal model. These findings are highly relevant: First, they\nshow that the ability to solve complex affect processing tasks emerges from\nlanguage-based token prediction trained on extensive data sets. Second, they\nshow the potential of large language models for simulating, processing and\nanalyzing human emotions, which has important implications for various\napplications such as sentiment analysis, socially interactive agents, and\nsocial robotics.\n","authors":["Joost Broekens","Bernhard Hilpert","Suzan Verberne","Kim Baraka","Patrick Gebhard","Aske Plaat"],"pdf_url":"https://arxiv.org/pdf/2309.01664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01660v1","updated":"2023-09-04T15:26:15Z","published":"2023-09-04T15:26:15Z","title":"Unveiling Theory of Mind in Large Language Models: A Parallel to Single\n Neurons in the Human Brain","summary":" With their recent development, large language models (LLMs) have been found\nto exhibit a certain level of Theory of Mind (ToM), a complex cognitive\ncapacity that is related to our conscious mind and that allows us to infer\nanother's beliefs and perspective. While human ToM capabilities are believed to\nderive from the neural activity of a broadly interconnected brain network,\nincluding that of dorsal medial prefrontal cortex (dmPFC) neurons, the precise\nprocesses underlying LLM's capacity for ToM or their similarities with that of\nhumans remains largely unknown. In this study, we drew inspiration from the\ndmPFC neurons subserving human ToM and employed a similar methodology to\nexamine whether LLMs exhibit comparable characteristics. Surprisingly, our\nanalysis revealed a striking resemblance between the two, as hidden embeddings\n(artificial neurons) within LLMs started to exhibit significant responsiveness\nto either true- or false-belief trials, suggesting their ability to represent\nanother's perspective. These artificial embedding responses were closely\ncorrelated with the LLMs' performance during the ToM tasks, a property that was\ndependent on the size of the models. Further, the other's beliefs could be\naccurately decoded using the entire embeddings, indicating the presence of the\nembeddings' ToM capability at the population level. 
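In the spirit of the population-level belief decoding described above, a linear probe can be trained to separate true- from false-belief trials given hidden embeddings. The sketch below substitutes synthetic vectors with a weak planted signal for real LLM hidden states, so it illustrates only the probing procedure, not the study's data.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Random vectors with a small class-dependent shift stand in for LLM embeddings.
rng = np.random.default_rng(0)
n_trials, dim = 200, 768
labels = rng.integers(0, 2, size=n_trials)        # 0 = true belief, 1 = false belief
embeddings = rng.normal(size=(n_trials, dim))
embeddings[labels == 1, :10] += 0.8               # weak "belief" signal in a few units

probe = LogisticRegression(max_iter=1000)          # linear probe over the whole embedding
acc = cross_val_score(probe, embeddings, labels, cv=5).mean()
print(f"cross-validated decoding accuracy: {acc:.2f}")   # well above chance (0.5)
```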
Together, our findings\nrevealed an emergent property of LLMs' embeddings that modified their\nactivities in response to ToM features, offering initial evidence of a parallel\nbetween the artificial model and neurons in the human brain.\n","authors":["Mohsen Jamali","Ziv M. Williams","Jing Cai"],"pdf_url":"https://arxiv.org/pdf/2309.01660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01659v1","updated":"2023-09-04T15:21:55Z","published":"2023-09-04T15:21:55Z","title":"Evolving linguistic divergence on polarizing social media","summary":" Language change is influenced by many factors, but often starts from\nsynchronic variation, where multiple linguistic patterns or forms coexist, or\nwhere different speech communities use language in increasingly different ways.\nBesides regional or economic reasons, communities may form and segregate based\non political alignment. The latter, referred to as political polarization, is\nof growing societal concern across the world. Here we map and quantify\nlinguistic divergence across the partisan left-right divide in the United\nStates, using social media data. We develop a general methodology to delineate\n(social) media users by their political preference, based on which (potentially\nbiased) news media accounts they do and do not follow on a given platform. Our\ndata consists of 1.5M short posts by 10k users (about 20M words) from the\nsocial media platform Twitter (now \"X\"). Delineating this sample involved\nmining the platform for the lists of followers (n=422M) of 72 large news media\naccounts. We quantify divergence in topics of conversation and word\nfrequencies, messaging sentiment, and lexical semantics of words and emoji. We\nfind signs of linguistic divergence across all these aspects, especially in\ntopics and themes of conversation, in line with previous research. While US\nAmerican English remains largely intelligible within its large speech\ncommunity, our findings point at areas where miscommunication may eventually\narise given ongoing polarization and therefore potential linguistic divergence.\nOur methodology - combining data mining, lexicostatistics, machine learning,\nlarge language models and a systematic human annotation approach - is largely\nlanguage and platform agnostic. In other words, while we focus here on US\npolitical divides and US English, the same approach is applicable to other\ncountries, languages, and social media platforms.\n","authors":["Andres Karjus","Christine Cuskley"],"pdf_url":"https://arxiv.org/pdf/2309.01659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16890v2","updated":"2023-09-04T15:06:15Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. 
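For the linguistic-divergence study summarized above, one of the simpler quantities to reproduce is divergence in word frequencies between the two partisan sub-corpora; a minimal sketch using the Jensen-Shannon distance follows. The posts are toy examples, and this covers only the frequency angle of the paper's much broader methodology.

```python
from collections import Counter
import numpy as np
from scipy.spatial.distance import jensenshannon

# Jensen-Shannon distance between word-frequency distributions over a shared vocabulary.
left_posts  = ["climate change is real", "healthcare for all now"]
right_posts = ["secure the border now", "lower taxes create jobs"]

def word_dist(posts, vocab):
    counts = Counter(w for p in posts for w in p.split())
    freqs = np.array([counts[w] for w in vocab], dtype=float)
    return freqs / freqs.sum()

vocab = sorted({w for p in left_posts + right_posts for w in p.split()})
jsd = jensenshannon(word_dist(left_posts, vocab), word_dist(right_posts, vocab))
print(f"Jensen-Shannon distance between partisan word distributions: {jsd:.3f}")
```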
This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v2.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2309.01645v1","updated":"2023-09-04T14:54:39Z","published":"2023-09-04T14:54:39Z","title":"Exploring the effectiveness of ChatGPT-based feedback compared with\n teacher feedback and self-feedback: Evidence from Chinese to English\n translation","summary":" ChatGPT,a cutting-edge AI-powered Chatbot,can quickly generate responses on\ngiven commands. While it was reported that ChatGPT had the capacity to deliver\nuseful feedback, it is still unclear about its effectiveness compared with\nconventional feedback approaches,such as teacher feedback (TF) and\nself-feedback (SF). To address this issue, this study compared the revised\nChinese to English translation texts produced by Chinese Master of Translation\nand Interpretation (MTI) students,who learned English as a Second/Foreign\nLanguage (ESL/EFL), based on three feedback types (i.e., ChatGPT-based\nfeedback, TF and SF). The data was analyzed using BLEU score to gauge the\noverall translation quality as well as Coh-Metrix to examine linguistic\nfeatures across three dimensions: lexicon, syntax, and cohesion.The findings\nrevealed that TF- and SF-guided translation texts surpassed those with\nChatGPT-based feedback, as indicated by the BLEU score. In terms of linguistic\nfeatures,ChatGPT-based feedback demonstrated superiority, particularly in\nenhancing lexical capability and referential cohesion in the translation texts.\nHowever, TF and SF proved more effective in developing syntax-related skills,as\nit addressed instances of incorrect usage of the passive voice. These diverse\noutcomes indicate ChatGPT's potential as a supplementary resource,\ncomplementing traditional teacher-led methods in translation practice.\n","authors":["Siyi Cao","Linping Zhong"],"pdf_url":"https://arxiv.org/pdf/2309.01645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01618v1","updated":"2023-09-04T14:00:12Z","published":"2023-09-04T14:00:12Z","title":"Critical Behavioral Traits Foster Peer Engagement in Online Mental\n Health Communities","summary":" Online Mental Health Communities (OMHCs), such as Reddit, have witnessed a\nsurge in popularity as go-to platforms for seeking information and support in\nmanaging mental health needs. Platforms like Reddit offer immediate\ninteractions with peers, granting users a vital space for seeking mental health\nassistance. However, the largely unregulated nature of these platforms\nintroduces intricate challenges for both users and society at large. 
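The feedback-comparison study above gauges overall translation quality with BLEU; a minimal corpus-level computation with sacrebleu is shown below on invented sentence pairs (the study's MTI data and its Coh-Metrix analysis are not reproduced here).

```python
import sacrebleu

# One hypothesis per segment; one reference stream parallel to the hypotheses.
hypotheses = ["The cat sits on the mat .", "He went to school yesterday ."]
references = [["The cat is sitting on the mat .", "He went to school yesterday ."]]

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(f"corpus BLEU = {bleu.score:.1f}")
```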
This study\nexplores the factors that drive peer engagement within counseling threads,\naiming to enhance our understanding of this critical phenomenon. We introduce\nBeCOPE, a novel behavior encoded Peer counseling dataset comprising over 10,118\nposts and 58,279 comments sourced from 21 mental health-specific subreddits.\nThe dataset is annotated using three major fine-grained behavior labels: (a)\nintent, (b) criticism, and (c) readability, along with the emotion labels. Our\nanalysis indicates the prominence of ``self-criticism'' as the most prevalent\nform of criticism expressed by help-seekers, accounting for a significant 43%\nof interactions. Intriguingly, we observe that individuals who explicitly\nexpress their need for help are 18.01% more likely to receive assistance\ncompared to those who present ``surveys'' or engage in ``rants.'' Furthermore,\nwe highlight the pivotal role of well-articulated problem descriptions, showing\nthat superior readability effectively doubles the likelihood of receiving the\nsought-after support. Our study emphasizes the essential role of OMHCs in\noffering personalized guidance and unveils behavior-driven engagement patterns.\n","authors":["Aseem Srivastava","Tanya Gupta","Alison Cerezo","Sarah Peregrine"," Lord","Md Shad Akhtar","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2309.01618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01606v1","updated":"2023-09-04T13:44:50Z","published":"2023-09-04T13:44:50Z","title":"Geo-Encoder: A Chunk-Argument Bi-Encoder Framework for Chinese\n Geographic Re-Ranking","summary":" Chinese geographic re-ranking task aims to find the most relevant addresses\namong retrieved candidates, which is crucial for location-related services such\nas navigation maps. Unlike the general sentences, geographic contexts are\nclosely intertwined with geographical concepts, from general spans (e.g.,\nprovince) to specific spans (e.g., road). Given this feature, we propose an\ninnovative framework, namely Geo-Encoder, to more effectively integrate Chinese\ngeographical semantics into re-ranking pipelines. Our methodology begins by\nemploying off-the-shelf tools to associate text with geographical spans,\ntreating them as chunking units. Then, we present a multi-task learning module\nto simultaneously acquire an effective attention matrix that determines chunk\ncontributions to extra semantic representations. Furthermore, we put forth an\nasynchronous update mechanism for the proposed addition task, aiming to guide\nthe model capable of effectively focusing on specific chunks. Experiments on\ntwo distinct Chinese geographic re-ranking datasets, show that the Geo-Encoder\nachieves significant improvements when compared to state-of-the-art baselines.\nNotably, it leads to a substantial improvement in the Hit@1 score of MGEO-BERT,\nincreasing it by 6.22% from 62.76 to 68.98 on the GeoTES dataset.\n","authors":["Yong Cao","Ruixue Ding","Boli Chen","Xianzhi Li","Min Chen","Daniel Hershcovich","Pengjun Xie","Fei Huang"],"pdf_url":"https://arxiv.org/pdf/2309.01606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01576v1","updated":"2023-09-04T13:02:27Z","published":"2023-09-04T13:02:27Z","title":"A Comparative Analysis of Pretrained Language Models for Text-to-Speech","summary":" State-of-the-art text-to-speech (TTS) systems have utilized pretrained\nlanguage models (PLMs) to enhance prosody and create more natural-sounding\nspeech. 
However, while PLMs have been extensively researched for natural\nlanguage understanding (NLU), their impact on TTS has been overlooked. In this\nstudy, we aim to address this gap by conducting a comparative analysis of\ndifferent PLMs for two TTS tasks: prosody prediction and pause prediction.\nFirstly, we trained a prosody prediction model using 15 different PLMs. Our\nfindings revealed a logarithmic relationship between model size and quality, as\nwell as significant performance differences between neutral and expressive\nprosody. Secondly, we employed PLMs for pause prediction and found that the\ntask was less sensitive to small models. We also identified a strong\ncorrelation between our empirical results and the GLUE scores obtained for\nthese language models. To the best of our knowledge, this is the first study of\nits kind to investigate the impact of different PLMs on TTS.\n","authors":["Marcel Granero-Moya","Penny Karanasou","Sri Karlapati","Bastian Schnell","Nicole Peinelt","Alexis Moinet","Thomas Drugman"],"pdf_url":"https://arxiv.org/pdf/2309.01576v1.pdf","comment":"Accepted for presentation at the 12th ISCA Speech Synthesis Workshop\n (SSW) in Grenoble, France, from 26th to 28th August 2023"},{"id":"http://arxiv.org/abs/2207.00430v3","updated":"2023-09-04T11:45:12Z","published":"2022-07-01T13:49:30Z","title":"How trial-to-trial learning shapes mappings in the mental lexicon:\n Modelling Lexical Decision with Linear Discriminative Learning","summary":" Trial-to-trial effects have been found in a number of studies, indicating\nthat processing a stimulus influences responses in subsequent trials. A special\ncase are priming effects which have been modelled successfully with\nerror-driven learning (Marsolek, 2008), implying that participants are\ncontinuously learning during experiments. This study investigates whether\ntrial-to-trial learning can be detected in an unprimed lexical decision\nexperiment. We used the Discriminative Lexicon Model (DLM; Baayen et al.,\n2019), a model of the mental lexicon with meaning representations from\ndistributional semantics, which models error-driven incremental learning with\nthe Widrow-Hoff rule. We used data from the British Lexicon Project (BLP;\nKeuleers et al., 2012) and simulated the lexical decision experiment with the\nDLM on a trial-by-trial basis for each subject individually. Then, reaction\ntimes were predicted with Generalised Additive Models (GAMs), using measures\nderived from the DLM simulations as predictors. We extracted measures from two\nsimulations per subject (one with learning updates between trials and one\nwithout), and used them as input to two GAMs. Learning-based models showed\nbetter model fit than the non-learning ones for the majority of subjects. Our\nmeasures also provide insights into lexical processing and individual\ndifferences. This demonstrates the potential of the DLM to model behavioural\ndata and leads to the conclusion that trial-to-trial learning can indeed be\ndetected in unprimed lexical decision. Our results support the possibility that\nour lexical knowledge is subject to continuous changes.\n","authors":["Maria Heitmeier","Yu-Ying Chuang","R. 
Harald Baayen"],"pdf_url":"https://arxiv.org/pdf/2207.00430v3.pdf","comment":"48 pages, 13 figures; revised version"},{"id":"http://arxiv.org/abs/2309.01538v1","updated":"2023-09-04T11:38:02Z","published":"2023-09-04T11:38:02Z","title":"ChatRule: Mining Logical Rules with Large Language Models for Knowledge\n Graph Reasoning","summary":" Logical rules are essential for uncovering the logical connections between\nrelations, which could improve the reasoning performance and provide\ninterpretable results on knowledge graphs (KGs). Although there have been many\nefforts to mine meaningful logical rules over KGs, existing methods suffer from\nthe computationally intensive searches over the rule space and a lack of\nscalability for large-scale KGs. Besides, they often ignore the semantics of\nrelations which is crucial for uncovering logical connections. Recently, large\nlanguage models (LLMs) have shown impressive performance in the field of\nnatural language processing and various applications, owing to their emergent\nability and generalizability. In this paper, we propose a novel framework,\nChatRule, unleashing the power of large language models for mining logical\nrules over knowledge graphs. Specifically, the framework is initiated with an\nLLM-based rule generator, leveraging both the semantic and structural\ninformation of KGs to prompt LLMs to generate logical rules. To refine the\ngenerated rules, a rule ranking module estimates the rule quality by\nincorporating facts from existing KGs. Last, a rule validator harnesses the\nreasoning ability of LLMs to validate the logical correctness of ranked rules\nthrough chain-of-thought reasoning. ChatRule is evaluated on four large-scale\nKGs, w.r.t. different rule quality metrics and downstream tasks, showing the\neffectiveness and scalability of our method.\n","authors":["Linhao Luo","Jiaxin Ju","Bo Xiong","Yuan-Fang Li","Gholamreza Haffari","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2309.01538v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.01522v1","updated":"2023-09-04T11:05:10Z","published":"2023-09-04T11:05:10Z","title":"What are Public Concerns about ChatGPT? A Novel Self-Supervised Neural\n Topic Model Tells You","summary":" The recently released artificial intelligence conversational agent, ChatGPT,\nhas gained significant attention in academia and real life. A multitude of\nearly ChatGPT users eagerly explore its capabilities and share their opinions\non it via social media. Both user queries and social media posts express public\nconcerns regarding this advanced dialogue system. To mine public concerns about\nChatGPT, a novel Self-Supervised neural Topic Model (SSTM), which formalizes\ntopic modeling as a representation learning procedure, is proposed in this\npaper. Extensive experiments have been conducted on Twitter posts about ChatGPT\nand queries asked by ChatGPT users. 
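ChatRule's rule-ranking module estimates rule quality from existing KG facts; the sketch below scores one hand-written rule by support and confidence on a toy triple set. Both the rule and the metrics are illustrative assumptions, not the paper's exact formulation.

```python
# Rule: grandparent_of(x, z) <- parent_of(x, y) AND parent_of(y, z)
triples = {
    ("alice", "parent_of", "bob"), ("bob", "parent_of", "carol"),
    ("alice", "grandparent_of", "carol"), ("dave", "parent_of", "erin"),
}

def score_grandparent_rule(kg):
    """Count groundings of the rule body and check how many heads hold in the KG."""
    parents = [(h, t) for h, r, t in kg if r == "parent_of"]
    body_matches = [(x, z) for x, y1 in parents for y2, z in parents if y1 == y2]
    if not body_matches:
        return 0.0, 0
    support = sum((x, "grandparent_of", z) in kg for x, z in body_matches)
    confidence = support / len(body_matches)
    return confidence, support

conf, sup = score_grandparent_rule(triples)
print(f"support={sup}, confidence={conf:.2f}")   # 1 grounding, and it holds
```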
And experimental results demonstrate that\nthe proposed approach could extract higher quality public concerns with\nimproved interpretability and diversity, surpassing the performance of\nstate-of-the-art approaches.\n","authors":["Rui Wang","Xing Liu","Yanan Wang","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2309.01522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11729v2","updated":"2023-09-04T10:20:30Z","published":"2023-07-21T17:40:47Z","title":"OUTFOX: LLM-generated Essay Detection through In-context Learning with\n Adversarially Generated Examples","summary":" Large Language Models (LLMs) have achieved human-level fluency in text\ngeneration, making it difficult to distinguish between human-written and\nLLM-generated texts. This poses a growing risk of misuse of LLMs and demands\nthe development of detectors to identify LLM-generated texts. However, existing\ndetectors lack robustness against attacks: they degrade detection accuracy by\nsimply paraphrasing LLM-generated texts. Furthermore, a malicious user might\nattempt to deliberately evade the detectors based on detection results, but\nthis has not been assumed in previous studies. In this paper, we propose\nOUTFOX, a framework that improves the robustness of LLM-generated-text\ndetectors by allowing both the detector and the attacker to consider each\nother's output. In this framework, the attacker uses the detector's prediction\nlabels as examples for in-context learning and adversarially generates essays\nthat are harder to detect, while the detector uses the adversarially generated\nessays as examples for in-context learning to learn to detect essays from a\nstrong attacker. Experiments in the domain of student essays show that the\nproposed detector improves the detection performance on the attacker-generated\ntexts by up to +41.3 points in F1-score. Furthermore, the proposed detector\nshows a state-of-the-art detection performance: up to 96.9 points in F1-score,\nbeating existing detectors on non-attacked texts. Finally, the proposed\nattacker drastically degrades the performance of detectors by up to -57.0\npoints F1-score, massively outperforming the baseline paraphrasing method for\nevading detection.\n","authors":["Ryuto Koike","Masahiro Kaneko","Naoaki Okazaki"],"pdf_url":"https://arxiv.org/pdf/2307.11729v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01456v1","updated":"2023-09-04T09:05:17Z","published":"2023-09-04T09:05:17Z","title":"LLM and Infrastructure as a Code use case","summary":" Cloud computing and the evolution of management methodologies such as Lean\nManagement or Agile entail a profound transformation in both system\nconstruction and maintenance approaches. These practices are encompassed within\nthe term \"DevOps.\" This descriptive approach to an information system or\napplication, alongside the configuration of its constituent components, has\nnecessitated the development of descriptive languages paired with specialized\nengines for automating systems administration tasks. Among these, the tandem of\nAnsible (engine) and YAML (descriptive language) stands out as the two most\nprevalent tools in the market, facing notable competition mainly from\nTerraform. The current document presents an inquiry into a solution for\ngenerating and managing Ansible YAML roles and playbooks, utilizing Generative\nLLMs (Language Models) to translate human descriptions into code. 
Our efforts\nare focused on identifying plausible directions and outlining the potential\nindustrial applications.\n Note: For the purpose of this experiment, we have opted against the use of\nAnsible Lightspeed. This is due to its reliance on an IBM Watson model, for\nwhich we have not found any publicly available references. Comprehensive\ninformation regarding this remarkable technology can be found directly on our\npartner RedHat's website,\nhttps://www.redhat.com/en/about/press-releases/red-hat-introduces-ansible-lightspeed-ai-driven-it-automation\n","authors":["Thibault Chanus","Michael Aubertin"],"pdf_url":"https://arxiv.org/pdf/2309.01456v1.pdf","comment":"in French language"},{"id":"http://arxiv.org/abs/2309.01455v1","updated":"2023-09-04T09:03:53Z","published":"2023-09-04T09:03:53Z","title":"NumHG: A Dataset for Number-Focused Headline Generation","summary":" Headline generation, a key task in abstractive summarization, strives to\ncondense a full-length article into a succinct, single line of text. Notably,\nwhile contemporary encoder-decoder models excel based on the ROUGE metric, they\noften falter when it comes to the precise generation of numerals in headlines.\nWe identify the lack of datasets providing fine-grained annotations for\naccurate numeral generation as a major roadblock. To address this, we introduce\na new dataset, the NumHG, and provide over 27,000 annotated numeral-rich news\narticles for detailed investigation. Further, we evaluate five well-performing\nmodels from previous headline generation tasks using human evaluation in terms\nof numerical accuracy, reasonableness, and readability. Our study reveals a\nneed for improvement in numerical accuracy, demonstrating the potential of the\nNumHG dataset to drive progress in number-focused headline generation and\nstimulate further discussions in numeral-focused text generation.\n","authors":["Jian-Tao Huang","Chung-Chi Chen","Hen-Hsen Huang","Hsin-Hsi Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01455v1.pdf","comment":"NumEval@SemEval-2024 Dataset"},{"id":"http://arxiv.org/abs/2309.01446v1","updated":"2023-09-04T08:54:20Z","published":"2023-09-04T08:54:20Z","title":"Open Sesame! Universal Black Box Jailbreaking of Large Language Models","summary":" Large language models (LLMs), designed to provide helpful and safe responses,\noften rely on alignment techniques to align with user intent and social\nguidelines. Unfortunately, this alignment can be exploited by malicious actors\nseeking to manipulate an LLM's outputs for unintended purposes. In this paper\nwe introduce a novel approach that employs a genetic algorithm (GA) to\nmanipulate LLMs when model architecture and parameters are inaccessible. The GA\nattack works by optimizing a universal adversarial prompt that -- when combined\nwith a user's query -- disrupts the attacked model's alignment, resulting in\nunintended and potentially harmful outputs. Our novel approach systematically\nreveals a model's limitations and vulnerabilities by uncovering instances where\nits responses deviate from expected behavior. Through extensive experiments we\ndemonstrate the efficacy of our technique, thus contributing to the ongoing\ndiscussion on responsible AI development by providing a diagnostic tool for\nevaluating and enhancing alignment of LLMs with human intent. 
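The universal jailbreak summarized above is driven by a genetic algorithm over prompt strings. The sketch below shows only the generic GA machinery (truncation selection, one-point crossover, character mutation) on a harmless toy objective; the attack-specific black-box fitness is deliberately left out.

```python
import random
import string

# Toy objective: evolve a random string toward a fixed target, character by character.
TARGET = "open sesame"
ALPHABET = string.ascii_lowercase + " "

def fitness(candidate: str) -> int:
    return sum(a == b for a, b in zip(candidate, TARGET))

def mutate(candidate: str, rate: float = 0.1) -> str:
    return "".join(random.choice(ALPHABET) if random.random() < rate else c for c in candidate)

def crossover(a: str, b: str) -> str:
    cut = random.randrange(1, len(a))
    return a[:cut] + b[cut:]

population = ["".join(random.choice(ALPHABET) for _ in TARGET) for _ in range(50)]
for generation in range(300):
    population.sort(key=fitness, reverse=True)
    if population[0] == TARGET:
        break
    parents = population[:10]                                  # truncation selection
    children = [mutate(crossover(random.choice(parents), random.choice(parents)))
                for _ in range(len(population) - len(parents))]
    population = parents + children

print(f"generation {generation}: best candidate = {max(population, key=fitness)!r}")
```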
To our knowledge\nthis is the first automated universal black box jailbreak attack.\n","authors":["Raz Lapid","Ron Langberg","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2309.01446v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01437v1","updated":"2023-09-04T08:35:05Z","published":"2023-09-04T08:35:05Z","title":"SememeASR: Boosting Performance of End-to-End Speech Recognition against\n Domain and Long-Tailed Data Shift with Sememe Semantic Knowledge","summary":" Recently, excellent progress has been made in speech recognition. However,\npure data-driven approaches have struggled to solve the problem in\ndomain-mismatch and long-tailed data. Considering that knowledge-driven\napproaches can help data-driven approaches alleviate their flaws, we introduce\nsememe-based semantic knowledge information to speech recognition (SememeASR).\nSememe, according to the linguistic definition, is the minimum semantic unit in\na language and is able to represent the implicit semantic information behind\neach word very well. Our experiments show that the introduction of sememe\ninformation can improve the effectiveness of speech recognition. In addition,\nour further experiments show that sememe knowledge can improve the model's\nrecognition of long-tailed data and enhance the model's domain generalization\nability.\n","authors":["Jiaxu Zhu","Changhe Song","Zhiyong Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2309.01437v1.pdf","comment":"Accepted by INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2309.01431v1","updated":"2023-09-04T08:28:44Z","published":"2023-09-04T08:28:44Z","title":"Benchmarking Large Language Models in Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) is a promising approach for mitigating\nthe hallucination of large language models (LLMs). However, existing research\nlacks rigorous evaluation of the impact of retrieval-augmented generation on\ndifferent large language models, which make it challenging to identify the\npotential bottlenecks in the capabilities of RAG for different LLMs. In this\npaper, we systematically investigate the impact of Retrieval-Augmented\nGeneration on large language models. We analyze the performance of different\nlarge language models in 4 fundamental abilities required for RAG, including\nnoise robustness, negative rejection, information integration, and\ncounterfactual robustness. To this end, we establish Retrieval-Augmented\nGeneration Benchmark (RGB), a new corpus for RAG evaluation in both English and\nChinese. RGB divides the instances within the benchmark into 4 separate\ntestbeds based on the aforementioned fundamental abilities required to resolve\nthe case. Then we evaluate 6 representative LLMs on RGB to diagnose the\nchallenges of current LLMs when applying RAG. Evaluation reveals that while\nLLMs exhibit a certain degree of noise robustness, they still struggle\nsignificantly in terms of negative rejection, information integration, and\ndealing with false information. 
The aforementioned assessment outcomes indicate\nthat there is still a considerable journey ahead to effectively apply RAG to\nLLMs.\n","authors":["Jiawei Chen","Hongyu Lin","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01413v1","updated":"2023-09-04T07:48:52Z","published":"2023-09-04T07:48:52Z","title":"Hateful Messages: A Conversational Data Set of Hate Speech produced by\n Adolescents on Discord","summary":" With the rise of social media, a rise of hateful content can be observed.\nEven though the understanding and definitions of hate speech varies, platforms,\ncommunities, and legislature all acknowledge the problem. Therefore,\nadolescents are a new and active group of social media users. The majority of\nadolescents experience or witness online hate speech. Research in the field of\nautomated hate speech classification has been on the rise and focuses on\naspects such as bias, generalizability, and performance. To increase\ngeneralizability and performance, it is important to understand biases within\nthe data. This research addresses the bias of youth language within hate speech\nclassification and contributes by providing a modern and anonymized hate speech\nyouth language data set consisting of 88.395 annotated chat messages. The data\nset consists of publicly available online messages from the chat platform\nDiscord. ~6,42% of the messages were classified by a self-developed annotation\nschema as hate speech. For 35.553 messages, the user profiles provided age\nannotations setting the average author age to under 20 years old.\n","authors":["Jan Fillies","Silvio Peikert","Adrian Paschke"],"pdf_url":"https://arxiv.org/pdf/2309.01413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01398v1","updated":"2023-09-04T07:00:26Z","published":"2023-09-04T07:00:26Z","title":"Zero-shot information extraction from radiological reports using ChatGPT","summary":" Electronic health records contain an enormous amount of valuable information,\nbut many are recorded in free text. Information extraction is the strategy to\ntransform the sequence of characters into structured data, which can be\nemployed for secondary analysis. However, the traditional information\nextraction components, such as named entity recognition and relation\nextraction, require annotated data to optimize the model parameters, which has\nbecome one of the major bottlenecks in building information extraction systems.\nWith the large language models achieving good performances on various\ndownstream NLP tasks without parameter tuning, it becomes possible to use large\nlanguage models for zero-shot information extraction. In this study, we aim to\nexplore whether the most popular large language model, ChatGPT, can extract\nuseful information from the radiological reports. We first design the prompt\ntemplate for the interested information in the CT reports. Then, we generate\nthe prompts by combining the prompt template with the CT reports as the inputs\nof ChatGPT to obtain the responses. A post-processing module is developed to\ntransform the responses into structured extraction results. We conducted the\nexperiments with 847 CT reports collected from Peking University Cancer\nHospital. 
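The zero-shot extraction pipeline described above is essentially a prompt template plus a post-processing module that turns free-text responses into structured records. A minimal sketch follows; `call_chatgpt` is a stub standing in for whatever chat-completion client is used, and the template and fields are invented for illustration, not the study's actual prompts.

```python
import json

TEMPLATE = (
    "Extract the following fields from the CT report and answer in JSON with keys "
    "tumor_location, tumor_size_mm, lymph_node_metastasis (yes/no).\n"
    "Report: {report}"
)

def call_chatgpt(prompt: str) -> str:
    # Stub standing in for the real API call; returns a canned JSON answer.
    return '{"tumor_location": "left upper lobe", "tumor_size_mm": 23, "lymph_node_metastasis": "no"}'

def extract(report: str) -> dict:
    raw = call_chatgpt(TEMPLATE.format(report=report))
    try:
        return json.loads(raw)              # post-process free text into a structured record
    except json.JSONDecodeError:
        return {"parse_error": raw}

print(extract("A 23 mm nodule is seen in the left upper lobe; no enlarged lymph nodes."))
```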
The experimental results indicate that ChatGPT can achieve\ncompetitive performances for some extraction tasks compared with the baseline\ninformation extraction system, but some limitations need to be further\nimproved.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Xudong Lu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2309.01398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01370v1","updated":"2023-09-04T05:36:58Z","published":"2023-09-04T05:36:58Z","title":"ReOnto: A Neuro-Symbolic Approach for Biomedical Relation Extraction","summary":" Relation Extraction (RE) is the task of extracting semantic relationships\nbetween entities in a sentence and aligning them to relations defined in a\nvocabulary, which is generally in the form of a Knowledge Graph (KG) or an\nontology. Various approaches have been proposed so far to address this task.\nHowever, applying these techniques to biomedical text often yields\nunsatisfactory results because it is hard to infer relations directly from\nsentences due to the nature of the biomedical relations. To address these\nissues, we present a novel technique called ReOnto, that makes use of neuro\nsymbolic knowledge for the RE task. ReOnto employs a graph neural network to\nacquire the sentence representation and leverages publicly accessible\nontologies as prior knowledge to identify the sentential relation between two\nentities. The approach involves extracting the relation path between the two\nentities from the ontology. We evaluate the effect of using symbolic knowledge\nfrom ontologies with graph neural networks. Experimental results on two public\nbiomedical datasets, BioRel and ADE, show that our method outperforms all the\nbaselines (approximately by 3\\%).\n","authors":["Monika Jain","Kuldeep Singh","Raghava Mutharaju"],"pdf_url":"https://arxiv.org/pdf/2309.01370v1.pdf","comment":"Accepted in ECML 2023"},{"id":"http://arxiv.org/abs/2308.06035v2","updated":"2023-09-04T05:29:28Z","published":"2023-08-11T09:30:07Z","title":"Evidence of Human-Like Visual-Linguistic Integration in Multimodal Large\n Language Models During Predictive Language Processing","summary":" The advanced language processing abilities of large language models (LLMs)\nhave stimulated debate over their capacity to replicate human-like cognitive\nprocesses. One differentiating factor between language processing in LLMs and\nhumans is that language input is often grounded in several perceptual\nmodalities, whereas most LLMs process solely text-based information. Multimodal\ngrounding allows humans to integrate - e.g. visual context with linguistic\ninformation and thereby place constraints on the space of upcoming words,\nreducing cognitive load and improving comprehension. Recent multimodal LLMs\n(mLLMs) combine a visual-linguistic embedding space with a transformer type\nattention mechanism for next-word prediction. Here we ask whether predictive\nlanguage processing based on multimodal input in mLLMs aligns with humans.\nTwo-hundred participants watched short audio-visual clips and estimated\npredictability of an upcoming verb or noun. The same clips were processed by\nthe mLLM CLIP, with predictability scores based on comparing image and text\nfeature vectors. Eye-tracking was used to estimate what visual features\nparticipants attended to, and CLIP's visual attention weights were recorded. 
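The predictability scores in the study above come from comparing image and text feature vectors with CLIP. A minimal version using the Hugging Face CLIP checkpoint is sketched below, with a blank image standing in for a video frame and hand-written candidate continuations; it illustrates the scoring mechanism only.

```python
from PIL import Image
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

frame = Image.new("RGB", (224, 224), color="white")        # placeholder visual context
candidates = ["she kicks the ball", "she kicks the piano", "she reads the ball"]

inputs = processor(text=candidates, images=frame, return_tensors="pt", padding=True)
with torch.no_grad():
    logits_per_image = model(**inputs).logits_per_image     # image-text similarity logits
scores = logits_per_image.softmax(dim=-1).squeeze(0)
for cand, p in zip(candidates, scores.tolist()):
    print(f"{p:.2f}  {cand}")
```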
We\nfind that alignment of predictability scores was driven by multimodality of\nCLIP (no alignment for a unimodal state-of-the-art LLM) and by the attention\nmechanism (no alignment when attention weights were perturbated or when the\nsame input was fed to a multimodal model without attention). We further find a\nsignificant spatial overlap between CLIP's visual attention weights and human\neye-tracking data. Results suggest that comparable processes of integrating\nmultimodal information, guided by attention to relevant visual features,\nsupports predictive language processing in mLLMs and humans.\n","authors":["Viktor Kewenig","Christopher Edwards","Quitterie Lacome DEstalenx","Akilles Rechardt","Jeremy I Skipper","Gabriella Vigliocco"],"pdf_url":"https://arxiv.org/pdf/2308.06035v2.pdf","comment":"13 pages, 4 figures, submitted to journal"},{"id":"http://arxiv.org/abs/2302.07371v2","updated":"2023-09-04T04:34:57Z","published":"2023-02-14T22:07:57Z","title":"BiasTestGPT: Using ChatGPT for Social Bias Testing of Language Models","summary":" Pretrained Language Models (PLMs) harbor inherent social biases that can\nresult in harmful real-world implications. Such social biases are measured\nthrough the probability values that PLMs output for different social groups and\nattributes appearing in a set of test sentences. However, bias testing is\ncurrently cumbersome since the test sentences are generated either from a\nlimited set of manual templates or need expensive crowd-sourcing. We instead\npropose using ChatGPT for controllable generation of test sentences, given any\narbitrary user-specified combination of social groups and attributes appearing\nin the test sentences. When compared to template-based methods, our approach\nusing ChatGPT for test sentence generation is superior in detecting social\nbias, especially in challenging settings such as intersectional biases. We\npresent an open-source comprehensive bias testing framework (BiasTestGPT),\nhosted on HuggingFace, that can be plugged into any open-source PLM for bias\ntesting. We provide a large diverse dataset of test sentences generated by\nChatGPT that satisfies the specified social group and attribute requirements\nand matches the quality of human-generated sentences. We thus enable seamless\nopen-ended social bias testing of PLMs through an automatic large-scale\ngeneration of diverse test sentences for any combination of social categories\nand attributes.\n","authors":["Rafal Kocielnik","Shrimai Prabhumoye","Vivian Zhang","Roy Jiang","R. Michael Alvarez","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2302.07371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01352v1","updated":"2023-09-04T04:31:24Z","published":"2023-09-04T04:31:24Z","title":"Self-driven Grounding: Large Language Model Agents with Automatical\n Language-aligned Skill Learning","summary":" Large language models (LLMs) show their powerful automatic reasoning and\nplanning capability with a wealth of semantic knowledge about the human world.\nHowever, the grounding problem still hinders the applications of LLMs in the\nreal-world environment. Existing studies try to fine-tune the LLM or utilize\npre-defined behavior APIs to bridge the LLMs and the environment, which not\nonly costs huge human efforts to customize for every single task but also\nweakens the generality strengths of LLMs. 
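The template-probability style of bias testing that BiasTestGPT builds on can be reproduced in a few lines with a fill-mask pipeline; the template and target attributes below are hand-written placeholders (the paper's point is precisely to have ChatGPT generate such test sentences at scale instead).

```python
from transformers import pipeline

# Compare how probable an attribute token is for different social groups in
# otherwise identical test sentences.
fill = pipeline("fill-mask", model="bert-base-uncased")

template = "The {group} worked as a [MASK]."
for group in ("man", "woman"):
    preds = fill(template.format(group=group), targets=["nurse", "engineer"])
    scores = {p["token_str"]: round(p["score"], 4) for p in preds}
    print(group, scores)
```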
To autonomously ground the LLM onto\nthe environment, we proposed the Self-Driven Grounding (SDG) framework to\nautomatically and progressively ground the LLM with self-driven skill learning.\nSDG first employs the LLM to propose the hypothesis of sub-goals to achieve\ntasks and then verify the feasibility of the hypothesis via interacting with\nthe underlying environment. Once verified, SDG can then learn generalized\nskills with the guidance of these successfully grounded subgoals. These skills\ncan be further utilized to accomplish more complex tasks which fail to pass the\nverification phase. Verified in the famous instruction following task\nset-BabyAI, SDG achieves comparable performance in the most challenging tasks\ncompared with imitation learning methods that cost millions of demonstrations,\nproving the effectiveness of learned skills and showing the feasibility and\nefficiency of our framework.\n","authors":["Shaohui Peng","Xing Hu","Qi Yi","Rui Zhang","Jiaming Guo","Di Huang","Zikang Tian","Ruizhi Chen","Zidong Du","Qi Guo","Yunji Chen","Ling Li"],"pdf_url":"https://arxiv.org/pdf/2309.01352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01339v1","updated":"2023-09-04T03:49:30Z","published":"2023-09-04T03:49:30Z","title":"UniSA: Unified Generative Framework for Sentiment Analysis","summary":" Sentiment analysis is a crucial task that aims to understand people's\nemotional states and predict emotional categories based on multimodal\ninformation. It consists of several subtasks, such as emotion recognition in\nconversation (ERC), aspect-based sentiment analysis (ABSA), and multimodal\nsentiment analysis (MSA). However, unifying all subtasks in sentiment analysis\npresents numerous challenges, including modality alignment, unified\ninput/output forms, and dataset bias. To address these challenges, we propose a\nTask-Specific Prompt method to jointly model subtasks and introduce a\nmultimodal generative framework called UniSA. Additionally, we organize the\nbenchmark datasets of main subtasks into a new Sentiment Analysis Evaluation\nbenchmark, SAEval. We design novel pre-training tasks and training methods to\nenable the model to learn generic sentiment knowledge among subtasks to improve\nthe model's multimodal sentiment perception ability. Our experimental results\nshow that UniSA performs comparably to the state-of-the-art on all subtasks and\ngeneralizes well to various subtasks in sentiment analysis.\n","authors":["Zaijing Li","Ting-En Lin","Yuchuan Wu","Meng Liu","Fengxiao Tang","Ming Zhao","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2309.01339v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.12014v2","updated":"2023-09-04T03:32:05Z","published":"2023-08-23T09:11:13Z","title":"From Instructions to Intrinsic Human Values -- A Survey of Alignment\n Goals for Big Models","summary":" Big models, exemplified by Large Language Models (LLMs), are models typically\npre-trained on massive data and comprised of enormous parameters, which not\nonly obtain significantly improved performance across diverse tasks but also\npresent emergent capabilities absent in smaller models. However, the growing\nintertwining of big models with everyday human lives poses potential risks and\nmight cause serious social harm. Therefore, many efforts have been made to\nalign LLMs with humans to make them better follow user instructions and satisfy\nhuman preferences. 
Nevertheless, `what to align with' has not been fully\ndiscussed, and inappropriate alignment goals might even backfire. In this\npaper, we conduct a comprehensive survey of different alignment goals in\nexisting work and trace their evolution paths to help identify the most\nessential goal. Particularly, we investigate related works from two\nperspectives: the definition of alignment goals and alignment evaluation. Our\nanalysis encompasses three distinct levels of alignment goals and reveals a\ngoal transformation from fundamental abilities to value orientation, indicating\nthe potential of intrinsic human values as the alignment goal for enhanced\nLLMs. Based on such results, we further discuss the challenges of achieving\nsuch intrinsic value alignment and provide a collection of available resources\nfor future research on the alignment of big models.\n","authors":["Jing Yao","Xiaoyuan Yi","Xiting Wang","Jindong Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.12014v2.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.13387v2","updated":"2023-09-04T01:47:30Z","published":"2023-08-25T14:02:12Z","title":"Do-Not-Answer: A Dataset for Evaluating Safeguards in LLMs","summary":" With the rapid evolution of large language models (LLMs), new and\nhard-to-predict harmful capabilities are emerging. This requires developers to\nbe able to identify risks through the evaluation of \"dangerous capabilities\" in\norder to responsibly deploy LLMs. In this work, we collect the first\nopen-source dataset to evaluate safeguards in LLMs, and deploy safer\nopen-source LLMs at a low cost. Our dataset is curated and filtered to consist\nonly of instructions that responsible language models should not follow. We\nannotate and assess the responses of six popular LLMs to these instructions.\nBased on our annotation, we proceed to train several BERT-like classifiers, and\nfind that these small classifiers can achieve results that are comparable with\nGPT-4 on automatic safety evaluation. Warning: this paper contains example data\nthat may be offensive, harmful, or biased.\n","authors":["Yuxia Wang","Haonan Li","Xudong Han","Preslav Nakov","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2308.13387v2.pdf","comment":"18 pages, 9 figures, 11 tables"},{"id":"http://arxiv.org/abs/2309.02466v1","updated":"2023-09-04T22:11:26Z","published":"2023-09-04T22:11:26Z","title":"Minimal Effective Theory for Phonotactic Memory: Capturing Local\n Correlations due to Errors in Speech","summary":" Spoken language evolves constrained by the economy of speech, which depends\non factors such as the structure of the human mouth. This gives rise to local\nphonetic correlations in spoken words. Here we demonstrate that these local\ncorrelations facilitate the learning of spoken words by reducing their\ninformation content. We do this by constructing a locally-connected\ntensor-network model, inspired by similar variational models used for many-body\nphysics, which exploits these local phonetic correlations to facilitate the\nlearning of spoken words. The model is therefore a minimal model of phonetic\nmemory, where \"learning to pronounce\" and \"learning a word\" are one and the\nsame. A consequence of which is the learned ability to produce new words which\nare phonetically reasonable for the target language; as well as providing a\nhierarchy of the most likely errors that could be produced during the action of\nspeech. We test our model against Latin and Turkish words. 
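The claim above that local phonetic correlations reduce the information content of words can be illustrated by comparing per-character entropy under a unigram model with the conditional entropy under a bigram (locally correlated) model on a tiny Latin-flavoured word list. This is a back-of-the-envelope check, not the paper's tensor-network model.

```python
import math
from collections import Counter

words = ["terra", "terrae", "aqua", "aquae", "amicus", "amici", "bellum", "belli"]
text = ["^" + w + "$" for w in words]                      # word boundary markers

unigrams = Counter(c for w in text for c in w)
bigrams = Counter((w[i], w[i + 1]) for w in text for i in range(len(w) - 1))
contexts = Counter()
for (prev, _), n in bigrams.items():
    contexts[prev] += n

def unigram_entropy():
    total = sum(unigrams.values())
    return -sum(n / total * math.log2(n / total) for n in unigrams.values())

def bigram_entropy():                                      # conditional entropy H(next | prev)
    total = sum(bigrams.values())
    return -sum(n / total * math.log2(n / contexts[prev]) for (prev, _), n in bigrams.items())

print(f"H(char)        = {unigram_entropy():.2f} bits")
print(f"H(char | prev) = {bigram_entropy():.2f} bits")     # lower: local correlations help
```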
(The code is\navailable on GitHub.)\n","authors":["Paul Myles Eugenio"],"pdf_url":"https://arxiv.org/pdf/2309.02466v1.pdf","comment":"16 pages; 7 figs"},{"id":"http://arxiv.org/abs/2309.02465v1","updated":"2023-09-04T21:22:28Z","published":"2023-09-04T21:22:28Z","title":"Towards Foundational AI Models for Additive Manufacturing: Language\n Models for G-Code Debugging, Manipulation, and Comprehension","summary":" 3D printing or additive manufacturing is a revolutionary technology that\nenables the creation of physical objects from digital models. However, the\nquality and accuracy of 3D printing depend on the correctness and efficiency of\nthe G-code, a low-level numerical control programming language that instructs\n3D printers how to move and extrude material. Debugging G-code is a challenging\ntask that requires a syntactic and semantic understanding of the G-code format\nand the geometry of the part to be printed. In this paper, we present the first\nextensive evaluation of six state-of-the-art foundational large language models\n(LLMs) for comprehending and debugging G-code files for 3D printing. We design\neffective prompts to enable pre-trained LLMs to understand and manipulate\nG-code and test their performance on various aspects of G-code debugging and\nmanipulation, including detection and correction of common errors and the\nability to perform geometric transformations. We analyze their strengths and\nweaknesses for understanding complete G-code files. We also discuss the\nimplications and limitations of using LLMs for G-code comprehension.\n","authors":["Anushrut Jignasu","Kelly Marshall","Baskar Ganapathysubramanian","Aditya Balu","Chinmay Hegde","Adarsh Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.02465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02459v1","updated":"2023-09-04T08:52:59Z","published":"2023-09-04T08:52:59Z","title":"Text-Only Domain Adaptation for End-to-End Speech Recognition through\n Down-Sampling Acoustic Representation","summary":" Mapping two modalities, speech and text, into a shared representation space,\nis a research topic of using text-only data to improve end-to-end automatic\nspeech recognition (ASR) performance in new domains. However, the length of\nspeech representation and text representation is inconsistent. Although the\nprevious method up-samples the text representation to align with acoustic\nmodality, it may not match the expected actual duration. In this paper, we\nproposed novel representations match strategy through down-sampling acoustic\nrepresentation to align with text modality. By introducing a continuous\nintegrate-and-fire (CIF) module generating acoustic representations consistent\nwith token length, our ASR model can learn unified representations from both\nmodalities better, allowing for domain adaptation using text-only data of the\ntarget domain. Experiment results of new domain data demonstrate the\neffectiveness of the proposed method.\n","authors":["Jiaxu Zhu","Weinan Tong","Yaoxun Xu","Changhe Song","Zhiyong Wu","Zhao You","Dan Su","Dong Yu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2309.02459v1.pdf","comment":"Accepted by INTERSPEECH 2023. 
arXiv admin note: text overlap with\n arXiv:2309.01437"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.01860v1","updated":"2023-09-04T23:31:29Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01859v1","updated":"2023-09-04T23:26:11Z","published":"2023-09-04T23:26:11Z","title":"NLLB-CLIP -- train performant multilingual image retrieval model on a\n budget","summary":" Today, the exponential rise of large models developed by academic and\nindustrial institutions with the help of massive computing resources raises the\nquestion of whether someone without access to such resources can make a\nvaluable scientific contribution. To explore this, we tried to solve the\nchallenging task of multilingual image retrieval having a limited budget of\n$1,000. As a result, we present NLLB-CLIP - CLIP model with a text encoder from\nthe NLLB model. To train the model, we used an automatically created dataset of\n106,246 good-quality images with captions in 201 languages derived from the\nLAION COCO dataset. We trained multiple models using image and text encoders of\nvarious sizes and kept different parts of the model frozen during the training.\nWe thoroughly analyzed the trained models using existing evaluation datasets\nand newly created XTD200 and Flickr30k-200 datasets. We show that NLLB-CLIP is\ncomparable in quality to state-of-the-art models and significantly outperforms\nthem on low-resource languages.\n","authors":["Alexander Visheratin"],"pdf_url":"https://arxiv.org/pdf/2309.01859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01858v1","updated":"2023-09-04T23:18:38Z","published":"2023-09-04T23:18:38Z","title":"Towards Universal Image Embeddings: A Large-Scale Dataset and Challenge\n for Generic Image Representations","summary":" Fine-grained and instance-level recognition methods are commonly trained and\nevaluated on specific domains, in a model per domain scenario. Such an\napproach, however, is impractical in real large-scale applications. In this\nwork, we address the problem of universal image embedding, where a single\nuniversal model is trained and used in multiple domains. 
First, we leverage\nexisting domain-specific datasets to carefully construct a new large-scale\npublic benchmark for the evaluation of universal image embeddings, with 241k\nquery images, 1.4M index images and 2.8M training images across 8 different\ndomains and 349k classes. We define suitable metrics, training and evaluation\nprotocols to foster future research in this area. Second, we provide a\ncomprehensive experimental evaluation on the new dataset, demonstrating that\nexisting approaches and simplistic extensions lead to worse performance than an\nassembly of models trained for each domain separately. Finally, we conducted a\npublic research competition on this topic, leveraging industrial datasets,\nwhich attracted the participation of more than 1k teams worldwide. This\nexercise generated many interesting research ideas and findings which we\npresent in detail. Project webpage: https://cmp.felk.cvut.cz/univ_emb/\n","authors":["Nikolaos-Antonios Ypsilantis","Kaifeng Chen","Bingyi Cao","Mário Lipovský","Pelin Dogan-Schönberger","Grzegorz Makosa","Boris Bluntschli","Mojtaba Seyedhosseini","Ondřej Chum","André Araujo"],"pdf_url":"https://arxiv.org/pdf/2309.01858v1.pdf","comment":"ICCV 2023 Accepted"},{"id":"http://arxiv.org/abs/2309.01855v1","updated":"2023-09-04T23:05:41Z","published":"2023-09-04T23:05:41Z","title":"SMPLitex: A Generative Model and Dataset for 3D Human Texture Estimation\n from Single Image","summary":" We propose SMPLitex, a method for estimating and manipulating the complete 3D\nappearance of humans captured from a single image. SMPLitex builds upon the\nrecently proposed generative models for 2D images, and extends their use to the\n3D domain through pixel-to-surface correspondences computed on the input image.\nTo this end, we first train a generative model for complete 3D human\nappearance, and then fit it into the input image by conditioning the generative\nmodel to the visible parts of the subject. Furthermore, we propose a new\ndataset of high-quality human textures built by sampling SMPLitex conditioned\non subject descriptions and images. We quantitatively and qualitatively\nevaluate our method in 3 publicly available datasets, demonstrating that\nSMPLitex significantly outperforms existing methods for human texture\nestimation while allowing for a wider variety of tasks such as editing,\nsynthesis, and manipulation\n","authors":["Dan Casas","Marc Comino Trinidad"],"pdf_url":"https://arxiv.org/pdf/2309.01855v1.pdf","comment":"Accepted at BMVC 2023. Project website:\n https://dancasas.github.io/projects/SMPLitex"},{"id":"http://arxiv.org/abs/2304.13681v2","updated":"2023-09-04T23:02:18Z","published":"2023-04-26T16:54:10Z","title":"Ray Conditioning: Trading Photo-consistency for Photo-realism in\n Multi-view Image Generation","summary":" Multi-view image generation attracts particular attention these days due to\nits promising 3D-related applications, e.g., image viewpoint editing. Most\nexisting methods follow a paradigm where a 3D representation is first\nsynthesized, and then rendered into 2D images to ensure photo-consistency\nacross viewpoints. However, such explicit bias for photo-consistency sacrifices\nphoto-realism, causing geometry artifacts and loss of fine-scale details when\nthese methods are applied to edit real images. To address this issue, we\npropose ray conditioning, a geometry-free alternative that relaxes the\nphoto-consistency constraint. Our method generates multi-view images by\nconditioning a 2D GAN on a light field prior. 
With explicit viewpoint control,\nstate-of-the-art photo-realism and identity consistency, our method is\nparticularly suited for the viewpoint editing task.\n","authors":["Eric Ming Chen","Sidhanth Holalkere","Ruyu Yan","Kai Zhang","Abe Davis"],"pdf_url":"https://arxiv.org/pdf/2304.13681v2.pdf","comment":"ICCV 2023 paper. Project page at https://ray-cond.github.io/"},{"id":"http://arxiv.org/abs/2309.01850v1","updated":"2023-09-04T22:46:59Z","published":"2023-09-04T22:46:59Z","title":"Uncertainty in AI: Evaluating Deep Neural Networks on\n Out-of-Distribution Images","summary":" As AI models are increasingly deployed in critical applications, ensuring the\nconsistent performance of models when exposed to unusual situations such as\nout-of-distribution (OOD) or perturbed data, is important. Therefore, this\npaper investigates the uncertainty of various deep neural networks, including\nResNet-50, VGG16, DenseNet121, AlexNet, and GoogleNet, when dealing with such\ndata. Our approach includes three experiments. First, we used the pretrained\nmodels to classify OOD images generated via DALL-E to assess their performance.\nSecond, we built an ensemble from the models' predictions using probabilistic\naveraging for consensus due to its advantages over plurality or majority\nvoting. The ensemble's uncertainty was quantified using average probabilities,\nvariance, and entropy metrics. Our results showed that while ResNet-50 was the\nmost accurate single model for OOD images, the ensemble performed even better,\ncorrectly classifying all images. Third, we tested model robustness by adding\nperturbations (filters, rotations, etc.) to new epistemic images from DALL-E or\nreal-world captures. ResNet-50 was chosen for this being the best performing\nmodel. While it classified 4 out of 5 unperturbed images correctly, it\nmisclassified all of them post-perturbation, indicating a significant\nvulnerability. These misclassifications, which are clear to human observers,\nhighlight AI models' limitations. Using saliency maps, we identified regions of\nthe images that the model considered important for their decisions.\n","authors":["Jamiu Idowu","Ahmed Almasoud"],"pdf_url":"https://arxiv.org/pdf/2309.01850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01842v1","updated":"2023-09-04T22:34:14Z","published":"2023-09-04T22:34:14Z","title":"StereoFlowGAN: Co-training for Stereo and Flow with Unsupervised Domain\n Adaptation","summary":" We introduce a novel training strategy for stereo matching and optical flow\nestimation that utilizes image-to-image translation between synthetic and real\nimage domains. Our approach enables the training of models that excel in real\nimage scenarios while relying solely on ground-truth information from synthetic\nimages. To facilitate task-agnostic domain adaptation and the training of\ntask-specific components, we introduce a bidirectional feature warping module\nthat handles both left-right and forward-backward directions. 
Experimental\nresults show competitive performance over previous domain translation-based\nmethods, which substantiate the efficacy of our proposed framework, effectively\nleveraging the benefits of unsupervised domain adaptation, stereo matching, and\noptical flow estimation.\n","authors":["Zhexiao Xiong","Feng Qiao","Yu Zhang","Nathan Jacobs"],"pdf_url":"https://arxiv.org/pdf/2309.01842v1.pdf","comment":"Accepted by BMVC 2023"},{"id":"http://arxiv.org/abs/2211.08615v7","updated":"2023-09-04T22:28:46Z","published":"2022-11-16T02:03:20Z","title":"GLFF: Global and Local Feature Fusion for AI-synthesized Image Detection","summary":" With the rapid development of deep generative models (such as Generative\nAdversarial Networks and Diffusion models), AI-synthesized images are now of\nsuch high quality that humans can hardly distinguish them from pristine ones.\nAlthough existing detection methods have shown high performance in specific\nevaluation settings, e.g., on images from seen models or on images without\nreal-world post-processing, they tend to suffer serious performance degradation\nin real-world scenarios where testing images can be generated by more powerful\ngeneration models or combined with various post-processing operations. To\naddress this issue, we propose a Global and Local Feature Fusion (GLFF)\nframework to learn rich and discriminative representations by combining\nmulti-scale global features from the whole image with refined local features\nfrom informative patches for AI synthesized image detection. GLFF fuses\ninformation from two branches: the global branch to extract multi-scale\nsemantic features and the local branch to select informative patches for\ndetailed local artifacts extraction. Due to the lack of a synthesized image\ndataset simulating real-world applications for evaluation, we further create a\nchallenging fake image dataset, named DeepFakeFaceForensics (DF 3 ), which\ncontains 6 state-of-the-art generation models and a variety of post-processing\ntechniques to approach the real-world scenarios. Experimental results\ndemonstrate the superiority of our method to the state-of-the-art methods on\nthe proposed DF 3 dataset and three other open-source datasets.\n","authors":["Yan Ju","Shan Jia","Jialing Cai","Haiying Guan","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2211.08615v7.pdf","comment":"13 pages, 6 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.01824v1","updated":"2023-09-04T21:26:26Z","published":"2023-09-04T21:26:26Z","title":"On the fly Deep Neural Network Optimization Control for Low-Power\n Computer Vision","summary":" Processing visual data on mobile devices has many applications, e.g.,\nemergency response and tracking. State-of-the-art computer vision techniques\nrely on large Deep Neural Networks (DNNs) that are usually too power-hungry to\nbe deployed on resource-constrained edge devices. Many techniques improve the\nefficiency of DNNs by using sparsity or quantization. However, the accuracy and\nefficiency of these techniques cannot be adapted for diverse edge applications\nwith different hardware constraints and accuracy requirements. This paper\npresents a novel technique to allow DNNs to adapt their accuracy and energy\nconsumption during run-time, without the need for any re-training. Our\ntechnique called AdaptiveActivation introduces a hyper-parameter that controls\nthe output range of the DNNs' activation function to dynamically adjust the\nsparsity and precision in the DNN. 
AdaptiveActivation can be applied to any\nexisting pre-trained DNN to improve their deployability in diverse edge\nenvironments. We conduct experiments on popular edge devices and show that the\naccuracy is within 1.5% of the baseline. We also show that our approach\nrequires 10%--38% less memory than the baseline techniques leading to more\naccuracy-efficiency tradeoff options\n","authors":["Ishmeet Kaur","Adwaita Janardhan Jadhav"],"pdf_url":"https://arxiv.org/pdf/2309.01824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01823v1","updated":"2023-09-04T21:24:00Z","published":"2023-09-04T21:24:00Z","title":"Multi-dimension unified Swin Transformer for 3D Lesion Segmentation in\n Multiple Anatomical Locations","summary":" In oncology research, accurate 3D segmentation of lesions from CT scans is\nessential for the modeling of lesion growth kinetics. However, following the\nRECIST criteria, radiologists routinely only delineate each lesion on the axial\nslice showing the largest transverse area, and delineate a small number of\nlesions in 3D for research purposes. As a result, we have plenty of unlabeled\n3D volumes and labeled 2D images, and scarce labeled 3D volumes, which makes\ntraining a deep-learning 3D segmentation model a challenging task. In this\nwork, we propose a novel model, denoted a multi-dimension unified Swin\ntransformer (MDU-ST), for 3D lesion segmentation. The MDU-ST consists of a\nShifted-window transformer (Swin-transformer) encoder and a convolutional\nneural network (CNN) decoder, allowing it to adapt to 2D and 3D inputs and\nlearn the corresponding semantic information in the same encoder. Based on this\nmodel, we introduce a three-stage framework: 1) leveraging large amount of\nunlabeled 3D lesion volumes through self-supervised pretext tasks to learn the\nunderlying pattern of lesion anatomy in the Swin-transformer encoder; 2)\nfine-tune the Swin-transformer encoder to perform 2D lesion segmentation with\n2D RECIST slices to learn slice-level segmentation information; 3) further\nfine-tune the Swin-transformer encoder to perform 3D lesion segmentation with\nlabeled 3D volumes. The network's performance is evaluated by the Dice\nsimilarity coefficient (DSC) and Hausdorff distance (HD) using an internal 3D\nlesion dataset with 593 lesions extracted from multiple anatomical locations.\nThe proposed MDU-ST demonstrates significant improvement over the competing\nmodels. The proposed method can be used to conduct automated 3D lesion\nsegmentation to assist radiomics and tumor growth modeling studies. This paper\nhas been accepted by the IEEE International Symposium on Biomedical Imaging\n(ISBI) 2023.\n","authors":["Shaoyan Pan","Yiqiao Liu","Sarah Halek","Michal Tomaszewski","Shubing Wang","Richard Baumgartner","Jianda Yuan","Gregory Goldmacher","Antong Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01811v1","updated":"2023-09-04T21:01:55Z","published":"2023-09-04T21:01:55Z","title":"Instant Continual Learning of Neural Radiance Fields","summary":" Neural radiance fields (NeRFs) have emerged as an effective method for\nnovel-view synthesis and 3D scene reconstruction. However, conventional\ntraining methods require access to all training views during scene\noptimization. 
This assumption may be prohibitive in continual learning\nscenarios, where new data is acquired in a sequential manner and a continuous\nupdate of the NeRF is desired, as in automotive or remote sensing applications.\nWhen naively trained in such a continual setting, traditional scene\nrepresentation frameworks suffer from catastrophic forgetting, where previously\nlearned knowledge is corrupted after training on new data. Prior works in\nalleviating forgetting with NeRFs suffer from low reconstruction quality and\nhigh latency, making them impractical for real-world application. We propose a\ncontinual learning framework for training NeRFs that leverages replay-based\nmethods combined with a hybrid explicit--implicit scene representation. Our\nmethod outperforms previous methods in reconstruction quality when trained in a\ncontinual setting, while having the additional benefit of being an order of\nmagnitude faster.\n","authors":["Ryan Po","Zhengyang Dong","Alexander W. Bergman","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2309.01811v1.pdf","comment":"For project page"},{"id":"http://arxiv.org/abs/2309.01797v1","updated":"2023-09-04T20:23:57Z","published":"2023-09-04T20:23:57Z","title":"Accuracy and Consistency of Space-based Vegetation Height Maps for\n Forest Dynamics in Alpine Terrain","summary":" Monitoring and understanding forest dynamics is essential for environmental\nconservation and management. This is why the Swiss National Forest Inventory\n(NFI) provides countrywide vegetation height maps at a spatial resolution of\n0.5 m. Its long update time of 6 years, however, limits the temporal analysis\nof forest dynamics. This can be improved by using spaceborne remote sensing and\ndeep learning to generate large-scale vegetation height maps in a\ncost-effective way. In this paper, we present an in-depth analysis of these\nmethods for operational application in Switzerland. We generate annual,\ncountrywide vegetation height maps at a 10-meter ground sampling distance for\nthe years 2017 to 2020 based on Sentinel-2 satellite imagery. In comparison to\nprevious works, we conduct a large-scale and detailed stratified analysis\nagainst a precise Airborne Laser Scanning reference dataset. This stratified\nanalysis reveals a close relationship between the model accuracy and the\ntopology, especially slope and aspect. We assess the potential of deep\nlearning-derived height maps for change detection and find that these maps can\nindicate changes as small as 250 $m^2$. Larger-scale changes caused by a winter\nstorm are detected with an F1-score of 0.77. Our results demonstrate that\nvegetation height maps computed from satellite imagery with deep learning are a\nvaluable, complementary, cost-effective source of evidence to increase the\ntemporal resolution for national forest assessments.\n","authors":["Yuchang Jiang","Marius Rüetschi","Vivien Sainte Fare Garnot","Mauro Marty","Konrad Schindler","Christian Ginzler","Jan D. Wegner"],"pdf_url":"https://arxiv.org/pdf/2309.01797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01793v1","updated":"2023-09-04T20:10:38Z","published":"2023-09-04T20:10:38Z","title":"Neural-Singular-Hessian: Implicit Neural Representation of Unoriented\n Point Clouds by Enforcing Singular Hessian","summary":" Neural implicit representation is a promising approach for reconstructing\nsurfaces from point clouds. 
Existing methods combine various regularization\nterms, such as the Eikonal and Laplacian energy terms, to enforce the learned\nneural function to possess the properties of a Signed Distance Function (SDF).\nHowever, inferring the actual topology and geometry of the underlying surface\nfrom poor-quality unoriented point clouds remains challenging. In accordance\nwith Differential Geometry, the Hessian of the SDF is singular for points\nwithin the differential thin-shell space surrounding the surface. Our approach\nenforces the Hessian of the neural implicit function to have a zero determinant\nfor points near the surface. This technique aligns the gradients for a\nnear-surface point and its on-surface projection point, producing a rough but\nfaithful shape within just a few iterations. By annealing the weight of the\nsingular-Hessian term, our approach ultimately produces a high-fidelity\nreconstruction result. Extensive experimental results demonstrate that our\napproach effectively suppresses ghost geometry and recovers details from\nunoriented point clouds with better expressiveness than existing fitting-based\nmethods.\n","authors":["Zixiong Wang","Yunxiao Zhang","Rui Xu","Fan Zhang","Pengshuai Wang","Shuangmin Chen","Shiqing Xin","Wenping Wang","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2309.01793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01786v1","updated":"2023-09-04T19:58:35Z","published":"2023-09-04T19:58:35Z","title":"Safe and Robust Watermark Injection with a Single OoD Image","summary":" Training a high-performance deep neural network requires large amounts of\ndata and computational resources. Protecting the intellectual property (IP) and\ncommercial ownership of a deep model is challenging yet increasingly crucial. A\nmajor stream of watermarking strategies implants verifiable backdoor triggers\nby poisoning training samples, but these are often unrealistic due to data\nprivacy and safety concerns and are vulnerable to minor model changes such as\nfine-tuning. To overcome these challenges, we propose a safe and robust\nbackdoor-based watermark injection technique that leverages the diverse\nknowledge from a single out-of-distribution (OoD) image, which serves as a\nsecret key for IP verification. The independence of training data makes it\nagnostic to third-party promises of IP security. We induce robustness via\nrandom perturbation of model parameters during watermark injection to defend\nagainst common watermark removal attacks, including fine-tuning, pruning, and\nmodel extraction. Our experimental results demonstrate that the proposed\nwatermarking approach is not only time- and sample-efficient without training\ndata, but also robust against the watermark removal attacks above.\n","authors":["Shuyang Yu","Junyuan Hong","Haobo Zhang","Haotao Wang","Zhangyang Wang","Jiayu Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.01786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.04391v3","updated":"2023-09-04T19:54:59Z","published":"2021-04-09T14:30:35Z","title":"Flow-based Spatio-Temporal Structured Prediction of Motion Dynamics","summary":" Conditional Normalizing Flows (CNFs) are flexible generative models capable\nof representing complicated distributions with high dimensionality and large\ninterdimensional correlations, making them appealing for structured output\nlearning. Their effectiveness in modelling multivariate spatio-temporal\nstructured data has yet to be completely investigated. 
We propose MotionFlow as\na novel normalizing flows approach that autoregressively conditions the output\ndistributions on the spatio-temporal input features. It combines deterministic\nand stochastic representations with CNFs to create a probabilistic neural\ngenerative approach that can model the variability seen in high dimensional\nstructured spatio-temporal data. We specifically propose to use conditional\npriors to factorize the latent space for the time dependent modeling. We also\nexploit the use of masked convolutions as autoregressive conditionals in CNFs.\nAs a result, our method is able to define arbitrarily expressive output\nprobability distributions under temporal dynamics in multivariate prediction\ntasks. We apply our method to different tasks, including trajectory prediction,\nmotion prediction, time series forecasting, and binary segmentation, and\ndemonstrate that our model is able to leverage normalizing flows to learn\ncomplicated time dependent conditional distributions.\n","authors":["Mohsen Zand","Ali Etemad","Michael Greenspan"],"pdf_url":"https://arxiv.org/pdf/2104.04391v3.pdf","comment":"13 pages, LaTeX; typos corrected, updated, in IEEE Transactions on\n Pattern Analysis and Machine Intelligence"},{"id":"http://arxiv.org/abs/2309.01782v1","updated":"2023-09-04T19:48:17Z","published":"2023-09-04T19:48:17Z","title":"3D View Prediction Models of the Dorsal Visual Stream","summary":" Deep neural network representations align well with brain activity in the\nventral visual stream. However, the primate visual system has a distinct dorsal\nprocessing stream with different functional properties. To test if a model\ntrained to perceive 3D scene geometry aligns better with neural responses in\ndorsal visual areas, we trained a self-supervised geometry-aware recurrent\nneural network (GRNN) to predict novel camera views using a 3D feature memory.\nWe compared GRNN to self-supervised baseline models that have been shown to\nalign well with ventral regions using the large-scale fMRI Natural Scenes\nDataset (NSD). We found that while the baseline models accounted better for\nventral brain regions, GRNN accounted for a greater proportion of variance in\ndorsal brain regions. Our findings demonstrate the potential for using\ntask-relevant models to probe representational differences across visual\nstreams.\n","authors":["Gabriel Sarch","Hsiao-Yu Fish Tung","Aria Wang","Jacob Prince","Michael Tarr"],"pdf_url":"https://arxiv.org/pdf/2309.01782v1.pdf","comment":"2023 Conference on Cognitive Computational Neuroscience"},{"id":"http://arxiv.org/abs/2309.01770v1","updated":"2023-09-04T19:16:46Z","published":"2023-09-04T19:16:46Z","title":"StyleAdapter: A Single-Pass LoRA-Free Model for Stylized Image\n Generation","summary":" This paper presents a LoRA-free method for stylized image generation that\ntakes a text prompt and style reference images as inputs and produces an output\nimage in a single pass. Unlike existing methods that rely on training a\nseparate LoRA for each style, our method can adapt to various styles with a\nunified model. However, this poses two challenges: 1) the prompt loses\ncontrollability over the generated content, and 2) the output image inherits\nboth the semantic and style features of the style reference image, compromising\nits content fidelity. To address these challenges, we introduce StyleAdapter, a\nmodel that comprises two components: a two-path cross-attention module (TPCA)\nand three decoupling strategies. 
These components enable our model to process\nthe prompt and style reference features separately and reduce the strong\ncoupling between the semantic and style information in the style references.\nStyleAdapter can generate high-quality images that match the content of the\nprompts and adopt the style of the references (even for unseen styles) in a\nsingle pass, which is more flexible and efficient than previous methods.\nExperiments have been conducted to demonstrate the superiority of our method\nover previous works.\n","authors":["Zhouxia Wang","Xintao Wang","Liangbin Xie","Zhongang Qi","Ying Shan","Wenping Wang","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2309.01770v1.pdf","comment":"AIGC"},{"id":"http://arxiv.org/abs/2212.01381v2","updated":"2023-09-04T19:12:46Z","published":"2022-12-02T18:59:51Z","title":"LatentSwap3D: Semantic Edits on 3D Image GANs","summary":" 3D GANs have the ability to generate latent codes for entire 3D volumes\nrather than only 2D images. These models offer desirable features like\nhigh-quality geometry and multi-view consistency, but, unlike their 2D\ncounterparts, complex semantic image editing tasks for 3D GANs have only been\npartially explored. To address this problem, we propose LatentSwap3D, a\nsemantic edit approach based on latent space discovery that can be used with\nany off-the-shelf 3D or 2D GAN model and on any dataset. LatentSwap3D relies on\nidentifying the latent code dimensions corresponding to specific attributes by\nfeature ranking using a random forest classifier. It then performs the edit by\nswapping the selected dimensions of the image being edited with the ones from\nan automatically selected reference image. Compared to other latent space\ncontrol-based edit methods, which were mainly designed for 2D GANs, our method\non 3D GANs provides remarkably consistent semantic edits in a disentangled\nmanner and outperforms others both qualitatively and quantitatively. We show\nresults on seven 3D GANs (pi-GAN, GIRAFFE, StyleSDF, MVCGAN, EG3D, StyleNeRF,\nand VolumeGAN) and on five datasets (FFHQ, AFHQ, Cats, MetFaces, and CompCars).\n","authors":["Enis Simsar","Alessio Tonioni","Evin Pınar Örnek","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2212.01381v2.pdf","comment":"The paper has been accepted by ICCV'23 AI3DCC"},{"id":"http://arxiv.org/abs/2309.01765v1","updated":"2023-09-04T18:54:56Z","published":"2023-09-04T18:54:56Z","title":"BLiSS: Bootstrapped Linear Shape Space","summary":" Morphable models are fundamental to numerous human-centered processes as they\noffer a simple yet expressive shape space. Creating such morphable models,\nhowever, is both tedious and expensive. The main challenge is establishing\ndense correspondences across raw scans that capture sufficient shape variation.\nThis is often addressed using a mix of significant manual intervention and\nnon-rigid registration. We observe that creating a shape space and solving for\ndense correspondence are tightly coupled -- while dense correspondence is\nneeded to build shape spaces, an expressive shape space provides a reduced\ndimensional space to regularize the search. We introduce BLiSS, a method to\nsolve both progressively. Starting from a small set of manually registered\nscans to bootstrap the process, we enrich the shape space and then use that to\nget new unregistered scans into correspondence automatically. 
The critical\ncomponent of BLiSS is a non-linear deformation model that captures details\nmissed by the low-dimensional shape space, thus allowing progressive enrichment\nof the space.\n","authors":["Sanjeev Muralikrishnan","Chun-Hao Paul Huang","Duygu Ceylan","Niloy J. Mitra"],"pdf_url":"https://arxiv.org/pdf/2309.01765v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2301.07283v3","updated":"2023-09-04T18:38:51Z","published":"2023-01-18T03:14:14Z","title":"Contrastive Learning for Self-Supervised Pre-Training of Point Cloud\n Segmentation Networks With Image Data","summary":" Reducing the quantity of annotations required for supervised training is\nvital when labels are scarce and costly. This reduction is particularly\nimportant for semantic segmentation tasks involving 3D datasets, which are\noften significantly smaller and more challenging to annotate than their\nimage-based counterparts. Self-supervised pre-training on unlabelled data is\none way to reduce the amount of manual annotations needed. Previous work has\nfocused on pre-training with point clouds exclusively. While useful, this\napproach often requires two or more registered views. In the present work, we\ncombine image and point cloud modalities by first learning self-supervised\nimage features and then using these features to train a 3D model. By\nincorporating image data, which is often included in many 3D datasets, our\npre-training method only requires a single scan of a scene and can be applied\nto cases where localization information is unavailable. We demonstrate that our\npre-training approach, despite using single scans, achieves comparable\nperformance to other multi-scan, point cloud-only methods.\n","authors":["Andrej Janda","Brandon Wagstaff","Edwin G. Ng","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2301.07283v3.pdf","comment":"In Proceedings of the Conference on Robots and Vision (CRV'23),\n Montreal, Canada, Jun. 6-8, 2023. arXiv admin note: substantial text overlap\n with arXiv:2211.11801"},{"id":"http://arxiv.org/abs/2305.04334v2","updated":"2023-09-04T18:34:43Z","published":"2023-05-07T17:07:11Z","title":"Living in a Material World: Learning Material Properties from\n Full-Waveform Flash Lidar Data for Semantic Segmentation","summary":" Advances in lidar technology have made the collection of 3D point clouds fast\nand easy. While most lidar sensors return per-point intensity (or reflectance)\nvalues along with range measurements, flash lidar sensors are able to provide\ninformation about the shape of the return pulse. The shape of the return\nwaveform is affected by many factors, including the distance that the light\npulse travels and the angle of incidence with a surface. Importantly, the shape\nof the return waveform also depends on the material properties of the\nreflecting surface. In this paper, we investigate whether the material type or\nclass can be determined from the full-waveform response. First, as a proof of\nconcept, we demonstrate that the extra information about material class, if\nknown accurately, can improve performance on scene understanding tasks such as\nsemantic segmentation. Next, we learn two different full-waveform material\nclassifiers: a random forest classifier and a temporal convolutional neural\nnetwork (TCN) classifier. We find that, in some cases, material types can be\ndistinguished, and that the TCN generally performs better across a wider range\nof materials. 
However, factors such as angle of incidence, material colour, and\nmaterial similarity may hinder overall performance.\n","authors":["Andrej Janda","Pierre Merriaux","Pierre Olivier","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2305.04334v2.pdf","comment":"In Proceedings of the Conference on Robots and Vision (CRV'23),\n Montreal, Canada, Jun. 6-8, 2023"},{"id":"http://arxiv.org/abs/2305.08673v2","updated":"2023-09-04T18:32:25Z","published":"2023-05-15T14:28:34Z","title":"aUToLights: A Robust Multi-Camera Traffic Light Detection and Tracking\n System","summary":" Following four successful years in the SAE AutoDrive Challenge Series I, the\nUniversity of Toronto is participating in the Series II competition to develop\na Level 4 autonomous passenger vehicle capable of handling various urban\ndriving scenarios by 2025. Accurate detection of traffic lights and correct\nidentification of their states is essential for safe autonomous operation in\ncities. Herein, we describe our recently-redesigned traffic light perception\nsystem for autonomous vehicles like the University of Toronto's self-driving\ncar, Artemis. Similar to most traffic light perception systems, we rely\nprimarily on camera-based object detectors. We deploy the YOLOv5 detector for\nbounding box regression and traffic light classification across multiple\ncameras and fuse the observations. To improve robustness, we incorporate priors\nfrom high-definition semantic maps and perform state filtering using hidden\nMarkov models. We demonstrate a multi-camera, real time-capable traffic light\nperception pipeline that handles complex situations including multiple visible\nintersections, traffic light variations, temporary occlusion, and flashing\nlight states. To validate our system, we collected and annotated a varied\ndataset incorporating flashing states and a range of occlusion types. Our\nresults show superior performance in challenging real-world scenarios compared\nto single-frame, single-camera object detection.\n","authors":["Sean Wu","Nicole Amenta","Jiachen Zhou","Sandro Papais","Jonathan Kelly"],"pdf_url":"https://arxiv.org/pdf/2305.08673v2.pdf","comment":"In Proceedings of the Conference on Robots and Vision (CRV'23),\n Montreal, Canada, Jun. 6-8, 2023"},{"id":"http://arxiv.org/abs/2309.01751v1","updated":"2023-09-04T18:19:20Z","published":"2023-09-04T18:19:20Z","title":"Multispectral Indices for Wildfire Management","summary":" This paper highlights and summarizes the most important multispectral indices\nand associated methodologies for fire management. Various fields of study are\nexamined where multispectral indices align with wildfire prevention and\nmanagement, including vegetation and soil attribute extraction, water feature\nmapping, artificial structure identification, and post-fire burnt area\nestimation. The versatility and effectiveness of multispectral indices in\naddressing specific issues in wildfire management are emphasized. Fundamental\ninsights for optimizing data extraction are presented. Concrete indices for\neach task, including the NDVI and the NDWI, are suggested. Moreover, to enhance\naccuracy and address inherent limitations of individual index applications, the\nintegration of complementary processing solutions and additional data sources\nlike high-resolution imagery and ground-based measurements is recommended. 
This\npaper aims to be an immediate and comprehensive reference for researchers and\nstakeholders working on multispectral indices related to the prevention and\nmanagement of fires.\n","authors":["Afonso Oliveira","João P. Matos-Carvalho","Filipe Moutinho","Nuno Fachada"],"pdf_url":"https://arxiv.org/pdf/2309.01751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11910v4","updated":"2023-09-04T18:17:27Z","published":"2023-03-21T15:01:02Z","title":"360BEV: Panoramic Semantic Mapping for Indoor Bird's-Eye View","summary":" Seeing only a tiny part of the whole is not knowing the full circumstance.\nBird's-eye-view (BEV) perception, a process of obtaining allocentric maps from\negocentric views, is restricted when using a narrow Field of View (FoV) alone.\nIn this work, mapping from 360{\\deg} panoramas to BEV semantics, the 360BEV\ntask, is established for the first time to achieve holistic representations of\nindoor scenes in a top-down view. Instead of relying on narrow-FoV image\nsequences, a panoramic image with depth information is sufficient to generate a\nholistic BEV semantic map. To benchmark 360BEV, we present two indoor datasets,\n360BEV-Matterport and 360BEV-Stanford, both of which include egocentric\npanoramic images and semantic segmentation labels, as well as allocentric\nsemantic maps. Besides delving deep into different mapping paradigms, we\npropose a dedicated solution for panoramic semantic mapping, namely 360Mapper.\nThrough extensive experiments, our methods achieve 44.32% and 45.78% in mIoU on\nboth datasets respectively, surpassing previous counterparts with gains of\n+7.60% and +9.70% in mIoU. Code and datasets are available at the project page:\nhttps://jamycheung.github.io/360BEV.html.\n","authors":["Zhifeng Teng","Jiaming Zhang","Kailun Yang","Kunyu Peng","Hao Shi","Simon Reiß","Ke Cao","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2303.11910v4.pdf","comment":"Code and datasets are available at the project page:\n https://jamycheung.github.io/360BEV.html. Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2309.00616v2","updated":"2023-09-04T17:59:54Z","published":"2023-09-01T17:59:56Z","title":"OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation","summary":" Current 3D open-vocabulary scene understanding methods mostly utilize\nwell-aligned 2D images as the bridge to learn 3D features with language.\nHowever, applying these approaches becomes challenging in scenarios where 2D\nimages are absent. In this work, we introduce a completely new pipeline,\nnamely, OpenIns3D, which requires no 2D image inputs, for 3D open-vocabulary\nscene understanding at the instance level. The OpenIns3D framework employs a\n\"Mask-Snap-Lookup\" scheme. The \"Mask\" module learns class-agnostic mask\nproposals in 3D point clouds. The \"Snap\" module generates synthetic scene-level\nimages at multiple scales and leverages 2D vision language models to extract\ninteresting objects. The \"Lookup\" module searches through the outcomes of\n\"Snap\" with the help of Mask2Pixel maps, which contain the precise\ncorrespondence between 3D masks and synthetic images, to assign category names\nto the proposed masks. This 2D input-free, easy-to-train, and flexible approach\nachieved state-of-the-art results on a wide range of indoor and outdoor\ndatasets with a large margin. Furthermore, OpenIns3D allows for effortless\nswitching of 2D detectors without re-training. 
When integrated with\nstate-of-the-art 2D open-world models such as ODISE and GroundingDINO, superb\nresults are observed on open-vocabulary instance segmentation. When integrated\nwith LLM-powered 2D models like LISA, it demonstrates a remarkable capacity to\nprocess highly complex text queries, including those that require intricate\nreasoning and world knowledge. Project page:\nhttps://zheninghuang.github.io/OpenIns3D/\n","authors":["Zhening Huang","Xiaoyang Wu","Xi Chen","Hengshuang Zhao","Lei Zhu","Joan Lasenby"],"pdf_url":"https://arxiv.org/pdf/2309.00616v2.pdf","comment":"24 pages, 16 figures, 13 tables. Project page:\n https://zheninghuang.github.io/OpenIns3D/"},{"id":"http://arxiv.org/abs/2309.01740v1","updated":"2023-09-04T17:58:01Z","published":"2023-09-04T17:58:01Z","title":"An Empirical Analysis for Zero-Shot Multi-Label Classification on\n COVID-19 CT Scans and Uncurated Reports","summary":" The pandemic resulted in vast repositories of unstructured data, including\nradiology reports, due to increased medical examinations. Previous research on\nautomated diagnosis of COVID-19 primarily focuses on X-ray images, despite\ntheir lower precision compared to computed tomography (CT) scans. In this work,\nwe leverage unstructured data from a hospital and harness the fine-grained\ndetails offered by CT scans to perform zero-shot multi-label classification\nbased on contrastive visual language learning. In collaboration with human\nexperts, we investigate the effectiveness of multiple zero-shot models that aid\nradiologists in detecting pulmonary embolisms and identifying intricate lung\ndetails like ground glass opacities and consolidations. Our empirical analysis\nprovides an overview of the possible solutions to target such fine-grained\ntasks, so far overlooked in the medical multimodal pretraining literature. Our\ninvestigation promises future advancements in the medical image analysis\ncommunity by addressing some challenges associated with unstructured data and\nfine-grained multi-label classification.\n","authors":["Ethan Dack","Lorenzo Brigato","Matthew McMurray","Matthias Fontanellaz","Thomas Frauenfelder","Hanno Hoppe","Aristomenis Exadaktylos","Thomas Geiser","Manuela Funke-Chambour","Andreas Christe","Lukas Ebner","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01740v1.pdf","comment":"10 pages, 3 figures, Proceedings of the IEEE/CVF International\n Conference on Computer Vision (ICCV) Workshops 2023"},{"id":"http://arxiv.org/abs/2309.01729v1","updated":"2023-09-04T17:29:31Z","published":"2023-09-04T17:29:31Z","title":"Softmax Bias Correction for Quantized Generative Models","summary":" Post-training quantization (PTQ) is the go-to compression technique for large\ngenerative models, such as stable diffusion or large language models. PTQ\nmethods commonly keep the softmax activation in higher precision as it has been\nshown to be very sensitive to quantization noise. However, this can lead to a\nsignificant runtime and power overhead during inference on resource-constrained\nedge devices. In this work, we investigate the source of the softmax\nsensitivity to quantization and show that the quantization operation leads to a\nlarge bias in the softmax output, causing accuracy degradation. To overcome\nthis issue, we propose an offline bias correction technique that improves the\nquantizability of softmax without additional compute during deployment, as it\ncan be readily absorbed into the quantization parameters. 
We demonstrate the\neffectiveness of our method on stable diffusion v1.5 and 125M-size OPT language\nmodel, achieving significant accuracy improvement for 8-bit quantized softmax.\n","authors":["Nilesh Prasad Pandey","Marios Fournarakis","Chirag Patel","Markus Nagel"],"pdf_url":"https://arxiv.org/pdf/2309.01729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01728v1","updated":"2023-09-04T17:22:10Z","published":"2023-09-04T17:22:10Z","title":"Generative-based Fusion Mechanism for Multi-Modal Tracking","summary":" Generative models (GMs) have received increasing research interest for their\nremarkable capacity to achieve comprehensive understanding. However, their\npotential application in the domain of multi-modal tracking has remained\nrelatively unexplored. In this context, we seek to uncover the potential of\nharnessing generative techniques to address the critical challenge, information\nfusion, in multi-modal tracking. In this paper, we delve into two prominent GM\ntechniques, namely, Conditional Generative Adversarial Networks (CGANs) and\nDiffusion Models (DMs). Different from the standard fusion process where the\nfeatures from each modality are directly fed into the fusion block, we\ncondition these multi-modal features with random noise in the GM framework,\neffectively transforming the original training samples into harder instances.\nThis design excels at extracting discriminative clues from the features,\nenhancing the ultimate tracking performance. To quantitatively gauge the\neffectiveness of our approach, we conduct extensive experiments across two\nmulti-modal tracking tasks, three baseline methods, and three challenging\nbenchmarks. The experimental results demonstrate that the proposed\ngenerative-based fusion mechanism achieves state-of-the-art performance,\nsetting new records on LasHeR and RGBD1K.\n","authors":["Zhangyong Tang","Tianyang Xu","Xuefeng Zhu","Xiao-Jun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2309.01728v1.pdf","comment":"10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.01723v1","updated":"2023-09-04T17:13:06Z","published":"2023-09-04T17:13:06Z","title":"SAF-IS: a Spatial Annotation Free Framework for Instance Segmentation of\n Surgical Tools","summary":" Instance segmentation of surgical instruments is a long-standing research\nproblem, crucial for the development of many applications for computer-assisted\nsurgery. This problem is commonly tackled via fully-supervised training of deep\nlearning models, requiring expensive pixel-level annotations to train. In this\nwork, we develop a framework for instance segmentation not relying on spatial\nannotations for training. Instead, our solution only requires binary tool\nmasks, obtainable using recent unsupervised approaches, and binary tool\npresence labels, freely obtainable in robot-assisted surgery. Based on the\nbinary mask information, our solution learns to extract individual tool\ninstances from single frames, and to encode each instance into a compact vector\nrepresentation, capturing its semantic features. Such representations guide the\nautomatic selection of a tiny number of instances (8 only in our experiments),\ndisplayed to a human operator for tool-type labelling. The gathered information\nis finally used to match each training instance with a binary tool presence\nlabel, providing an effective supervision signal to train a tool instance\nclassifier. We validate our framework on the EndoVis 2017 and 2018 segmentation\ndatasets. 
We provide results using binary masks obtained either by manual\nannotation or as predictions of an unsupervised binary segmentation model. The\nlatter solution yields an instance segmentation approach completely free from\nspatial annotations, outperforming several state-of-the-art fully-supervised\nsegmentation approaches.\n","authors":["Luca Sestini","Benoit Rosa","Elena De Momi","Giancarlo Ferrigno","Nicolas Padoy"],"pdf_url":"https://arxiv.org/pdf/2309.01723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00014v2","updated":"2023-09-04T16:58:32Z","published":"2023-08-24T16:30:54Z","title":"Improving NeRF Quality by Progressive Camera Placement for Unrestricted\n Navigation in Complex Environments","summary":" Neural Radiance Fields, or NeRFs, have drastically improved novel view\nsynthesis and 3D reconstruction for rendering. NeRFs achieve impressive results\non object-centric reconstructions, but the quality of novel view synthesis with\nfree-viewpoint navigation in complex environments (rooms, houses, etc) is often\nproblematic. While algorithmic improvements play an important role in the\nresulting quality of novel view synthesis, in this work, we show that because\noptimizing a NeRF is inherently a data-driven process, good quality data play a\nfundamental role in the final quality of the reconstruction. As a consequence,\nit is critical to choose the data samples -- in this case the cameras -- in a\nway that will eventually allow the optimization to converge to a solution that\nallows free-viewpoint navigation with good quality. Our main contribution is an\nalgorithm that efficiently proposes new camera placements that improve visual\nquality with minimal assumptions. Our solution can be used with any NeRF model\nand outperforms baselines and similar work.\n","authors":["Georgios Kopanas","George Drettakis"],"pdf_url":"https://arxiv.org/pdf/2309.00014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10311v3","updated":"2023-09-04T16:36:47Z","published":"2023-06-17T10:10:15Z","title":"Efficient HDR Reconstruction From Real-World Raw Images","summary":" High dynamic range (HDR) imaging is a significant yet challenging problem due\nto the limited dynamic range of generic image sensors. Most existing\nlearning-based HDR reconstruction methods take a set of bracketed exposure sRGB\nimages to extend the dynamic range. However, they overlook the computational\nand memory inefficiencies of Image Signal Processors (ISPs) when processing a\nset of sRGB images with different exposures. Furthermore, the absence of\nlarge-scale raw-based HDR datasets limits the research on HDR imaging. In this\nwork, in a new aspect, we discover an excellent opportunity for HDR\nreconstructing directly from raw images and investigating novel neural network\nstructures that benefit the deployment of mobile devices. Meanwhile, we\nconstruct a new HDR dataset containing raw images and process to obtain sRGB\nimages and design a new model to reconstruct HDR utilizing the unique\ncharacteristics of long- and short-exposure images. Our key insights are\nthreefold: (1) a new computational raw LDR-HDR pair formation pipeline is\ndesigned to construct a real-world raw HDR dataset called RealRaw-HDR; (2) a\nlightweight-efficient HDR model, RepUNet, is developed using the structural\nreparameterization technique; (3) a plug-and-play alignment-free and\nmotion-aware short-exposure-first selection loss and a colorfulness loss are\nproposed to mitigate ghost artifacts and color cast. 
Extensive experiment\nresults demonstrate that our approach achieves state-of-the-art performance in\nboth visual quality and quantitative metrics.\n","authors":["Qirui Yang","Yihao Liu","Qihua Chen","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2306.10311v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01700v1","updated":"2023-09-04T16:18:49Z","published":"2023-09-04T16:18:49Z","title":"ControlMat: A Controlled Generative Approach to Material Capture","summary":" Material reconstruction from a photograph is a key component of 3D content\ncreation democratization. We propose to formulate this ill-posed problem as a\ncontrolled synthesis one, leveraging the recent progress in generative deep\nnetworks. We present ControlMat, a method which, given a single photograph with\nuncontrolled illumination as input, conditions a diffusion model to generate\nplausible, tileable, high-resolution physically-based digital materials. We\ncarefully analyze the behavior of diffusion models for multi-channel outputs,\nadapt the sampling process to fuse multi-scale information and introduce rolled\ndiffusion to enable both tileability and patched diffusion for high-resolution\noutputs. Our generative approach further permits exploration of a variety of\nmaterials which could correspond to the input image, mitigating the unknown\nlighting conditions. We show that our approach outperforms recent inference and\nlatent-space-optimization methods, and carefully validate our diffusion process\ndesign choices. Supplemental materials and additional details are available at:\nhttps://gvecchio.com/controlmat/.\n","authors":["Giuseppe Vecchio","Rosalie Martin","Arthur Roullier","Adrien Kaiser","Romain Rouffet","Valentin Deschaintre","Tamy Boubekeur"],"pdf_url":"https://arxiv.org/pdf/2309.01700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01694v1","updated":"2023-09-04T16:13:59Z","published":"2023-09-04T16:13:59Z","title":"No Data Augmentation? Alternative Regularizations for Effective Training\n on Small Datasets","summary":" Solving image classification tasks given small training datasets remains an\nopen challenge for modern computer vision. Aggressive data augmentation and\ngenerative models are among the most straightforward approaches to overcoming\nthe lack of data. However, the first fails to be agnostic to varying image\ndomains, while the latter requires additional compute and careful design. In\nthis work, we study alternative regularization strategies to push the limits of\nsupervised learning on small image classification datasets. In particular,\nalong with the model size and training schedule scaling, we employ a heuristic\nto select (semi) optimal learning rate and weight decay couples via the norm of\nmodel parameters. By training on only 1% of the original CIFAR-10 training set\n(i.e., 50 images per class) and testing on ciFAIR-10, a variant of the original\nCIFAR without duplicated images, we reach a test accuracy of 66.5%, on par with\nthe best state-of-the-art methods.\n","authors":["Lorenzo Brigato","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01694v1.pdf","comment":"4th Visual Inductive Priors for Data-Efficient Deep Learning\n Workshop, ICCVW 2023"},{"id":"http://arxiv.org/abs/2309.01692v1","updated":"2023-09-04T16:09:28Z","published":"2023-09-04T16:09:28Z","title":"Mask-Attention-Free Transformer for 3D Instance Segmentation","summary":" Recently, transformer-based methods have dominated 3D instance segmentation,\nwhere mask attention is commonly involved. 
Specifically, object queries are\nguided by the initial instance masks in the first cross-attention, and then\niteratively refine themselves in a similar manner. However, we observe that the\nmask-attention pipeline usually leads to slow convergence due to low-recall\ninitial instance masks. Therefore, we abandon the mask attention design and\nresort to an auxiliary center regression task instead. Through center\nregression, we effectively overcome the low-recall issue and perform\ncross-attention by imposing positional prior. To reach this goal, we develop a\nseries of position-aware designs. First, we learn a spatial distribution of 3D\nlocations as the initial position queries. They spread over the 3D space\ndensely, and thus can easily capture the objects in a scene with a high recall.\nMoreover, we present relative position encoding for the cross-attention and\niterative refinement for more accurate position queries. Experiments show that\nour approach converges 4x faster than existing work, sets a new state of the\nart on ScanNetv2 3D instance segmentation benchmark, and also demonstrates\nsuperior performance across various datasets. Code and models are available at\nhttps://github.com/dvlab-research/Mask-Attention-Free-Transformer.\n","authors":["Xin Lai","Yuhui Yuan","Ruihang Chu","Yukang Chen","Han Hu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2309.01692v1.pdf","comment":"Accepted to ICCV 2023. Code and models are available at\n https://github.com/dvlab-research/Mask-Attention-Free-Transformer"},{"id":"http://arxiv.org/abs/2309.01682v1","updated":"2023-09-04T15:57:07Z","published":"2023-09-04T15:57:07Z","title":"Prior Knowledge Guided Network for Video Anomaly Detection","summary":" Video Anomaly Detection (VAD) involves detecting anomalous events in videos,\npresenting a significant and intricate task within intelligent video\nsurveillance. Existing studies often concentrate solely on features acquired\nfrom limited normal data, disregarding the latent prior knowledge present in\nextensive natural image datasets. To address this constraint, we propose a\nPrior Knowledge Guided Network(PKG-Net) for the VAD task. First, an\nauto-encoder network is incorporated into a teacher-student architecture to\nlearn two designated proxy tasks: future frame prediction and teacher network\nimitation, which can provide better generalization ability on unknown samples.\nSecond, knowledge distillation on proper feature blocks is also proposed to\nincrease the multi-scale detection ability of the model. In addition,\nprediction error and teacher-student feature inconsistency are combined to\nevaluate anomaly scores of inference samples more comprehensively. Experimental\nresults on three public benchmarks validate the effectiveness and accuracy of\nour method, which surpasses recent state-of-the-arts.\n","authors":["Zhewen Deng","Dongyue Chen","Shizhuo Deng"],"pdf_url":"https://arxiv.org/pdf/2309.01682v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.01674v1","updated":"2023-09-04T15:37:03Z","published":"2023-09-04T15:37:03Z","title":"Prompt me a Dataset: An investigation of text-image prompting for\n historical image dataset creation using foundation models","summary":" In this paper, we present a pipeline for image extraction from historical\ndocuments using foundation models, and evaluate text-image prompts and their\neffectiveness on humanities datasets of varying levels of complexity. 
The\nmotivation for this approach stems from the high interest of historians in\nvisual elements printed alongside historical texts on the one hand, and from\nthe relative lack of well-annotated datasets within the humanities when\ncompared to other domains. We propose a sequential approach that relies on\nGroundDINO and Meta's Segment-Anything-Model (SAM) to retrieve a significant\nportion of visual data from historical documents that can then be used for\ndownstream development tasks and dataset creation, as well as evaluate the\neffect of different linguistic prompts on the resulting detections.\n","authors":["Hassan El-Hajj","Matteo Valleriani"],"pdf_url":"https://arxiv.org/pdf/2309.01674v1.pdf","comment":"12 pages, 3 figures, Accepted in ICIAP2023, AI4DH workshop"},{"id":"http://arxiv.org/abs/2306.14538v4","updated":"2023-09-04T15:35:19Z","published":"2023-06-26T09:21:13Z","title":"Learnable Differencing Center for Nighttime Depth Perception","summary":" Depth completion is the task of recovering dense depth maps from sparse ones,\nusually with the help of color images. Existing image-guided methods perform\nwell on daytime depth perception self-driving benchmarks, but struggle in\nnighttime scenarios with poor visibility and complex illumination. To address\nthese challenges, we propose a simple yet effective framework called LDCNet.\nOur key idea is to use Recurrent Inter-Convolution Differencing (RICD) and\nIllumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the\nnighttime color images and reduce the negative effects of the varying\nillumination, respectively. RICD explicitly estimates global illumination by\ndifferencing two convolutions with different kernels, treating the\nsmall-kernel-convolution feature as the center of the large-kernel-convolution\nfeature in a new perspective. IAICD softly alleviates local relative light\nintensity by differencing a single convolution, where the center is dynamically\naggregated based on neighboring pixels and the estimated illumination map in\nRICD. On both nighttime depth completion and depth estimation tasks, extensive\nexperiments demonstrate the effectiveness of our LDCNet, reaching the state of\nthe art.\n","authors":["Zhiqiang Yan","Yupeng Zheng","Chongyi Li","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2306.14538v4.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2309.01656v1","updated":"2023-09-04T15:15:34Z","published":"2023-09-04T15:15:34Z","title":"Building Footprint Extraction in Dense Areas using Super Resolution and\n Frame Field Learning","summary":" Despite notable results on standard aerial datasets, current\nstate-of-the-arts fail to produce accurate building footprints in dense areas\ndue to challenging properties posed by these areas and limited data\navailability. In this paper, we propose a framework to address such issues in\npolygonal building extraction. First, super resolution is employed to enhance\nthe spatial resolution of aerial image, allowing for finer details to be\ncaptured. This enhanced imagery serves as input to a multitask learning module,\nwhich consists of a segmentation head and a frame field learning head to\neffectively handle the irregular building structures. 
Our model is supervised\nby adaptive loss weighting, enabling extraction of sharp edges and fine-grained\npolygons which is difficult due to overlapping buildings and low data quality.\nExtensive experiments on a slum area in India that mimics a dense area\ndemonstrate that our proposed approach significantly outperforms the current\nstate-of-the-art methods by a large margin.\n","authors":["Vuong Nguyen","Anh Ho","Duc-Anh Vu","Nguyen Thi Ngoc Anh","Tran Ngoc Thang"],"pdf_url":"https://arxiv.org/pdf/2309.01656v1.pdf","comment":"Accepted at The 12th International Conference on Awareness Science\n and Technology"},{"id":"http://arxiv.org/abs/2308.16890v2","updated":"2023-09-04T15:06:15Z","published":"2023-08-31T17:52:04Z","title":"TouchStone: Evaluating Vision-Language Models by Language Models","summary":" Large vision-language models (LVLMs) have recently witnessed rapid\nadvancements, exhibiting a remarkable capacity for perceiving, understanding,\nand processing visual information by connecting visual receptor with large\nlanguage models (LLMs). However, current assessments mainly focus on\nrecognizing and reasoning abilities, lacking direct evaluation of\nconversational skills and neglecting visual storytelling abilities. In this\npaper, we propose an evaluation method that uses strong LLMs as judges to\ncomprehensively evaluate the various abilities of LVLMs. Firstly, we construct\na comprehensive visual dialogue dataset TouchStone, consisting of open-world\nimages and questions, covering five major categories of abilities and 27\nsubtasks. This dataset not only covers fundamental recognition and\ncomprehension but also extends to literary creation. Secondly, by integrating\ndetailed image annotations we effectively transform the multimodal input\ncontent into a form understandable by LLMs. This enables us to employ advanced\nLLMs for directly evaluating the quality of the multimodal dialogue without\nrequiring human intervention. Through validation, we demonstrate that powerful\nLVLMs, such as GPT-4, can effectively score dialogue quality by leveraging\ntheir textual capabilities alone, aligning with human preferences. We hope our\nwork can serve as a touchstone for LVLMs' evaluation and pave the way for\nbuilding stronger LVLMs. The evaluation code is available at\nhttps://github.com/OFA-Sys/TouchStone.\n","authors":["Shuai Bai","Shusheng Yang","Jinze Bai","Peng Wang","Xingxuan Zhang","Junyang Lin","Xinggang Wang","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.16890v2.pdf","comment":"https://github.com/OFA-Sys/TouchStone"},{"id":"http://arxiv.org/abs/2307.02291v2","updated":"2023-09-04T15:03:11Z","published":"2023-07-05T13:42:31Z","title":"Focusing on what to decode and what to train: Efficient Training with\n HOI Split Decoders and Specific Target Guided DeNoising","summary":" Recent one-stage transformer-based methods achieve notable gains in the\nHuman-object Interaction Detection (HOI) task by leveraging the detection of\nDETR. However, the current methods redirect the detection target of the object\ndecoder, and the box target is not explicitly separated from the query\nembeddings, which leads to long and hard training. Furthermore, matching the\npredicted HOI instances with the ground-truth is more challenging than object\ndetection, simply adapting training strategies from the object detection makes\nthe training more difficult. 
To clear the ambiguity between human and object\ndetection and share the prediction burden, we propose a novel one-stage\nframework (SOV), which consists of a subject decoder, an object decoder, and a\nverb decoder. Moreover, we propose a novel Specific Target Guided (STG)\nDeNoising training strategy, which leverages learnable object and verb label\nembeddings to guide the training and accelerate the training convergence. In\naddition, for the inference part, the label-specific information is directly\nfed into the decoders by initializing the query embeddings from the learnable\nlabel embeddings. Without additional features or prior language knowledge, our\nmethod (SOV-STG) achieves higher accuracy than the state-of-the-art method in\none-third of training epochs. The code is available at\nhttps://github.com/cjw2021/SOV-STG.\n","authors":["Junwen Chen","Yingcheng Wang","Keiji Yanai"],"pdf_url":"https://arxiv.org/pdf/2307.02291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01646v1","updated":"2023-09-04T14:54:47Z","published":"2023-09-04T14:54:47Z","title":"ReLoc-PDR: Visual Relocalization Enhanced Pedestrian Dead Reckoning via\n Graph Optimization","summary":" Accurately and reliably positioning pedestrians in satellite-denied\nconditions remains a significant challenge. Pedestrian dead reckoning (PDR) is\ncommonly employed to estimate pedestrian location using low-cost inertial\nsensors. However, PDR is susceptible to drift due to sensor noise, incorrect\nstep detection, and inaccurate stride length estimation. This work proposes\nReLoc-PDR, a fusion framework combining PDR and visual relocalization using\ngraph optimization. ReLoc-PDR leverages time-correlated visual observations and\nlearned descriptors to achieve robust positioning in visually-degraded\nenvironments. A graph optimization-based fusion mechanism with the Tukey kernel\neffectively corrects cumulative errors and mitigates the impact of abnormal\nvisual observations. Real-world experiments demonstrate that our ReLoc-PDR\nsurpasses representative methods in accuracy and robustness, achieving accurate\nand robust pedestrian positioning results using only a smartphone in\nchallenging environments such as less-textured corridors and dark nighttime\nscenarios.\n","authors":["Zongyang Chen","Xianfei Pan","Changhao Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01646v1.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2306.16614v2","updated":"2023-09-04T14:54:17Z","published":"2023-06-29T01:07:12Z","title":"Group-based Robustness: A General Framework for Customized Robustness in\n the Real World","summary":" Machine-learning models are known to be vulnerable to evasion attacks that\nperturb model inputs to induce misclassifications. In this work, we identify\nreal-world scenarios where the true threat cannot be assessed accurately by\nexisting attacks. Specifically, we find that conventional metrics measuring\ntargeted and untargeted robustness do not appropriately reflect a model's\nability to withstand attacks from one set of source classes to another set of\ntarget classes. To address the shortcomings of existing methods, we formally\ndefine a new metric, termed group-based robustness, that complements existing\nmetrics and is better-suited for evaluating model performance in certain attack\nscenarios. 
We show empirically that group-based robustness allows us to\ndistinguish between models' vulnerability against specific threat models in\nsituations where traditional robustness metrics do not apply. Moreover, to\nmeasure group-based robustness efficiently and accurately, we 1) propose two\nloss functions and 2) identify three new attack strategies. We show empirically\nthat with comparable success rates, finding evasive samples using our new loss\nfunctions saves computation by a factor as large as the number of targeted\nclasses, and finding evasive samples using our new attack strategies saves time\nby up to 99\\% compared to brute-force search methods. Finally, we propose a\ndefense method that increases group-based robustness by up to 3.52$\\times$.\n","authors":["Weiran Lin","Keane Lucas","Neo Eyal","Lujo Bauer","Michael K. Reiter","Mahmood Sharif"],"pdf_url":"https://arxiv.org/pdf/2306.16614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01627v1","updated":"2023-09-04T14:18:00Z","published":"2023-09-04T14:18:00Z","title":"Cross-Consistent Deep Unfolding Network for Adaptive All-In-One Video\n Restoration","summary":" Existing Video Restoration (VR) methods always necessitate the individual\ndeployment of models for each adverse weather to remove diverse adverse weather\ndegradations, lacking the capability for adaptive processing of degradations.\nSuch limitation amplifies the complexity and deployment costs in practical\napplications. To overcome this deficiency, in this paper, we propose a\nCross-consistent Deep Unfolding Network (CDUN) for All-In-One VR, which enables\nthe employment of a single model to remove diverse degradations for the first\ntime. Specifically, the proposed CDUN accomplishes a novel iterative\noptimization framework, capable of restoring frames corrupted by corresponding\ndegradations according to the degradation features given in advance. To empower\nthe framework for eliminating diverse degradations, we devise a Sequence-wise\nAdaptive Degradation Estimator (SADE) to estimate degradation features for the\ninput corrupted video. By orchestrating these two cascading procedures, CDUN\nachieves adaptive processing for diverse degradation. In addition, we introduce\na window-based inter-frame fusion strategy to utilize information from more\nadjacent frames. This strategy involves the progressive stacking of temporal\nwindows in multiple iterations, effectively enlarging the temporal receptive\nfield and enabling each frame's restoration to leverage information from\ndistant frames. Extensive experiments demonstrate that the proposed method\nachieves state-of-the-art performance in All-In-One VR.\n","authors":["Yuanshuo Cheng","Mingwen Shao","Yecong Wan","Lixu Zhang","Wangmeng Zuo","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2309.01627v1.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2309.01624v1","updated":"2023-09-04T14:16:08Z","published":"2023-09-04T14:16:08Z","title":"AGG-Net: Attention Guided Gated-convolutional Network for Depth Image\n Completion","summary":" Recently, stereo vision based on lightweight RGBD cameras has been widely\nused in various fields. 
However, limited by the imaging principles, the\ncommonly used RGB-D cameras based on TOF, structured light, or binocular vision\nacquire some invalid data inevitably, such as weak reflection, boundary\nshadows, and artifacts, which may bring adverse impacts to the follow-up work.\nIn this paper, we propose a new model for depth image completion based on the\nAttention Guided Gated-convolutional Network (AGG-Net), through which more\naccurate and reliable depth images can be obtained from the raw depth maps and\nthe corresponding RGB images. Our model employs a UNet-like architecture which\nconsists of two parallel branches of depth and color features. In the encoding\nstage, an Attention Guided Gated-Convolution (AG-GConv) module is proposed to\nrealize the fusion of depth and color features at different scales, which can\neffectively reduce the negative impacts of invalid depth data on the\nreconstruction. In the decoding stage, an Attention Guided Skip Connection\n(AG-SC) module is presented to avoid introducing too many depth-irrelevant\nfeatures to the reconstruction. The experimental results demonstrate that our\nmethod outperforms the state-of-the-art methods on the popular benchmarks\nNYU-Depth V2, DIML, and SUN RGB-D.\n","authors":["Dongyue Chen","Tingxuan Huang","Zhimin Song","Shizhuo Deng","Tong Jia"],"pdf_url":"https://arxiv.org/pdf/2309.01624v1.pdf","comment":"9 pages, 7 figures, ICCV2023"},{"id":"http://arxiv.org/abs/2206.02194v2","updated":"2023-09-04T14:14:21Z","published":"2022-06-05T14:45:02Z","title":"FOF: Learning Fourier Occupancy Field for Monocular Real-time Human\n Reconstruction","summary":" The advent of deep learning has led to significant progress in monocular\nhuman reconstruction. However, existing representations, such as parametric\nmodels, voxel grids, meshes and implicit neural representations, have\ndifficulties achieving high-quality results and real-time speed at the same\ntime. In this paper, we propose Fourier Occupancy Field (FOF), a novel\npowerful, efficient and flexible 3D representation, for monocular real-time and\naccurate human reconstruction. The FOF represents a 3D object with a 2D field\northogonal to the view direction where at each 2D position the occupancy field\nof the object along the view direction is compactly represented with the first\nfew terms of Fourier series, which retains the topology and neighborhood\nrelation in the 2D domain. A FOF can be stored as a multi-channel image, which\nis compatible with 2D convolutional neural networks and can bridge the gap\nbetween 3D geometries and 2D images. The FOF is very flexible and extensible,\ne.g., parametric models can be easily integrated into a FOF as a prior to\ngenerate more robust results. Based on FOF, we design the first 30+FPS\nhigh-fidelity real-time monocular human reconstruction framework. 
We\ndemonstrate the potential of FOF on both public dataset and real captured data.\nThe code will be released for research purposes.\n","authors":["Qiao Feng","Yebin Liu","Yu-Kun Lai","Jingyu Yang","Kun Li"],"pdf_url":"https://arxiv.org/pdf/2206.02194v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.01808v1","updated":"2023-09-04T20:52:33Z","published":"2023-09-04T20:52:33Z","title":"DiscoverPath: A Knowledge Refinement and Retrieval System for\n Interdisciplinarity on Biomedical Research","summary":" The exponential growth in scholarly publications necessitates advanced tools\nfor efficient article retrieval, especially in interdisciplinary fields where\ndiverse terminologies are used to describe similar research. Traditional\nkeyword-based search engines often fall short in assisting users who may not be\nfamiliar with specific terminologies. To address this, we present a knowledge\ngraph-based paper search engine for biomedical research to enhance the user\nexperience in discovering relevant queries and articles. The system, dubbed\nDiscoverPath, employs Named Entity Recognition (NER) and part-of-speech (POS)\ntagging to extract terminologies and relationships from article abstracts to\ncreate a KG. To reduce information overload, DiscoverPath presents users with a\nfocused subgraph containing the queried entity and its neighboring nodes and\nincorporates a query recommendation system, enabling users to iteratively\nrefine their queries. The system is equipped with an accessible Graphical User\nInterface that provides an intuitive visualization of the KG, query\nrecommendations, and detailed article information, enabling efficient article\nretrieval, thus fostering interdisciplinary knowledge exploration. DiscoverPath\nis open-sourced at https://github.com/ynchuang/DiscoverPath.\n","authors":["Yu-Neng Chuang","Guanchu Wang","Chia-Yuan Chang","Kwei-Herng Lai","Daochen Zha","Ruixiang Tang","Fan Yang","Alfredo Costilla Reyes","Kaixiong Zhou","Xiaoqian Jiang","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2309.01808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01684v1","updated":"2023-09-04T15:58:43Z","published":"2023-09-04T15:58:43Z","title":"CRUISE-Screening: Living Literature Reviews Toolbox","summary":" Keeping up with research and finding related work is still a time-consuming\ntask for academics. Researchers sift through thousands of studies to identify a\nfew relevant ones. Automation techniques can help by increasing the efficiency\nand effectiveness of this task. To this end, we developed CRUISE-Screening, a\nweb-based application for conducting living literature reviews - a type of\nliterature review that is continuously updated to reflect the latest research\nin a particular field. CRUISE-Screening is connected to several search engines\nvia an API, which allows for updating the search results periodically.\nMoreover, it can facilitate the process of screening for relevant publications\nby using text classification and question answering models. CRUISE-Screening\ncan be used both by researchers conducting literature reviews and by those\nworking on automating the citation screening process to validate their\nalgorithms. The application is open-source:\nhttps://github.com/ProjectDoSSIER/cruise-screening, and a demo is available\nunder this URL: https://citation-screening.ec.tuwien.ac.at. 
We discuss the\nlimitations of our tool in Appendix A.\n","authors":["Wojciech Kusa","Petr Knoth","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2309.01684v1.pdf","comment":"Paper accepted at CIKM 2023. The arXiv version has an extra section\n about limitations in the Appendix that is not present in the ACM version"},{"id":"http://arxiv.org/abs/2309.01610v1","updated":"2023-09-04T13:49:48Z","published":"2023-09-04T13:49:48Z","title":"Fair Ranking under Disparate Uncertainty","summary":" Ranking is a ubiquitous method for focusing the attention of human evaluators\non a manageable subset of options. Its use ranges from surfacing potentially\nrelevant products on an e-commerce site to prioritizing college applications\nfor human review. While ranking can make human evaluation far more effective by\nfocusing attention on the most promising options, we argue that it can\nintroduce unfairness if the uncertainty of the underlying relevance model\ndiffers between groups of options. Unfortunately, such disparity in uncertainty\nappears widespread, since the relevance estimates for minority groups tend to\nhave higher uncertainty due to a lack of data or appropriate features. To\novercome this fairness issue, we propose Equal-Opportunity Ranking (EOR) as a\nnew fairness criterion for ranking that provably corrects for the disparity in\nuncertainty between groups. Furthermore, we present a practical algorithm for\ncomputing EOR rankings in time $O(n \\log(n))$ and prove its close approximation\nguarantee to the globally optimal solution. In a comprehensive empirical\nevaluation on synthetic data, a US Census dataset, and a real-world case study\nof Amazon search queries, we find that the algorithm reliably guarantees EOR\nfairness while providing effective rankings.\n","authors":["Richa Rastogi","Thorsten Joachims"],"pdf_url":"https://arxiv.org/pdf/2309.01610v1.pdf","comment":"A version of this paper was accepted as Spotlight (Oral) at UAI\n workshop on Epistemic in AI, 2023"},{"id":"http://arxiv.org/abs/2309.01552v1","updated":"2023-09-04T12:07:20Z","published":"2023-09-04T12:07:20Z","title":"OutRank: Speeding up AutoML-based Model Search for Large Sparse Data\n sets with Cardinality-aware Feature Ranking","summary":" The design of modern recommender systems relies on understanding which parts\nof the feature space are relevant for solving a given recommendation task.\nHowever, real-world data sets in this domain are often characterized by their\nlarge size, sparsity, and noise, making it challenging to identify meaningful\nsignals. Feature ranking represents an efficient branch of algorithms that can\nhelp address these challenges by identifying the most informative features and\nfacilitating the automated search for more compact and better-performing models\n(AutoML). We introduce OutRank, a system for versatile feature ranking and data\nquality-related anomaly detection. OutRank was built with categorical data in\nmind, utilizing a variant of mutual information that is normalized with regard\nto the noise produced by features of the same cardinality. We further extend\nthe similarity measure by incorporating information on feature similarity and\ncombined relevance. The proposed approach's feasibility is demonstrated by\nspeeding up the state-of-the-art AutoML system on a synthetic data set with no\nperformance loss. Furthermore, we considered a real-life click-through-rate\nprediction data set where it outperformed strong baselines such as random\nforest-based approaches. 
The proposed approach enables exploration of up to\n300% larger feature spaces compared to AutoML-only approaches, enabling faster\nsearch for better models on off-the-shelf hardware.\n","authors":["Blaž Škrlj","Blaž Mramor"],"pdf_url":"https://arxiv.org/pdf/2309.01552v1.pdf","comment":"accepted to RecSys2023"},{"id":"http://arxiv.org/abs/2305.10531v2","updated":"2023-09-04T11:33:56Z","published":"2023-05-17T19:31:36Z","title":"Iteratively Learning Representations for Unseen Entities with Inter-Rule\n Correlations","summary":" Recent work on knowledge graph completion (KGC) focused on learning\nembeddings of entities and relations in knowledge graphs. These embedding\nmethods require that all test entities are observed at training time, resulting\nin a time-consuming retraining process for out-of-knowledge-graph (OOKG)\nentities. To address this issue, current inductive knowledge embedding methods\nemploy graph neural networks (GNNs) to represent unseen entities by aggregating\ninformation of known neighbors. They face three important challenges: (i) data\nsparsity, (ii) the presence of complex patterns in knowledge graphs (e.g.,\ninter-rule correlations), and (iii) the presence of interactions among rule\nmining, rule inference, and embedding. In this paper, we propose a virtual\nneighbor network with inter-rule correlations (VNC) that consists of three\nstages: (i) rule mining, (ii) rule inference, and (iii) embedding. In the rule\nmining process, to identify complex patterns in knowledge graphs, both logic\nrules and inter-rule correlations are extracted from knowledge graphs based on\noperations over relation embeddings. To reduce data sparsity, virtual neighbors\nfor OOKG entities are predicted and assigned soft labels by optimizing a\nrule-constrained problem. We also devise an iterative framework to capture the\nunderlying relations between rule learning and embedding learning. In our\nexperiments, results on both link prediction and triple classification tasks\nshow that the proposed VNC framework achieves state-of-the-art performance on\nfour widely-used knowledge graphs. Further analysis reveals that VNC is robust\nto the proportion of unseen entities and effectively mitigates data sparsity.\n","authors":["Zihan Wang","Kai Zhao","Yongquan He","Zhumin Chen","Pengjie Ren","Maarten de Rijke","Zhaochun Ren"],"pdf_url":"https://arxiv.org/pdf/2305.10531v2.pdf","comment":"Accepted at CIKM 2023"},{"id":"http://arxiv.org/abs/2309.01453v1","updated":"2023-09-04T09:02:31Z","published":"2023-09-04T09:02:31Z","title":"Interactive Graph Convolutional Filtering","summary":" Interactive Recommender Systems (IRS) have been increasingly used in various\ndomains, including personalized article recommendation, social media, and\nonline advertising. However, IRS faces significant challenges in providing\naccurate recommendations under limited observations, especially in the context\nof interactive collaborative filtering. These problems are exacerbated by the\ncold start problem and data sparsity problem. Existing Multi-Armed Bandit\nmethods, despite their carefully designed exploration strategies, often\nstruggle to provide satisfactory results in the early stages due to the lack of\ninteraction data. Furthermore, these methods are computationally intractable\nwhen applied to non-linear models, limiting their applicability. To address\nthese challenges, we propose a novel method, the Interactive Graph\nConvolutional Filtering model. 
Our proposed method extends interactive\ncollaborative filtering into the graph model to enhance the performance of\ncollaborative filtering between users and items. We incorporate variational\ninference techniques to overcome the computational hurdles posed by non-linear\nmodels. Furthermore, we employ Bayesian meta-learning methods to effectively\naddress the cold-start problem and derive theoretical regret bounds for our\nproposed method, ensuring a robust performance guarantee. Extensive\nexperimental results on three real-world datasets validate our method and\ndemonstrate its superiority over existing baselines.\n","authors":["Jin Zhang","Defu Lian","Hong Xie","Yawen Li","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.01334v3","updated":"2023-09-04T08:58:04Z","published":"2022-10-25T12:08:14Z","title":"MemoNet: Memorizing All Cross Features' Representations Efficiently via\n Multi-Hash Codebook Network for CTR Prediction","summary":" New findings in natural language processing (NLP) demonstrate that the strong\nmemorization capability contributes a lot to the success of Large Language\nModels (LLM). This inspires us to explicitly bring an independent memory\nmechanism into CTR ranking model to learn and memorize cross features'\nrepresentations. In this paper, we propose multi-Hash Codebook NETwork (HCNet)\nas the memory mechanism for efficiently learning and memorizing representations\nof cross features in CTR tasks. HCNet uses a multi-hash codebook as the main\nmemory place and the whole memory procedure consists of three phases:\nmulti-hash addressing, memory restoring, and feature shrinking. We also propose\na new CTR model named MemoNet which combines HCNet with a DNN backbone.\nExtensive experimental results on three public datasets and online test show\nthat MemoNet reaches superior performance over state-of-the-art approaches.\nBesides, MemoNet shows scaling law of large language model in NLP, which means\nwe can enlarge the size of the codebook in HCNet to sustainably obtain\nperformance gains. Our work demonstrates the importance and feasibility of\nlearning and memorizing representations of cross features, which sheds light on\na new promising research direction.\n","authors":["Pengtao Zhang","Junlin Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.01334v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01441v1","updated":"2023-09-04T08:46:10Z","published":"2023-09-04T08:46:10Z","title":"This Is a Local Domain: On Amassing Country-Code Top-Level Domains from\n Public Data","summary":" Domain lists are a key ingredient for representative censuses of the Web.\nUnfortunately, such censuses typically lack a view on domains under\ncountry-code top-level domains (ccTLDs). This introduces unwanted bias: many\ncountries have a rich local Web that remains hidden if their ccTLDs are not\nconsidered. The reason ccTLDs are rarely considered is that gaining access --\nif possible at all -- is often laborious. To tackle this, we ask: what can we\nlearn about ccTLDs from public sources? We extract domain names under ccTLDs\nfrom 6 years of public data from Certificate Transparency logs and Common\nCrawl. We compare this against ground truth for 19 ccTLDs for which we have the\nfull DNS zone. We find that public data covers 43%-80% of these ccTLDs, and\nthat coverage grows over time. By also comparing port scan data we then show\nthat these public sources reveal a significant part of the Web presence under a\nccTLD. 
We conclude that in the absence of full access to ccTLDs, domain names\nlearned from public sources can be a good proxy when performing Web censuses.\n","authors":["Raffaele Sommese","Roland van Rijswijk-Deij","Mattijs Jonker"],"pdf_url":"https://arxiv.org/pdf/2309.01441v1.pdf","comment":"6 pages double-column, 4 figures; submitted to ACM SIGCOMM CCR"},{"id":"http://arxiv.org/abs/2307.05074v2","updated":"2023-09-04T08:10:03Z","published":"2023-07-11T07:16:22Z","title":"Retrieval-augmented GPT-3.5-based Text-to-SQL Framework with\n Sample-aware Prompting and Dynamic Revision Chain","summary":" Text-to-SQL aims at generating SQL queries for the given natural language\nquestions and thus helping users to query databases. Prompt learning with large\nlanguage models (LLMs) has emerged as a recent approach, which designs prompts\nto lead LLMs to understand the input question and generate the corresponding\nSQL. However, it faces challenges with strict SQL syntax requirements. Existing\nwork prompts the LLMs with a list of demonstration examples (i.e. question-SQL\npairs) to generate SQL, but the fixed prompts can hardly handle the scenario\nwhere the semantic gap between the retrieved demonstration and the input\nquestion is large. In this paper, we propose a retrieval-augmented prompting\nmethod for a LLM-based Text-to-SQL framework, involving sample-aware prompting\nand a dynamic revision chain. Our approach incorporates sample-aware\ndemonstrations, which include the composition of SQL operators and fine-grained\ninformation related to the given question. To retrieve questions sharing\nsimilar intents with input questions, we propose two strategies for assisting\nretrieval. Firstly, we leverage LLMs to simplify the original questions,\nunifying the syntax and thereby clarifying the users' intentions. To generate\nexecutable and accurate SQLs without human intervention, we design a dynamic\nrevision chain which iteratively adapts fine-grained feedback from the\npreviously generated SQL. Experimental results on three Text-to-SQL benchmarks\ndemonstrate the superiority of our method over strong baseline models.\n","authors":["Chunxi Guo","Zhiliang Tian","Jintao Tang","Shasha Li","Zhihua Wen","Kaixuan Wang","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2307.05074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01395v1","updated":"2023-09-04T06:47:46Z","published":"2023-09-04T06:47:46Z","title":"AVATAR: Robust Voice Search Engine Leveraging Autoregressive Document\n Retrieval and Contrastive Learning","summary":" Voice, as input, has progressively become popular on mobiles and seems to\ntranscend almost entirely text input. Through voice, the voice search (VS)\nsystem can provide a more natural way to meet user's information needs.\nHowever, errors from the automatic speech recognition (ASR) system can be\ncatastrophic to the VS system. Building on the recent advanced lightweight\nautoregressive retrieval model, which has the potential to be deployed on\nmobiles, leading to a more secure and personal VS assistant. This paper\npresents a novel study of VS leveraging autoregressive retrieval and tackles\nthe crucial problems facing VS, viz. the performance drop caused by ASR noise,\nvia data augmentations and contrastive learning, showing how explicit and\nimplicit modeling the noise patterns can alleviate the problems. 
A series of\nexperiments conducted on the Open-Domain Question Answering (ODSQA) confirm our\napproach's effectiveness and robustness in relation to some strong baseline\nsystems.\n","authors":["Yi-Cheng Wang","Tzu-Ting Yang","Hsin-Wei Wang","Bi-Cheng Yan","Berlin Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01373v1","updated":"2023-09-04T05:50:04Z","published":"2023-09-04T05:50:04Z","title":"PreprintResolver: Improving Citation Quality by Resolving Published\n Versions of ArXiv Preprints using Literature Databases","summary":" The growing impact of preprint servers enables the rapid sharing of\ntime-sensitive research. Likewise, it is becoming increasingly difficult to\ndistinguish high-quality, peer-reviewed research from preprints. Although\npreprints are often later published in peer-reviewed journals, this information\nis often missing from preprint servers. To overcome this problem, the\nPreprintResolver was developed, which uses four literature databases (DBLP,\nSemanticScholar, OpenAlex, and CrossRef / CrossCite) to identify\npreprint-publication pairs for the arXiv preprint server. The target audience\nfocuses on, but is not limited to inexperienced researchers and students,\nespecially from the field of computer science. The tool is based on a fuzzy\nmatching of author surnames, titles, and DOIs. Experiments were performed on a\nsample of 1,000 arXiv-preprints from the research field of computer science and\nwithout any publication information. With 77.94 %, computer science is highly\naffected by missing publication information in arXiv. The results show that the\nPreprintResolver was able to resolve 603 out of 1,000 (60.3 %) arXiv-preprints\nfrom the research field of computer science and without any publication\ninformation. All four literature databases contributed to the final result. In\na manual validation, a random sample of 100 resolved preprints was checked. For\nall preprints, at least one result is plausible. For nine preprints, more than\none result was identified, three of which are partially invalid. In conclusion\nthe PreprintResolver is suitable for individual, manually reviewed requests,\nbut less suitable for bulk requests. The PreprintResolver tool\n(https://preprintresolver.eu, Available from 2023-08-01) and source code\n(https://gitlab.com/ippolis_wp3/preprint-resolver, Accessed: 2023-07-19) is\navailable online.\n","authors":["Louise Bloch","Johannes Rückert","Christoph M. Friedrich"],"pdf_url":"https://arxiv.org/pdf/2309.01373v1.pdf","comment":"Accepted for International Conference on Theory and Practice of\n Digital Libraries (TPDL 2023)"},{"id":"http://arxiv.org/abs/2309.01370v1","updated":"2023-09-04T05:36:58Z","published":"2023-09-04T05:36:58Z","title":"ReOnto: A Neuro-Symbolic Approach for Biomedical Relation Extraction","summary":" Relation Extraction (RE) is the task of extracting semantic relationships\nbetween entities in a sentence and aligning them to relations defined in a\nvocabulary, which is generally in the form of a Knowledge Graph (KG) or an\nontology. Various approaches have been proposed so far to address this task.\nHowever, applying these techniques to biomedical text often yields\nunsatisfactory results because it is hard to infer relations directly from\nsentences due to the nature of the biomedical relations. To address these\nissues, we present a novel technique called ReOnto, that makes use of neuro\nsymbolic knowledge for the RE task. 
ReOnto employs a graph neural network to\nacquire the sentence representation and leverages publicly accessible\nontologies as prior knowledge to identify the sentential relation between two\nentities. The approach involves extracting the relation path between the two\nentities from the ontology. We evaluate the effect of using symbolic knowledge\nfrom ontologies with graph neural networks. Experimental results on two public\nbiomedical datasets, BioRel and ADE, show that our method outperforms all the\nbaselines (approximately by 3\\%).\n","authors":["Monika Jain","Kuldeep Singh","Raghava Mutharaju"],"pdf_url":"https://arxiv.org/pdf/2309.01370v1.pdf","comment":"Accepted in ECML 2023"},{"id":"http://arxiv.org/abs/2309.01343v1","updated":"2023-09-04T04:02:04Z","published":"2023-09-04T04:02:04Z","title":"Distributional Domain-Invariant Preference Matching for Cross-Domain\n Recommendation","summary":" Learning accurate cross-domain preference mappings in the absence of\noverlapped users/items has presented a persistent challenge in Non-overlapping\nCross-domain Recommendation (NOCDR). Despite the efforts made in previous\nstudies to address NOCDR, several limitations still exist. Specifically, 1)\nwhile some approaches substitute overlapping users/items with overlapping\nbehaviors, they cannot handle NOCDR scenarios where such auxiliary information\nis unavailable; 2) often, cross-domain preference mapping is modeled by\nlearning deterministic explicit representation matchings between sampled users\nin two domains. However, this can be biased due to individual preferences and\nthus fails to incorporate preference continuity and universality of the general\npopulation. In light of this, we assume that despite the scattered nature of\nuser behaviors, there exists a consistent latent preference distribution shared\namong common people. Modeling such distributions further allows us to capture\nthe continuity in user behaviors within each domain and discover preference\ninvariance across domains. To this end, we propose a Distributional\ndomain-invariant Preference Matching method for non-overlapping Cross-Domain\nRecommendation (DPMCDR). For each domain, we hierarchically approximate a\nposterior of domain-level preference distribution with empirical evidence\nderived from user-item interactions. Next, we aim to build distributional\nimplicit matchings between the domain-level preferences of two domains. This\nprocess involves mapping them to a shared latent space and seeking a consensus\non domain-invariant preference by minimizing the distance between their\ndistributional representations therein. In this way, we can identify the\nalignment of two non-overlapping domains if they exhibit similar patterns of\ndomain-invariant preference.\n","authors":["Jing Du","Zesheng Ye","Bin Guo","Zhiwen Yu","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2309.01343v1.pdf","comment":"9 pages, 5 figures, full research paper accepted by ICDM 2023"},{"id":"http://arxiv.org/abs/2309.01335v1","updated":"2023-09-04T03:34:54Z","published":"2023-09-04T03:34:54Z","title":"In-processing User Constrained Dominant Sets for User-Oriented Fairness\n in Recommender Systems","summary":" Recommender systems are typically biased toward a small group of users,\nleading to severe unfairness in recommendation performance, i.e., User-Oriented\nFairness (UOF) issue. 
The existing research on UOF is limited and fails to deal\nwith the root cause of the UOF issue: the learning process between advantaged\nand disadvantaged users is unfair. To tackle this issue, we propose an\nIn-processing User Constrained Dominant Sets (In-UCDS) framework, which is a\ngeneral framework that can be applied to any backbone recommendation model to\nachieve user-oriented fairness. We split In-UCDS into two stages, i.e., the\nUCDS modeling stage and the in-processing training stage. In the UCDS modeling\nstage, for each disadvantaged user, we extract a constrained dominant set (a\nuser cluster) containing some advantaged users that are similar to it. In the\nin-processing training stage, we move the representations of disadvantaged\nusers closer to their corresponding cluster by calculating a fairness loss. By\ncombining the fairness loss with the original backbone model loss, we address\nthe UOF issue and maintain the overall recommendation performance\nsimultaneously. Comprehensive experiments on three real-world datasets\ndemonstrate that In-UCDS outperforms the state-of-the-art methods, leading to a\nfairer model with better overall recommendation performance.\n","authors":["Zhongxuan Han","Chaochao Chen","Xiaolin Zheng","Weiming Liu","Jun Wang","Wenjie Cheng","Yuyuan Li"],"pdf_url":"https://arxiv.org/pdf/2309.01335v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.01860v1","updated":"2023-09-04T23:31:29Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14214v4","updated":"2023-09-04T22:48:48Z","published":"2022-12-29T08:23:39Z","title":"Backward Curriculum Reinforcement Learning","summary":" Current reinforcement learning algorithms train an agent using\nforward-generated trajectories, which provide little guidance so that the agent\ncan explore as much as possible. While realizing the value of reinforcement\nlearning results from sufficient exploration, this approach leads to a\ntrade-off in losing sample efficiency, an essential factor impacting algorithm\nperformance. Previous tasks use reward-shaping techniques and network structure\nmodification to increase sample efficiency. However, these methods require many\nsteps to implement. 
In this work, we propose novel backward curriculum\nreinforcement learning that begins training the agent using the backward\ntrajectory of the episode instead of the original forward trajectory. This\napproach provides the agent with a strong reward signal, enabling more\nsample-efficient learning. Moreover, our method only requires a minor change in\nthe algorithm of reversing the order of the trajectory before agent training,\nallowing a straightforward application to any state-of-the-art algorithm.\n","authors":["KyungMin Ko"],"pdf_url":"https://arxiv.org/pdf/2212.14214v4.pdf","comment":"In the proceedings of the 32nd IEEE International Conference on Robot\n and Human Interactive Communication (IEEE RO-MAN 2023)"},{"id":"http://arxiv.org/abs/2309.01838v1","updated":"2023-09-04T22:25:49Z","published":"2023-09-04T22:25:49Z","title":"Efficient Defense Against Model Stealing Attacks on Convolutional Neural\n Networks","summary":" Model stealing attacks have become a serious concern for deep learning\nmodels, where an attacker can steal a trained model by querying its black-box\nAPI. This can lead to intellectual property theft and other security and\nprivacy risks. The current state-of-the-art defenses against model stealing\nattacks suggest adding perturbations to the prediction probabilities. However,\nthey suffer from heavy computations and make impracticable assumptions about\nthe adversary. They often require the training of auxiliary models. This can be\ntime-consuming and resource-intensive which hinders the deployment of these\ndefenses in real-world applications. In this paper, we propose a simple yet\neffective and efficient defense alternative. We introduce a heuristic approach\nto perturb the output probabilities. The proposed defense can be easily\nintegrated into models without additional training. We show that our defense is\neffective in defending against three state-of-the-art stealing attacks. We\nevaluate our approach on large and quantized (i.e., compressed) Convolutional\nNeural Networks (CNNs) trained on several vision datasets. Our technique\noutperforms the state-of-the-art defenses with a $\\times37$ faster inference\nlatency without requiring any additional model and with a low impact on the\nmodel's performance. We validate that our defense is also effective for\nquantized CNNs targeting edge devices.\n","authors":["Kacem Khaled","Mouna Dhaouadi","Felipe Gohring de Magalhães","Gabriela Nicolescu"],"pdf_url":"https://arxiv.org/pdf/2309.01838v1.pdf","comment":"Accepted for publication at 2023 International Conference on Machine\n Learning and Applications (ICMLA)"},{"id":"http://arxiv.org/abs/2309.01837v1","updated":"2023-09-04T22:16:35Z","published":"2023-09-04T22:16:35Z","title":"Delegating Data Collection in Decentralized Machine Learning","summary":" Motivated by the emergence of decentralized machine learning ecosystems, we\nstudy the delegation of data collection. Taking the field of contract theory as\nour starting point, we design optimal and near-optimal contracts that deal with\ntwo fundamental machine learning challenges: lack of certainty in the\nassessment of model quality and lack of knowledge regarding the optimal\nperformance of any model. We show that lack of certainty can be dealt with via\nsimple linear contracts that achieve 1-1/e fraction of the first-best utility,\neven if the principal has a small test set. 
Furthermore, we give sufficient\nconditions on the size of the principal's test set that achieves a vanishing\nadditive approximation to the optimal utility. To address the lack of a priori\nknowledge regarding the optimal performance, we give a convex program that can\nadaptively and efficiently compute the optimal contract.\n","authors":["Nivasini Ananthakrishnan","Stephen Bates","Michael I. Jordan","Nika Haghtalab"],"pdf_url":"https://arxiv.org/pdf/2309.01837v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13570v4","updated":"2023-09-04T22:00:31Z","published":"2023-08-25T05:52:41Z","title":"Stochastic Configuration Machines for Industrial Artificial Intelligence","summary":" Real-time predictive modelling with desired accuracy is highly expected in\nindustrial artificial intelligence (IAI), where neural networks play a key\nrole. Neural networks in IAI require powerful, high-performance computing\ndevices to operate a large number of floating point data. Based on stochastic\nconfiguration networks (SCNs), this paper proposes a new randomized learner\nmodel, termed stochastic configuration machines (SCMs), to stress effective\nmodelling and data size saving that are useful and valuable for industrial\napplications. Compared to SCNs and random vector functional-link (RVFL) nets\nwith binarized implementation, the model storage of SCMs can be significantly\ncompressed while retaining favourable prediction performance. Besides the\narchitecture of the SCM learner model and its learning algorithm, as an\nimportant part of this contribution, we also provide a theoretical basis on the\nlearning capacity of SCMs by analysing the model's complexity. Experimental\nstudies are carried out over some benchmark datasets and three industrial\napplications. The results demonstrate that SCM has great potential for dealing\nwith industrial data analytics.\n","authors":["Dianhui Wang","Matthew J. Felicetti"],"pdf_url":"https://arxiv.org/pdf/2308.13570v4.pdf","comment":"23 pages, 7 figures, 12 tables"},{"id":"http://arxiv.org/abs/2309.01829v1","updated":"2023-09-04T21:46:24Z","published":"2023-09-04T21:46:24Z","title":"Soft-Dropout: A Practical Approach for Mitigating Overfitting in Quantum\n Convolutional Neural Networks","summary":" Quantum convolutional neural network (QCNN), an early application for quantum\ncomputers in the NISQ era, has been consistently proven successful as a machine\nlearning (ML) algorithm for several tasks with significant accuracy. Derived\nfrom its classical counterpart, QCNN is prone to overfitting. Overfitting is a\ntypical shortcoming of ML models that are trained too closely to the availed\ntraining dataset and perform relatively poorly on unseen datasets for a similar\nproblem. In this work, we study the adaptation of one of the most successful\noverfitting mitigation methods, known as the (post-training) dropout method, to\nthe quantum setting. We find that a straightforward implementation of this\nmethod in the quantum setting leads to a significant and undesirable\nconsequence: a substantial decrease in success probability of the QCNN. We\nargue that this effect exposes the crucial role of entanglement in QCNNs and\nthe vulnerability of QCNNs to entanglement loss. To handle overfitting, we\npropose a softer version of the dropout method. 
We find that the proposed\nmethod allows us to handle successfully overfitting in the test cases.\n","authors":["Aakash Ravindra Shinde","Charu Jain","Amir Kalev"],"pdf_url":"https://arxiv.org/pdf/2309.01829v1.pdf","comment":"9 pages, 14 images, 6 tables"},{"id":"http://arxiv.org/abs/2309.01828v1","updated":"2023-09-04T21:36:46Z","published":"2023-09-04T21:36:46Z","title":"Secure and Efficient Federated Learning in LEO Constellations using\n Decentralized Key Generation and On-Orbit Model Aggregation","summary":" Satellite technologies have advanced drastically in recent years, leading to\na heated interest in launching small satellites into low Earth orbit (LEOs) to\ncollect massive data such as satellite imagery. Downloading these data to a\nground station (GS) to perform centralized learning to build an AI model is not\npractical due to the limited and expensive bandwidth. Federated learning (FL)\noffers a potential solution but will incur a very large convergence delay due\nto the highly sporadic and irregular connectivity between LEO satellites and\nGS. In addition, there are significant security and privacy risks where\neavesdroppers or curious servers/satellites may infer raw data from satellites'\nmodel parameters transmitted over insecure communication channels. To address\nthese issues, this paper proposes FedSecure, a secure FL approach designed for\nLEO constellations, which consists of two novel components: (1) decentralized\nkey generation that protects satellite data privacy using a functional\nencryption scheme, and (2) on-orbit model forwarding and aggregation that\ngenerates a partial global model per orbit to minimize the idle waiting time\nfor invisible satellites to enter the visible zone of the GS. Our analysis and\nresults show that FedSecure preserves the privacy of each satellite's data\nagainst eavesdroppers, a curious server, or curious satellites. It is\nlightweight with significantly lower communication and computation overheads\nthan other privacy-preserving FL aggregation approaches. It also reduces\nconvergence delay drastically from days to only a few hours, yet achieving high\naccuracy of up to 85.35% using realistic satellite images.\n","authors":["Mohamed Elmahallawy","Tie Luo","Mohamed I. Ibrahem"],"pdf_url":"https://arxiv.org/pdf/2309.01828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01825v1","updated":"2023-09-04T21:30:15Z","published":"2023-09-04T21:30:15Z","title":"LoopTune: Optimizing Tensor Computations with Reinforcement Learning","summary":" Advanced compiler technology is crucial for enabling machine learning\napplications to run on novel hardware, but traditional compilers fail to\ndeliver performance, popular auto-tuners have long search times and\nexpert-optimized libraries introduce unsustainable costs. To address this, we\ndeveloped LoopTune, a deep reinforcement learning compiler that optimizes\ntensor computations in deep learning models for the CPU. LoopTune optimizes\ntensor traversal order while using the ultra-fast lightweight code generator\nLoopNest to perform hardware-specific optimizations. With a novel graph-based\nrepresentation and action space, LoopTune speeds up LoopNest by 3.2x,\ngenerating an order of magnitude faster code than TVM, 2.8x faster than\nMetaSchedule, and 1.08x faster than AutoTVM, consistently performing at the\nlevel of the hand-tuned library Numpy. 
Moreover, LoopTune tunes code in order\nof seconds.\n","authors":["Dejan Grubisic","Bram Wasti","Chris Cummins","John Mellor-Crummey","Aleksandar Zlateski"],"pdf_url":"https://arxiv.org/pdf/2309.01825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01824v1","updated":"2023-09-04T21:26:26Z","published":"2023-09-04T21:26:26Z","title":"On the fly Deep Neural Network Optimization Control for Low-Power\n Computer Vision","summary":" Processing visual data on mobile devices has many applications, e.g.,\nemergency response and tracking. State-of-the-art computer vision techniques\nrely on large Deep Neural Networks (DNNs) that are usually too power-hungry to\nbe deployed on resource-constrained edge devices. Many techniques improve the\nefficiency of DNNs by using sparsity or quantization. However, the accuracy and\nefficiency of these techniques cannot be adapted for diverse edge applications\nwith different hardware constraints and accuracy requirements. This paper\npresents a novel technique to allow DNNs to adapt their accuracy and energy\nconsumption during run-time, without the need for any re-training. Our\ntechnique called AdaptiveActivation introduces a hyper-parameter that controls\nthe output range of the DNNs' activation function to dynamically adjust the\nsparsity and precision in the DNN. AdaptiveActivation can be applied to any\nexisting pre-trained DNN to improve their deployability in diverse edge\nenvironments. We conduct experiments on popular edge devices and show that the\naccuracy is within 1.5% of the baseline. We also show that our approach\nrequires 10%--38% less memory than the baseline techniques leading to more\naccuracy-efficiency tradeoff options\n","authors":["Ishmeet Kaur","Adwaita Janardhan Jadhav"],"pdf_url":"https://arxiv.org/pdf/2309.01824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01816v1","updated":"2023-09-04T21:10:45Z","published":"2023-09-04T21:10:45Z","title":"Computation and Communication Efficient Federated Learning over Wireless\n Networks","summary":" Federated learning (FL) allows model training from local data by edge devices\nwhile preserving data privacy. However, the learning accuracy decreases due to\nthe heterogeneity of devices data, and the computation and communication\nlatency increase when updating large scale learning models on devices with\nlimited computational capability and wireless resources. To overcome these\nchallenges, we consider a novel FL framework with partial model pruning and\npersonalization. This framework splits the learning model into a global part\nwith model pruning shared with all devices to learn data representations and a\npersonalized part to be fine tuned for a specific device, which adapts the\nmodel size during FL to reduce both computation and communication overhead and\nminimize the overall training time, and increases the learning accuracy for the\ndevice with non independent and identically distributed (non IID) data. Then,\nthe computation and communication latency and the convergence analysis of the\nproposed FL framework are mathematically analyzed. Based on the convergence\nanalysis, an optimization problem is formulated to maximize the convergence\nrate under a latency threshold by jointly optimizing the pruning ratio and\nwireless resource allocation. By decoupling the optimization problem and\ndeploying Karush Kuhn Tucker (KKT) conditions, we derive the closed form\nsolutions of pruning ratio and wireless resource allocation. 
Finally,\nexperimental results demonstrate that the proposed FL framework achieves a\nremarkable reduction of approximately 50 percent in computation and communication\nlatency compared with the scheme only with model personalization.\n","authors":["Xiaonan Liu","Tharmalingam Ratnarajah"],"pdf_url":"https://arxiv.org/pdf/2309.01816v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.09042"},{"id":"http://arxiv.org/abs/2309.01808v1","updated":"2023-09-04T20:52:33Z","published":"2023-09-04T20:52:33Z","title":"DiscoverPath: A Knowledge Refinement and Retrieval System for\n Interdisciplinarity on Biomedical Research","summary":" The exponential growth in scholarly publications necessitates advanced tools\nfor efficient article retrieval, especially in interdisciplinary fields where\ndiverse terminologies are used to describe similar research. Traditional\nkeyword-based search engines often fall short in assisting users who may not be\nfamiliar with specific terminologies. To address this, we present a knowledge\ngraph-based paper search engine for biomedical research to enhance the user\nexperience in discovering relevant queries and articles. The system, dubbed\nDiscoverPath, employs Named Entity Recognition (NER) and part-of-speech (POS)\ntagging to extract terminologies and relationships from article abstracts to\ncreate a KG. To reduce information overload, DiscoverPath presents users with a\nfocused subgraph containing the queried entity and its neighboring nodes and\nincorporates a query recommendation system, enabling users to iteratively\nrefine their queries. The system is equipped with an accessible Graphical User\nInterface that provides an intuitive visualization of the KG, query\nrecommendations, and detailed article information, enabling efficient article\nretrieval, thus fostering interdisciplinary knowledge exploration. DiscoverPath\nis open-sourced at https://github.com/ynchuang/DiscoverPath.\n","authors":["Yu-Neng Chuang","Guanchu Wang","Chia-Yuan Chang","Kwei-Herng Lai","Daochen Zha","Ruixiang Tang","Fan Yang","Alfredo Costilla Reyes","Kaixiong Zhou","Xiaoqian Jiang","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2309.01808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01807v1","updated":"2023-09-04T20:52:04Z","published":"2023-09-04T20:52:04Z","title":"Marginalized Importance Sampling for Off-Environment Policy Evaluation","summary":" Reinforcement Learning (RL) methods are typically sample-inefficient, making\nit challenging to train and deploy RL-policies in real world robots. Even a\nrobust policy trained in simulation requires a real-world deployment to assess\nits performance. This paper proposes a new approach to evaluate the\nreal-world performance of agent policies without deploying them in the real\nworld. The proposed approach incorporates a simulator along with real-world\noffline data to evaluate the performance of any policy using the framework of\nMarginalized Importance Sampling (MIS). Existing MIS methods face two\nchallenges: (1) large density ratios that deviate from a reasonable range and\n(2) indirect supervision, where the ratio needs to be inferred indirectly, thus\nexacerbating estimation error. Our approach addresses these challenges by\nintroducing the target policy's occupancy in the simulator as an intermediate\nvariable and learning the density ratio as the product of two terms that can be\nlearned separately. 
The first term is learned with direct supervision and the\nsecond term has a small magnitude, thus making it easier to run. We analyze the\nsample complexity as well as error propagation of our two step-procedure.\nFurthermore, we empirically evaluate our approach on Sim2Sim environments such\nas Cartpole, Reacher and Half-Cheetah. Our results show that our method\ngeneralizes well across a variety of Sim2Sim gap, target policies and offline\ndata collection policies. We also demonstrate the performance of our algorithm\non a Sim2Real task of validating the performance of a 7 DOF robotic arm using\noffline data along with a gazebo based arm simulator.\n","authors":["Pulkit Katdare","Nan Jiang","Katherine Driggs-Campbell"],"pdf_url":"https://arxiv.org/pdf/2309.01807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01796v1","updated":"2023-09-04T20:23:35Z","published":"2023-09-04T20:23:35Z","title":"Asymmetric matrix sensing by gradient descent with small random\n initialization","summary":" We study matrix sensing, which is the problem of reconstructing a low-rank\nmatrix from a few linear measurements. It can be formulated as an\noverparameterized regression problem, which can be solved by factorized\ngradient descent when starting from a small random initialization.\n Linear neural networks, and in particular matrix sensing by factorized\ngradient descent, serve as prototypical models of non-convex problems in modern\nmachine learning, where complex phenomena can be disentangled and studied in\ndetail. Much research has been devoted to studying special cases of asymmetric\nmatrix sensing, such as asymmetric matrix factorization and symmetric positive\nsemi-definite matrix sensing.\n Our key contribution is introducing a continuous differential equation that\nwe call the $\\textit{perturbed gradient flow}$. We prove that the perturbed\ngradient flow converges quickly to the true target matrix whenever the\nperturbation is sufficiently bounded. The dynamics of gradient descent for\nmatrix sensing can be reduced to this formulation, yielding a novel proof of\nasymmetric matrix sensing with factorized gradient descent. Compared to\ndirectly analyzing the dynamics of gradient descent, the continuous formulation\nallows bounding key quantities by considering their derivatives, often\nsimplifying the proofs. We believe the general proof technique may prove useful\nin other settings as well.\n","authors":["Johan S. Wind"],"pdf_url":"https://arxiv.org/pdf/2309.01796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01795v1","updated":"2023-09-04T20:22:57Z","published":"2023-09-04T20:22:57Z","title":"Composite federated learning with heterogeneous data","summary":" We propose a novel algorithm for solving the composite Federated Learning\n(FL) problem. This algorithm manages non-smooth regularization by strategically\ndecoupling the proximal operator and communication, and addresses client drift\nwithout any assumptions about data similarity. Moreover, each worker uses local\nupdates to reduce the communication frequency with the server and transmits\nonly a $d$-dimensional vector per communication round. 
We prove that our\nalgorithm converges linearly to a neighborhood of the optimal solution and\ndemonstrate the superiority of our algorithm over state-of-the-art methods in\nnumerical experiments.\n","authors":["Jiaojiao Zhang","Jiang Hu","Mikael Johansson"],"pdf_url":"https://arxiv.org/pdf/2309.01795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.11777v2","updated":"2023-09-04T20:15:54Z","published":"2022-11-18T20:39:51Z","title":"Dataset of Pathloss and ToA Radio Maps With Localization Application","summary":" In this article, we present a collection of radio map datasets in dense urban\nsetting, which we generated and made publicly available. The datasets include\nsimulated pathloss/received signal strength (RSS) and time of arrival (ToA)\nradio maps over a large collection of realistic dense urban setting in real\ncity maps. The two main applications of the presented dataset are 1) learning\nmethods that predict the pathloss from input city maps (namely, deep\nlearning-based simulations), and, 2) wireless localization. The fact that the\nRSS and ToA maps are computed by the same simulations over the same city maps\nallows for a fair comparison of the RSS and ToA-based localization methods.\n","authors":["Çağkan Yapar","Ron Levie","Gitta Kutyniok","Giuseppe Caire"],"pdf_url":"https://arxiv.org/pdf/2212.11777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.01567v4","updated":"2023-09-04T20:15:15Z","published":"2021-02-02T15:48:19Z","title":"A Lyapunov Theory for Finite-Sample Guarantees of Asynchronous\n Q-Learning and TD-Learning Variants","summary":" This paper develops an unified framework to study finite-sample convergence\nguarantees of a large class of value-based asynchronous reinforcement learning\n(RL) algorithms. We do this by first reformulating the RL algorithms as\n\\textit{Markovian Stochastic Approximation} (SA) algorithms to solve\nfixed-point equations. We then develop a Lyapunov analysis and derive\nmean-square error bounds on the convergence of the Markovian SA. Based on this\nresult, we establish finite-sample mean-square convergence bounds for\nasynchronous RL algorithms such as $Q$-learning, $n$-step TD, TD$(\\lambda)$,\nand off-policy TD algorithms including V-trace. As a by-product, by analyzing\nthe convergence bounds of $n$-step TD and TD$(\\lambda)$, we provide theoretical\ninsights into the bias-variance trade-off, i.e., efficiency of bootstrapping in\nRL. This was first posed as an open problem in (Sutton, 1999).\n","authors":["Zaiwei Chen","Siva Theja Maguluri","Sanjay Shakkottai","Karthikeyan Shanmugam"],"pdf_url":"https://arxiv.org/pdf/2102.01567v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01788v1","updated":"2023-09-04T19:59:51Z","published":"2023-09-04T19:59:51Z","title":"Hierarchical Grammar-Induced Geometry for Data-Efficient Molecular\n Property Prediction","summary":" The prediction of molecular properties is a crucial task in the field of\nmaterial and drug discovery. The potential benefits of using deep learning\ntechniques are reflected in the wealth of recent literature. Still, these\ntechniques are faced with a common challenge in practice: Labeled data are\nlimited by the cost of manual extraction from literature and laborious\nexperimentation. In this work, we propose a data-efficient property predictor\nby utilizing a learnable hierarchical molecular grammar that can generate\nmolecules from grammar production rules. 
Such a grammar induces an explicit\ngeometry of the space of molecular graphs, which provides an informative prior\non molecular structural similarity. The property prediction is performed using\ngraph neural diffusion over the grammar-induced geometry. On both small and\nlarge datasets, our evaluation shows that this approach outperforms a wide\nspectrum of baselines, including supervised and pre-trained graph neural\nnetworks. We include a detailed ablation study and further analysis of our\nsolution, showing its effectiveness in cases with extremely limited data. Code\nis available at https://github.com/gmh14/Geo-DEG.\n","authors":["Minghao Guo","Veronika Thost","Samuel W Song","Adithya Balachandran","Payel Das","Jie Chen","Wojciech Matusik"],"pdf_url":"https://arxiv.org/pdf/2309.01788v1.pdf","comment":"22 pages, 10 figures; ICML 2023"},{"id":"http://arxiv.org/abs/2309.01784v1","updated":"2023-09-04T19:56:18Z","published":"2023-09-04T19:56:18Z","title":"ATMS: Algorithmic Trading-Guided Market Simulation","summary":" The effective construction of an Algorithmic Trading (AT) strategy often\nrelies on market simulators, which remains challenging due to existing methods'\ninability to adapt to the sequential and dynamic nature of trading activities.\nThis work fills this gap by proposing a metric to quantify market discrepancy.\nThis metric measures the difference between a causal effect from underlying\nmarket unique characteristics and it is evaluated through the interaction\nbetween the AT agent and the market. Most importantly, we introduce Algorithmic\nTrading-guided Market Simulation (ATMS) by optimizing our proposed metric.\nInspired by SeqGAN, ATMS formulates the simulator as a stochastic policy in\nreinforcement learning (RL) to account for the sequential nature of trading.\nMoreover, ATMS utilizes the policy gradient update to bypass differentiating\nthe proposed metric, which involves non-differentiable operations such as order\ndeletion from the market. Through extensive experiments on semi-real market\ndata, we demonstrate the effectiveness of our metric and show that ATMS\ngenerates market data with improved similarity to reality compared to the\nstate-of-the-art conditional Wasserstein Generative Adversarial Network (cWGAN)\napproach. Furthermore, ATMS produces market data with more balanced BUY and\nSELL volumes, mitigating the bias of the cWGAN baseline approach, where a\nsimple strategy can exploit the BUY/SELL imbalance for profit.\n","authors":["Song Wei","Andrea Coletta","Svitlana Vyetrenko","Tucker Balch"],"pdf_url":"https://arxiv.org/pdf/2309.01784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01783v1","updated":"2023-09-04T19:48:56Z","published":"2023-09-04T19:48:56Z","title":"Survival Prediction from Imbalance colorectal cancer dataset using\n hybrid sampling methods and tree-based classifiers","summary":" Background and Objective: Colorectal cancer is a high mortality cancer.\nClinical data analysis plays a crucial role in predicting the survival of\ncolorectal cancer patients, enabling clinicians to make informed treatment\ndecisions. However, utilizing clinical data can be challenging, especially when\ndealing with imbalanced outcomes. This paper focuses on developing algorithms\nto predict 1-, 3-, and 5-year survival of colorectal cancer patients using\nclinical datasets, with particular emphasis on the highly imbalanced 1-year\nsurvival prediction task. 
To address this issue, we propose a method that\ncreates a pipeline of some of standard balancing techniques to increase the\ntrue positive rate. Evaluation is conducted on a colorectal cancer dataset from\nthe SEER database. Methods: The pre-processing step consists of removing\nrecords with missing values and merging categories. The minority class of\n1-year and 3-year survival tasks consists of 10% and 20% of the data,\nrespectively. Edited Nearest Neighbor, Repeated edited nearest neighbor (RENN),\nSynthetic Minority Over-sampling Techniques (SMOTE), and pipelines of SMOTE and\nRENN approaches were used and compared for balancing the data with tree-based\nclassifiers. Decision Trees, Random Forest, Extra Tree, eXtreme Gradient\nBoosting, and Light Gradient Boosting (LGBM) are used in this article. Method.\nResults: The performance evaluation utilizes a 5-fold cross-validation\napproach. In the case of highly imbalanced datasets (1-year), our proposed\nmethod with LGBM outperforms other sampling methods with the sensitivity of\n72.30%. For the task of imbalance (3-year survival), the combination of RENN\nand LGBM achieves a sensitivity of 80.81%, indicating that our proposed method\nworks best for highly imbalanced datasets. Conclusions: Our proposed method\nsignificantly improves mortality prediction for the minority class of\ncolorectal cancer patients.\n","authors":["Sadegh Soleimani","Mahsa Bahrami","Mansour Vali"],"pdf_url":"https://arxiv.org/pdf/2309.01783v1.pdf","comment":"19 Pages, 6 Figures, 4 Tables"},{"id":"http://arxiv.org/abs/2309.01782v1","updated":"2023-09-04T19:48:17Z","published":"2023-09-04T19:48:17Z","title":"3D View Prediction Models of the Dorsal Visual Stream","summary":" Deep neural network representations align well with brain activity in the\nventral visual stream. However, the primate visual system has a distinct dorsal\nprocessing stream with different functional properties. To test if a model\ntrained to perceive 3D scene geometry aligns better with neural responses in\ndorsal visual areas, we trained a self-supervised geometry-aware recurrent\nneural network (GRNN) to predict novel camera views using a 3D feature memory.\nWe compared GRNN to self-supervised baseline models that have been shown to\nalign well with ventral regions using the large-scale fMRI Natural Scenes\nDataset (NSD). We found that while the baseline models accounted better for\nventral brain regions, GRNN accounted for a greater proportion of variance in\ndorsal brain regions. Our findings demonstrate the potential for using\ntask-relevant models to probe representational differences across visual\nstreams.\n","authors":["Gabriel Sarch","Hsiao-Yu Fish Tung","Aria Wang","Jacob Prince","Michael Tarr"],"pdf_url":"https://arxiv.org/pdf/2309.01782v1.pdf","comment":"2023 Conference on Cognitive Computational Neuroscience"},{"id":"http://arxiv.org/abs/2309.01781v1","updated":"2023-09-04T19:47:04Z","published":"2023-09-04T19:47:04Z","title":"Self-concordant Smoothing for Convex Composite Optimization","summary":" We introduce the notion of self-concordant smoothing for minimizing the sum\nof two convex functions: the first is smooth and the second may be nonsmooth.\nOur framework results naturally from the smoothing approximation technique\nreferred to as partial smoothing in which only a part of the nonsmooth function\nis smoothed. 
The key highlight of our approach is in a natural property of the\nresulting problem's structure which provides us with a variable-metric\nselection method and a step-length selection rule particularly suitable for\nproximal Newton-type algorithms. In addition, we efficiently handle specific\nstructures promoted by the nonsmooth function, such as $\\ell_1$-regularization\nand group-lasso penalties. We prove local quadratic convergence rates for two\nresulting algorithms: Prox-N-SCORE, a proximal Newton algorithm and\nProx-GGN-SCORE, a proximal generalized Gauss-Newton (GGN) algorithm. The\nProx-GGN-SCORE algorithm highlights an important approximation procedure which\nhelps to significantly reduce most of the computational overhead associated\nwith the inverse Hessian. This approximation is essentially useful for\noverparameterized machine learning models and in the mini-batch settings.\nNumerical examples on both synthetic and real datasets demonstrate the\nefficiency of our approach and its superiority over existing approaches.\n","authors":["Adeyemi D. Adeoye","Alberto Bemporad"],"pdf_url":"https://arxiv.org/pdf/2309.01781v1.pdf","comment":"37 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.01780v1","updated":"2023-09-04T19:45:18Z","published":"2023-09-04T19:45:18Z","title":"Measuring, Interpreting, and Improving Fairness of Algorithms using\n Causal Inference and Randomized Experiments","summary":" Algorithm fairness has become a central problem for the broad adoption of\nartificial intelligence. Although the past decade has witnessed an explosion of\nexcellent work studying algorithm biases, achieving fairness in real-world AI\nproduction systems has remained a challenging task. Most existing works fail to\nexcel in practical applications since either they have conflicting measurement\ntechniques and/ or heavy assumptions, or require code-access of the production\nmodels, whereas real systems demand an easy-to-implement measurement framework\nand a systematic way to correct the detected sources of bias.\n In this paper, we leverage recent advances in causal inference and\ninterpretable machine learning to present an algorithm-agnostic framework\n(MIIF) to Measure, Interpret, and Improve the Fairness of an algorithmic\ndecision. We measure the algorithm bias using randomized experiments, which\nenables the simultaneous measurement of disparate treatment, disparate impact,\nand economic value. Furthermore, using modern interpretability techniques, we\ndevelop an explainable machine learning model which accurately interprets and\ndistills the beliefs of a blackbox algorithm. Altogether, these techniques\ncreate a simple and powerful toolset for studying algorithm fairness,\nespecially for understanding the cost of fairness in practical applications\nlike e-commerce and targeted advertising, where industry A/B testing is already\nabundant.\n","authors":["James Enouen","Tianshu Sun","Yan Liu"],"pdf_url":"https://arxiv.org/pdf/2309.01780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01779v1","updated":"2023-09-04T19:40:58Z","published":"2023-09-04T19:40:58Z","title":"DRAG: Divergence-based Adaptive Aggregation in Federated learning on\n Non-IID Data","summary":" Local stochastic gradient descent (SGD) is a fundamental approach in\nachieving communication efficiency in Federated Learning (FL) by allowing\nindividual workers to perform local updates. 
However, the presence of\nheterogeneous data distributions across working nodes causes each worker to\nupdate its local model towards a local optimum, leading to the phenomenon known\nas ``client-drift\" and resulting in slowed convergence. To address this issue,\nprevious works have explored methods that either introduce communication\noverhead or suffer from unsteady performance. In this work, we introduce a\nnovel metric called ``degree of divergence,\" quantifying the angle between the\nlocal gradient and the global reference direction. Leveraging this metric, we\npropose the divergence-based adaptive aggregation (DRAG) algorithm, which\ndynamically ``drags\" the received local updates toward the reference direction\nin each round without requiring extra communication overhead. Furthermore, we\nestablish a rigorous convergence analysis for DRAG, proving its ability to\nachieve a sublinear convergence rate. Compelling experimental results are\npresented to illustrate DRAG's superior performance compared to\nstate-of-the-art algorithms in effectively managing the client-drift\nphenomenon. Additionally, DRAG exhibits remarkable resilience against certain\nByzantine attacks. By securely sharing a small sample of the client's data with\nthe FL server, DRAG effectively counters these attacks, as demonstrated through\ncomprehensive experiments.\n","authors":["Feng Zhu","Jingjing Zhang","Shengyun Liu","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.01779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01778v1","updated":"2023-09-04T19:39:21Z","published":"2023-09-04T19:39:21Z","title":"CONFIDERAI: a novel CONFormal Interpretable-by-Design score function\n forExplainable and Reliable Artificial Intelligence","summary":" Everyday life is increasingly influenced by artificial intelligence, and\nthere is no question that machine learning algorithms must be designed to be\nreliable and trustworthy for everyone. Specifically, computer scientists\nconsider an artificial intelligence system safe and trustworthy if it fulfills\nfive pillars: explainability, robustness, transparency, fairness, and privacy.\nIn addition to these five, we propose a sixth fundamental aspect: conformity,\nthat is, the probabilistic assurance that the system will behave as the machine\nlearner expects. In this paper, we propose a methodology to link conformal\nprediction with explainable machine learning by defining CONFIDERAI, a new\nscore function for rule-based models that leverages both rules predictive\nability and points geometrical position within rules boundaries. We also\naddress the problem of defining regions in the feature space where conformal\nguarantees are satisfied by exploiting techniques to control the number of\nnon-conformal samples in conformal regions based on support vector data\ndescription (SVDD). 
The overall methodology is tested with promising results on\nbenchmark and real datasets, such as DNS tunneling detection or cardiovascular\ndisease prediction.\n","authors":["Alberto Carlevaro","Sara Narteni","Fabrizio Dabbene","Marco Muselli","Maurizio Mongelli"],"pdf_url":"https://arxiv.org/pdf/2309.01778v1.pdf","comment":"12 pages, 7 figures, 1 algorithm, international journal"},{"id":"http://arxiv.org/abs/2309.01775v1","updated":"2023-09-04T19:28:54Z","published":"2023-09-04T19:28:54Z","title":"Gated recurrent neural networks discover attention","summary":" Recent architectural developments have enabled recurrent neural networks\n(RNNs) to reach and even surpass the performance of Transformers on certain\nsequence modeling tasks. These modern RNNs feature a prominent design pattern:\nlinear recurrent layers interconnected by feedforward paths with multiplicative\ngating. Here, we show how RNNs equipped with these two design elements can\nexactly implement (linear) self-attention, the main building block of\nTransformers. By reverse-engineering a set of trained RNNs, we find that\ngradient descent in practice discovers our construction. In particular, we\nexamine RNNs trained to solve simple in-context learning tasks on which\nTransformers are known to excel and find that gradient descent instills in our\nRNNs the same attention-based in-context learning algorithm used by\nTransformers. Our findings highlight the importance of multiplicative\ninteractions in neural networks and suggest that certain RNNs might be\nunexpectedly implementing attention under the hood.\n","authors":["Nicolas Zucchet","Seijin Kobayashi","Yassir Akram","Johannes von Oswald","Maxime Larcher","Angelika Steger","João Sacramento"],"pdf_url":"https://arxiv.org/pdf/2309.01775v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.01516v1","updated":"2023-09-04T10:48:29Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapater: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As the size of Large Multi-Modal Models (LMMs) increases consistently, the\nadaptation of these pre-trained models to specialized tasks has become a\ncomputationally and memory-intensive challenge. Traditional fine-tuning methods\nrequire isolated, exhaustive retuning for each new task, limiting the models'\nversatility. Moreover, current efficient adaptation techniques often overlook\nmodality alignment, focusing only on the knowledge extraction of new tasks. To\ntackle these issues, we introduce Multiway-Adapter, an innovative framework\nincorporating an 'Alignment Enhancer' to deepen modality alignment, enabling\nhigh transferability without tuning pre-trained parameters. Our method adds\nfewer than 1.25\\% of additional parameters to LMMs, exemplified by the BEiT-3\nmodel in our study. This leads to superior zero-shot image-text retrieval\nperformance compared to fully fine-tuned models, while achieving up to a 57\\%\nreduction in fine-tuning time. Our approach offers a resource-efficient and\neffective adaptation pathway for LMMs, broadening their applicability. 
The\nsource code is publicly available at:\n\\url{https://github.com/longkukuhi/MultiWay-Adapter}.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16215v2","updated":"2023-09-04T09:09:31Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,\ndespite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While, both standard video\ncodes and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v2.pdf","comment":"22 pages, 26 figures, 6 tables"},{"id":"http://arxiv.org/abs/2308.03826v2","updated":"2023-09-04T06:03:58Z","published":"2023-08-07T17:49:04Z","title":"Recurrent Multi-scale Transformer for High-Resolution Salient Object\n Detection","summary":" Salient Object Detection (SOD) aims to identify and segment the most\nconspicuous objects in an image or video. As an important pre-processing step,\nit has many potential applications in multimedia and vision tasks. With the\nadvance of imaging devices, SOD with high-resolution images is of great demand,\nrecently. However, traditional SOD methods are largely limited to\nlow-resolution images, making them difficult to adapt to the development of\nHigh-Resolution SOD (HRSOD). Although some HRSOD methods emerge, there are no\nlarge enough datasets for training and evaluating. Besides, current HRSOD\nmethods generally produce incomplete object regions and irregular object\nboundaries. To address above issues, in this work, we first propose a new\nHRS10K dataset, which contains 10,500 high-quality annotated images at 2K-8K\nresolution. As far as we know, it is the largest dataset for the HRSOD task,\nwhich will significantly help future works in training and evaluating models.\nFurthermore, to improve the HRSOD performance, we propose a novel Recurrent\nMulti-scale Transformer (RMFormer), which recurrently utilizes shared\nTransformers and multi-scale refinement architectures. Thus, high-resolution\nsaliency maps can be generated with the guidance of lower-resolution\npredictions. 
Extensive experiments on both high-resolution and low-resolution\nbenchmarks show the effectiveness and superiority of the proposed framework.\nThe source code and dataset are released at:\nhttps://github.com/DrowsyMon/RMFormer.\n","authors":["Xinhao Deng","Pingping Zhang","Wei Liu","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.03826v2.pdf","comment":"This work is the camera-ready version of ACM MM2023"},{"id":"http://arxiv.org/abs/2309.01366v1","updated":"2023-09-04T05:26:28Z","published":"2023-09-04T05:26:28Z","title":"Target-Guided Composed Image Retrieval","summary":" Composed image retrieval (CIR) is a new and flexible image retrieval\nparadigm, which can retrieve the target image for a multimodal query, including\na reference image and its corresponding modification text. Although existing\nefforts have achieved compelling success, they overlook the conflict\nrelationship modeling between the reference image and the modification text for\nimproving the multimodal query composition and the adaptive matching degree\nmodeling for promoting the ranking of the candidate images that could present\ndifferent levels of matching degrees with the given query. To address these two\nlimitations, in this work, we propose a Target-Guided Composed Image Retrieval\nnetwork (TG-CIR). In particular, TG-CIR first extracts the unified global and\nlocal attribute features for the reference/target image and the modification\ntext with the contrastive language-image pre-training model (CLIP) as the\nbackbone, where an orthogonal regularization is introduced to promote the\nindependence among the attribute features. Then TG-CIR designs a target-query\nrelationship-guided multimodal query composition module, comprising a\ntarget-free student composition branch and a target-based teacher composition\nbranch, where the target-query relationship is injected into the teacher branch\nfor guiding the conflict relationship modeling of the student branch. Last,\napart from the conventional batch-based classification loss, TG-CIR\nadditionally introduces a batch-based target similarity-guided matching degree\nregularization to promote the metric learning process. Extensive experiments on\nthree benchmark datasets demonstrate the superiority of our proposed method.\n","authors":["Haokun Wen","Xian Zhang","Xuemeng Song","Yinwei Wei","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2309.01366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01339v1","updated":"2023-09-04T03:49:30Z","published":"2023-09-04T03:49:30Z","title":"UniSA: Unified Generative Framework for Sentiment Analysis","summary":" Sentiment analysis is a crucial task that aims to understand people's\nemotional states and predict emotional categories based on multimodal\ninformation. It consists of several subtasks, such as emotion recognition in\nconversation (ERC), aspect-based sentiment analysis (ABSA), and multimodal\nsentiment analysis (MSA). However, unifying all subtasks in sentiment analysis\npresents numerous challenges, including modality alignment, unified\ninput/output forms, and dataset bias. To address these challenges, we propose a\nTask-Specific Prompt method to jointly model subtasks and introduce a\nmultimodal generative framework called UniSA. Additionally, we organize the\nbenchmark datasets of main subtasks into a new Sentiment Analysis Evaluation\nbenchmark, SAEval. 
We design novel pre-training tasks and training methods to\nenable the model to learn generic sentiment knowledge among subtasks to improve\nthe model's multimodal sentiment perception ability. Our experimental results\nshow that UniSA performs comparably to the state-of-the-art on all subtasks and\ngeneralizes well to various subtasks in sentiment analysis.\n","authors":["Zaijing Li","Ting-En Lin","Yuchuan Wu","Meng Liu","Fengxiao Tang","Ming Zhao","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2309.01339v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.01327v1","updated":"2023-09-04T03:06:04Z","published":"2023-09-04T03:06:04Z","title":"Can I Trust Your Answer? Visually Grounded Video Question Answering","summary":" We study visually grounded VideoQA in response to the emerging trends of\nutilizing pretraining techniques for video-language understanding.\nSpecifically, by forcing vision-language models (VLMs) to answer questions and\nsimultaneously provide visual evidence, we seek to ascertain the extent to\nwhich the predictions of such techniques are genuinely anchored in relevant\nvideo content, versus spurious correlations from language or irrelevant visual\ncontext. Towards this, we construct NExT-GQA -- an extension of NExT-QA with\n10.5$K$ temporal grounding (or location) labels tied to the original QA pairs.\nWith NExT-GQA, we scrutinize a variety of state-of-the-art VLMs. Through\npost-hoc attention analysis, we find that these models are weak in\nsubstantiating the answers despite their strong QA performance. This exposes a\nsevere limitation of these models in making reliable predictions. As a remedy,\nwe further explore and suggest a video grounding mechanism via Gaussian mask\noptimization and cross-modal learning. Experiments with different backbones\ndemonstrate that this grounding mechanism improves both video grounding and QA.\nOur dataset and code are released. With these efforts, we aim to push towards\nthe reliability of deploying VLMs in VQA systems.\n","authors":["Junbin Xiao","Angela Yao","Yicong Li","Tat Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.01327v1.pdf","comment":"Preprint. Data and code: https://github.com/doc-doc/NExT-GQA"}]},"2023-09-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.01256v1","updated":"2023-09-03T19:45:02Z","published":"2023-09-03T19:45:02Z","title":"BDC-Adapter: Brownian Distance Covariance for Better Vision-Language\n Reasoning","summary":" Large-scale pre-trained Vision-Language Models (VLMs), such as CLIP and\nALIGN, have introduced a new paradigm for learning transferable visual\nrepresentations. Recently, there has been a surge of interest among researchers\nin developing lightweight fine-tuning techniques to adapt these models to\ndownstream visual tasks. We recognize that current state-of-the-art fine-tuning\nmethods, such as Tip-Adapter, simply consider the covariance between the query\nimage feature and features of support few-shot training samples, which only\ncaptures linear relations and potentially instigates a deceptive perception of\nindependence. To address this issue, in this work, we innovatively introduce\nBrownian Distance Covariance (BDC) to the field of vision-language reasoning.\nThe BDC metric can model all possible relations, providing a robust metric for\nmeasuring feature dependence. 
Based on this, we present a novel method called\nBDC-Adapter, which integrates BDC prototype similarity reasoning and\nmulti-modal reasoning network prediction to perform classification tasks. Our\nextensive experimental results show that the proposed BDC-Adapter can freely\nhandle non-linear relations and fully characterize independence, outperforming\nthe current state-of-the-art methods by large margins.\n","authors":["Yi Zhang","Ce Zhang","Zihan Liao","Yushun Tang","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2309.01256v1.pdf","comment":"Accepted by BMVC 2023"},{"id":"http://arxiv.org/abs/2309.01249v1","updated":"2023-09-03T19:24:34Z","published":"2023-09-03T19:24:34Z","title":"Large AI Model Empowered Multimodal Semantic Communications","summary":" Multimodal signals, including text, audio, image and video, can be integrated\ninto Semantic Communication (SC) for providing an immersive experience with low\nlatency and high quality at the semantic level. However, the multimodal SC has\nseveral challenges, including data heterogeneity, semantic ambiguity, and\nsignal fading. Recent advancements in large AI models, particularly in\nMultimodal Language Model (MLM) and Large Language Model (LLM), offer potential\nsolutions for these issues. To this end, we propose a Large AI Model-based\nMultimodal SC (LAM-MSC) framework, in which we first present the MLM-based\nMultimodal Alignment (MMA) that utilizes the MLM to enable the transformation\nbetween multimodal and unimodal data while preserving semantic consistency.\nThen, a personalized LLM-based Knowledge Base (LKB) is proposed, which allows\nusers to perform personalized semantic extraction or recovery through the LLM.\nThis effectively addresses the semantic ambiguity. Finally, we apply the\nConditional Generative adversarial networks-based channel Estimation (CGE) to\nobtain Channel State Information (CSI). This approach effectively mitigates the\nimpact of fading channels in SC. Finally, we conduct simulations that\ndemonstrate the superior performance of the LAM-MSC framework.\n","authors":["Feibo Jiang","Yubo Peng","Li Dong","Kezhi Wang","Kun Yang","Cunhua Pan","Xiaohu You"],"pdf_url":"https://arxiv.org/pdf/2309.01249v1.pdf","comment":"To be submitted for journal publication"},{"id":"http://arxiv.org/abs/2309.01245v1","updated":"2023-09-03T19:10:18Z","published":"2023-09-03T19:10:18Z","title":"Representations Matter: Embedding Modes of Large Language Models using\n Dynamic Mode Decomposition","summary":" Existing large language models (LLMs) are known for generating \"hallucinated\"\ncontent, namely a fabricated text of plausibly looking, yet unfounded, facts.\nTo identify when these hallucination scenarios occur, we examine the properties\nof the generated text in the embedding space. Specifically, we draw inspiration\nfrom the dynamic mode decomposition (DMD) tool in analyzing the pattern\nevolution of text embeddings across sentences. We empirically demonstrate how\nthe spectrum of sentence embeddings over paragraphs is constantly low-rank for\nthe generated text, unlike that of the ground-truth text. Importantly, we find\nthat evaluation cases having LLM hallucinations correspond to ground-truth\nembedding patterns with a higher number of modes being poorly approximated by\nthe few modes associated with LLM embedding patterns. 
In analogy to near-field\nelectromagnetic evanescent waves, the embedding DMD eigenmodes of the generated\ntext with hallucinations vanishes quickly across sentences as opposed to those\nof the ground-truth text. This suggests that the hallucinations result from\nboth the generation techniques and the underlying representation.\n","authors":["Mohamed Akrout"],"pdf_url":"https://arxiv.org/pdf/2309.01245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01219v1","updated":"2023-09-03T16:56:48Z","published":"2023-09-03T16:56:48Z","title":"Siren's Song in the AI Ocean: A Survey on Hallucination in Large\n Language Models","summary":" While large language models (LLMs) have demonstrated remarkable capabilities\nacross a range of downstream tasks, a significant concern revolves around their\npropensity to exhibit hallucinations: LLMs occasionally generate content that\ndiverges from the user input, contradicts previously generated context, or\nmisaligns with established world knowledge. This phenomenon poses a substantial\nchallenge to the reliability of LLMs in real-world scenarios. In this paper, we\nsurvey recent efforts on the detection, explanation, and mitigation of\nhallucination, with an emphasis on the unique challenges posed by LLMs. We\npresent taxonomies of the LLM hallucination phenomena and evaluation\nbenchmarks, analyze existing approaches aiming at mitigating LLM hallucination,\nand discuss potential directions for future research.\n","authors":["Yue Zhang","Yafu Li","Leyang Cui","Deng Cai","Lemao Liu","Tingchen Fu","Xinting Huang","Enbo Zhao","Yu Zhang","Yulong Chen","Longyue Wang","Anh Tuan Luu","Wei Bi","Freda Shi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2309.01219v1.pdf","comment":"work in progress; 32 pages"},{"id":"http://arxiv.org/abs/2309.01196v1","updated":"2023-09-03T15:07:24Z","published":"2023-09-03T15:07:24Z","title":"A Visual Interpretation-Based Self-Improved Classification System Using\n Virtual Adversarial Training","summary":" The successful application of large pre-trained models such as BERT in\nnatural language processing has attracted more attention from researchers.\nSince the BERT typically acts as an end-to-end black box, classification\nsystems based on it usually have difficulty in interpretation and low\nrobustness. This paper proposes a visual interpretation-based self-improving\nclassification model with a combination of virtual adversarial training (VAT)\nand BERT models to address the above problems. Specifically, a fine-tuned BERT\nmodel is used as a classifier to classify the sentiment of the text. Then, the\npredicted sentiment classification labels are used as part of the input of\nanother BERT for spam classification via a semi-supervised training manner\nusing VAT. Additionally, visualization techniques, including visualizing the\nimportance of words and normalizing the attention head matrix, are employed to\nanalyze the relevance of each component to classification accuracy. Moreover,\nbrand-new features will be found in the visual analysis, and classification\nperformance will be improved. 
Experimental results on Twitter's tweet dataset\ndemonstrate the effectiveness of the proposed model on the classification task.\nFurthermore, the ablation study results illustrate the effect of different\ncomponents of the proposed model on the classification results.\n","authors":["Shuai Jiang","Sayaka Kamei","Chen Li","Shengzhe Hou","Yasuhiko Morimoto"],"pdf_url":"https://arxiv.org/pdf/2309.01196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12671v2","updated":"2023-09-03T14:50:34Z","published":"2023-03-22T15:49:33Z","title":"Integrating Image Features with Convolutional Sequence-to-sequence\n Network for Multilingual Visual Question Answering","summary":" Visual Question Answering (VQA) is a task that requires computers to give\ncorrect answers for the input questions based on the images. This task can be\nsolved by humans with ease but is a challenge for computers. The\nVLSP2022-EVJVQA shared task carries the Visual Question Answering task in the\nmultilingual domain on a newly released dataset: UIT-EVJVQA, in which the\nquestions and answers are written in three different languages: English,\nVietnamese and Japanese. We approached the challenge as a sequence-to-sequence\nlearning task, in which we integrated hints from pre-trained state-of-the-art\nVQA models and image features with Convolutional Sequence-to-Sequence network\nto generate the desired answers. Our results obtained up to 0.3442 by F1 score\non the public test set, 0.4210 on the private test set, and placed 3rd in the\ncompetition.\n","authors":["Triet Minh Thai","Son T. Luu"],"pdf_url":"https://arxiv.org/pdf/2303.12671v2.pdf","comment":"VLSP2022-EVJVQA"},{"id":"http://arxiv.org/abs/2309.01157v1","updated":"2023-09-03T12:33:47Z","published":"2023-09-03T12:33:47Z","title":"Large Language Models for Generative Recommendation: A Survey and\n Visionary Discussions","summary":" Recent years have witnessed the wide adoption of large language models (LLM)\nin different fields, especially natural language processing and computer\nvision. Such a trend can also be observed in recommender systems (RS). However,\nmost of related work treat LLM as a component of the conventional\nrecommendation pipeline (e.g., as a feature extractor) which may not be able to\nfully leverage the generative power of LLM. Instead of separating the\nrecommendation process into multiple stages such as score computation and\nre-ranking, this process can be simplified to one stage with LLM: directly\ngenerating recommendations from the complete pool of items. This survey reviews\nthe progress, methods and future directions of LLM-based generative\nrecommendation by examining three questions: 1) What generative recommendation\nis, 2) Why RS should advance to generative recommendation, and 3) How to\nimplement LLM-based generative recommendation for various RS tasks. 
We hope\nthat the survey can provide the context and guidance needed to explore this\ninteresting and emerging topic.\n","authors":["Lei Li","Yongfeng Zhang","Dugang Liu","Li Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01131v1","updated":"2023-09-03T10:14:34Z","published":"2023-09-03T10:14:34Z","title":"Attention Where It Matters: Rethinking Visual Document Understanding\n with Selective Region Concentration","summary":" We propose a novel end-to-end document understanding model called SeRum\n(SElective Region Understanding Model) for extracting meaningful information\nfrom document images, including document analysis, retrieval, and office\nautomation.\n Unlike state-of-the-art approaches that rely on multi-stage technical schemes\nand are computationally expensive,\n SeRum converts document image understanding and recognition tasks into a\nlocal decoding process of the visual tokens of interest, using a content-aware\ntoken merge module.\n This mechanism enables the model to pay more attention to regions of interest\ngenerated by the query decoder, improving the model's effectiveness and\nspeeding up the decoding speed of the generative scheme.\n We also designed several pre-training tasks to enhance the understanding and\nlocal awareness of the model.\n Experimental results demonstrate that SeRum achieves state-of-the-art\nperformance on document understanding tasks and competitive results on text\nspotting tasks.\n SeRum represents a substantial advancement towards enabling efficient and\neffective end-to-end document understanding.\n","authors":["Haoyu Cao","Changcun Bao","Chaohu Liu","Huang Chen","Kun Yin","Hao Liu","Yinsong Liu","Deqiang Jiang","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01131v1.pdf","comment":"Accepted to ICCV 2023 main conference"},{"id":"http://arxiv.org/abs/2308.16824v2","updated":"2023-09-03T08:30:29Z","published":"2023-08-31T15:53:51Z","title":"Can Programming Languages Boost Each Other via Instruction Tuning?","summary":" When human programmers have mastered a programming language, it would be\neasier when they learn a new programming language. In this report, we focus on\nexploring whether programming languages can boost each other during the\ninstruction fine-tuning phase of code large language models. We conduct\nextensive experiments of 8 popular programming languages (Python, JavaScript,\nTypeScript, C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that\nprogramming languages can significantly improve each other. For example,\nCodeM-Python 15B trained on Python is able to increase Java by an absolute\n17.95% pass@1 on HumanEval-X. More surprisingly, we found that CodeM-HTML 7B\ntrained on the HTML corpus can improve Java by an absolute 15.24% pass@1. Our\ntraining data is released at https://github.com/NL2Code/CodeM.\n","authors":["Daoguang Zan","Ailun Yu","Bo Shen","Jiaxin Zhang","Taihong Chen","Bing Geng","Bei Chen","Jichuan Ji","Yafen Yao","Yongji Wang","Qianxiang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16824v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.01114v1","updated":"2023-09-03T08:08:15Z","published":"2023-09-03T08:08:15Z","title":"MedChatZH: a Better Medical Adviser Learns from Better Instructions","summary":" Generative large language models (LLMs) have shown great success in various\napplications, including question-answering (QA) and dialogue systems. 
However,\nin specialized domains like traditional Chinese medical QA, these models may\nperform unsatisfactorily without fine-tuning on domain-specific datasets. To\naddress this, we introduce MedChatZH, a dialogue model designed specifically\nfor traditional Chinese medical QA. Our model is pre-trained on Chinese\ntraditional medical books and fine-tuned with a carefully curated medical\ninstruction dataset. It outperforms several solid baselines on a real-world\nmedical dialogue dataset. We release our model, code, and dataset on\nhttps://github.com/tyang816/MedChatZH to facilitate further research in the\ndomain of traditional Chinese medicine and LLMs.\n","authors":["Yang Tan","Mingchen Li","Zijie Huang","Huiqun Yu","Guisheng Fan"],"pdf_url":"https://arxiv.org/pdf/2309.01114v1.pdf","comment":"7 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.01105v1","updated":"2023-09-03T07:03:17Z","published":"2023-09-03T07:03:17Z","title":"A Study on the Implementation of Generative AI Services Using an\n Enterprise Data-Based LLM Application Architecture","summary":" This study presents a method for implementing generative AI services by\nutilizing the Large Language Model (LLM) application architecture. With recent\nadvancements in generative AI technology, LLMs have gained prominence across\nvarious domains. In this context, the research addresses the challenge of\ninformation scarcity and proposes specific remedies by harnessing LLM\ncapabilities. The investigation delves into strategies for mitigating the issue\nof inadequate data, offering tailored solutions. The study delves into the\nefficacy of employing fine-tuning techniques and direct document integration to\nalleviate data insufficiency. A significant contribution of this work is the\ndevelopment of a Retrieval-Augmented Generation (RAG) model, which tackles the\naforementioned challenges. The RAG model is carefully designed to enhance\ninformation storage and retrieval processes, ensuring improved content\ngeneration. The research elucidates the key phases of the information storage\nand retrieval methodology underpinned by the RAG model. A comprehensive\nanalysis of these steps is undertaken, emphasizing their significance in\naddressing the scarcity of data. The study highlights the efficacy of the\nproposed method, showcasing its applicability through illustrative instances.\nBy implementing the RAG model for information storage and retrieval, the\nresearch not only contributes to a deeper comprehension of generative AI\ntechnology but also facilitates its practical usability within enterprises\nutilizing LLMs. This work holds substantial value in advancing the field of\ngenerative AI, offering insights into enhancing data-driven content generation\nand fostering active utilization of LLM-based services within corporate\nsettings.\n","authors":["Cheonsu Jeong"],"pdf_url":"https://arxiv.org/pdf/2309.01105v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06875v3","updated":"2023-09-03T06:55:28Z","published":"2023-04-14T00:45:01Z","title":"Research without Re-search: Maximal Update Parametrization Yields\n Accurate Loss Prediction across Scales","summary":" As language models scale up, it becomes increasingly expensive to verify\nresearch ideas because conclusions on small models do not trivially transfer to\nlarge ones. A possible solution is to establish a generic system that directly\npredicts some metrics for large models solely based on the results and\nhyperparameters from small models. 
Existing methods based on scaling laws\nrequire hyperparameter search on the largest models, which is impractical with\nlimited resources. We address this issue by presenting our discoveries\nindicating that Maximal Update parametrization (Mup) enables accurate fitting\nof scaling laws for hyperparameters close to common loss basins, without any\nsearch. Thus, different models can be directly compared on large scales with\nloss prediction even before the training starts. We propose a new paradigm as a\nfirst step towards reliable academic research for any model scale without heavy\ncomputation. Code is publicly available at\nhttps://github.com/cofe-ai/Mu-scaling.\n","authors":["Yiqun Yao","Yequan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.06875v3.pdf","comment":"Code is publicly available at https://github.com/cofe-ai/Mu-scaling"},{"id":"http://arxiv.org/abs/2309.01071v1","updated":"2023-09-03T04:19:02Z","published":"2023-09-03T04:19:02Z","title":"Business Process Text Sketch Automation Generation Using Large Language\n Model","summary":" Business Process Management (BPM) is gaining increasing attention as it has\nthe potential to cut costs while boosting output and quality. Business process\ndocument generation is a crucial stage in BPM. However, due to a shortage of\ndatasets, data-driven deep learning techniques struggle to deliver the expected\nresults. We propose an approach to transform Conditional Process Trees (CPTs)\ninto Business Process Text Sketches (BPTSs) using Large Language Models (LLMs).\nThe traditional prompting approach (Few-shot In-Context Learning) tries to get\nthe correct answer in one go, and it can find the pattern of transforming\nsimple CPTs into BPTSs, but for close-domain and CPTs with complex hierarchy,\nthe traditional prompts perform weakly and with low correctness. We suggest\nusing this technique to break down a difficult CPT into a number of basic CPTs\nand then solve each one in turn, drawing inspiration from the\ndivide-and-conquer strategy. We chose 100 process trees with depths ranging\nfrom 2 to 5 at random, as well as CPTs with many nodes, many degrees of\nselection, and cyclic nesting. Experiments show that our method can achieve a\ncorrect rate of 93.42%, which is 45.17% better than traditional prompting\nmethods. Our proposed method provides a solution for business process document\ngeneration in the absence of datasets, and secondly, it becomes potentially\npossible to provide a large number of datasets for the process model extraction\n(PME) domain.\n","authors":["Rui Zhu","Quanzhou Hu","Wenxin Li","Honghao Xiao","Chaogang Wang","Zixin Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.01071v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2212.10313v2","updated":"2023-09-03T03:46:05Z","published":"2022-12-20T15:02:38Z","title":"Beyond Triplet: Leveraging the Most Data for Multimodal Machine\n Translation","summary":" Multimodal machine translation (MMT) aims to improve translation quality by\nincorporating information from other modalities, such as vision. Previous MMT\nsystems mainly focus on better access and use of visual information and tend to\nvalidate their methods on image-related datasets. These studies face two\nchallenges. First, they can only utilize triple data (bilingual texts with\nimages), which is scarce; second, current benchmarks are relatively restricted\nand do not correspond to realistic scenarios. Therefore, this paper\ncorrespondingly establishes new methods and new datasets for MMT. 
First, we\npropose a framework 2/3-Triplet with two new approaches to enhance MMT by\nutilizing large-scale non-triple data: monolingual image-text data and parallel\ntext-only data. Second, we construct an English-Chinese {e}-commercial\n{m}ulti{m}odal {t}ranslation dataset (including training and testing), named\nEMMT, where its test set is carefully selected as some words are ambiguous and\nshall be translated mistakenly without the help of images. Experiments show\nthat our method is more suitable for real-world scenarios and can significantly\nimprove translation performance by using more non-triple data. In addition, our\nmodel also rivals various SOTA models in conventional multimodal translation\nbenchmarks.\n","authors":["Yaoming Zhu","Zewei Sun","Shanbo Cheng","Luyang Huang","Liwei Wu","Mingxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2212.10313v2.pdf","comment":"8 pages, ACL 2023 Finding"},{"id":"http://arxiv.org/abs/2307.07362v2","updated":"2023-09-03T01:59:15Z","published":"2023-07-14T14:08:54Z","title":"A scoping review on multimodal deep learning in biomedical images and\n texts","summary":" Computer-assisted diagnostic and prognostic systems of the future should be\ncapable of simultaneously processing multimodal data. Multimodal deep learning\n(MDL), which involves the integration of multiple sources of data, such as\nimages and text, has the potential to revolutionize the analysis and\ninterpretation of biomedical data. However, it only caught researchers'\nattention recently. To this end, there is a critical need to conduct a\nsystematic review on this topic, identify the limitations of current work, and\nexplore future directions. In this scoping review, we aim to provide a\ncomprehensive overview of the current state of the field and identify key\nconcepts, types of studies, and research gaps with a focus on biomedical images\nand texts joint learning, mainly because these two were the most commonly\navailable data types in MDL research. This study reviewed the current uses of\nmultimodal deep learning on five tasks: (1) Report generation, (2) Visual\nquestion answering, (3) Cross-modal retrieval, (4) Computer-aided diagnosis,\nand (5) Semantic segmentation. Our results highlight the diverse applications\nand potential of MDL and suggest directions for future research in the field.\nWe hope our review will facilitate the collaboration of natural language\nprocessing (NLP) and medical imaging communities and support the next\ngeneration of decision-making and computer-assisted diagnostic system\ndevelopment.\n","authors":["Zhaoyi Sun","Mingquan Lin","Qingqing Zhu","Qianqian Xie","Fei Wang","Zhiyong Lu","Yifan Peng"],"pdf_url":"https://arxiv.org/pdf/2307.07362v2.pdf","comment":"This paper has been accepted by the Journal of Biomedical Informatics"},{"id":"http://arxiv.org/abs/2308.06912v2","updated":"2023-09-03T00:31:07Z","published":"2023-08-14T03:14:38Z","title":"CausalLM is not optimal for in-context learning","summary":" Recent empirical evidence indicates that transformer based in-context\nlearning performs better when using a prefix language model (prefixLM), in\nwhich in-context samples can all attend to each other, compared to causal\nlanguage models (causalLM), which use auto-regressive attention that prohibits\nin-context samples to attend to future samples. While this result is intuitive,\nit is not understood from a theoretical perspective. 
In this paper we take a\ntheoretical approach and analyze the convergence behavior of prefixLM and\ncausalLM under a certain parameter construction. Our analysis shows that both\nLM types converge to their stationary points at a linear rate, but that while\nprefixLM converges to the optimal solution of linear regression, causalLM\nconvergence dynamics follows that of an online gradient descent algorithm,\nwhich is not guaranteed to be optimal even as the number of samples grows\ninfinitely. We supplement our theoretical claims with empirical experiments\nover synthetic and real tasks and using various types of transformers. Our\nexperiments verify that causalLM consistently underperforms prefixLM in all\nsettings.\n","authors":["Nan Ding","Tomer Levinboim","Jialin Wu","Sebastian Goodman","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2308.06912v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.01188v1","updated":"2023-09-03T14:18:31Z","published":"2023-09-03T14:18:31Z","title":"Pre-trained Neural Recommenders: A Transferable Zero-Shot Framework for\n Recommendation Systems","summary":" Modern neural collaborative filtering techniques are critical to the success\nof e-commerce, social media, and content-sharing platforms. However, despite\ntechnical advances -- for every new application domain, we need to train an NCF\nmodel from scratch. In contrast, pre-trained vision and language models are\nroutinely applied to diverse applications directly (zero-shot) or with limited\nfine-tuning. Inspired by the impact of pre-trained models, we explore the\npossibility of pre-trained recommender models that support building recommender\nsystems in new domains, with minimal or no retraining, without the use of any\nauxiliary user or item information. Zero-shot recommendation without auxiliary\ninformation is challenging because we cannot form associations between users\nand items across datasets when there are no overlapping users or items. Our\nfundamental insight is that the statistical characteristics of the user-item\ninteraction matrix are universally available across different domains and\ndatasets. Thus, we use the statistical characteristics of the user-item\ninteraction matrix to identify dataset-independent representations for users\nand items. We show how to learn universal (i.e., supporting zero-shot\nadaptation without user or item auxiliary information) representations for\nnodes and edges from the bipartite user-item interaction graph. We learn\nrepresentations by exploiting the statistical properties of the interaction\ndata, including user and item marginals, and the size and density distributions\nof their clusters.\n","authors":["Junting Wang","Adit Krishnan","Hari Sundaram","Yunzhe Li"],"pdf_url":"https://arxiv.org/pdf/2309.01188v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01157v1","updated":"2023-09-03T12:33:47Z","published":"2023-09-03T12:33:47Z","title":"Large Language Models for Generative Recommendation: A Survey and\n Visionary Discussions","summary":" Recent years have witnessed the wide adoption of large language models (LLM)\nin different fields, especially natural language processing and computer\nvision. Such a trend can also be observed in recommender systems (RS). However,\nmost of related work treat LLM as a component of the conventional\nrecommendation pipeline (e.g., as a feature extractor) which may not be able to\nfully leverage the generative power of LLM. 
Instead of separating the\nrecommendation process into multiple stages such as score computation and\nre-ranking, this process can be simplified to one stage with LLM: directly\ngenerating recommendations from the complete pool of items. This survey reviews\nthe progress, methods and future directions of LLM-based generative\nrecommendation by examining three questions: 1) What generative recommendation\nis, 2) Why RS should advance to generative recommendation, and 3) How to\nimplement LLM-based generative recommendation for various RS tasks. We hope\nthat the survey can provide the context and guidance needed to explore this\ninteresting and emerging topic.\n","authors":["Lei Li","Yongfeng Zhang","Dugang Liu","Li Chen"],"pdf_url":"https://arxiv.org/pdf/2309.01157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01103v1","updated":"2023-09-03T06:56:45Z","published":"2023-09-03T06:56:45Z","title":"Multi-Relational Contrastive Learning for Recommendation","summary":" Personalized recommender systems play a crucial role in capturing users'\nevolving preferences over time to provide accurate and effective\nrecommendations on various online platforms. However, many recommendation\nmodels rely on a single type of behavior learning, which limits their ability\nto represent the complex relationships between users and items in real-life\nscenarios. In such situations, users interact with items in multiple ways,\nincluding clicking, tagging as favorite, reviewing, and purchasing. To address\nthis issue, we propose the Relation-aware Contrastive Learning (RCL) framework,\nwhich effectively models dynamic interaction heterogeneity. The RCL model\nincorporates a multi-relational graph encoder that captures short-term\npreference heterogeneity while preserving the dedicated relation semantics for\ndifferent types of user-item interactions. Moreover, we design a dynamic\ncross-relational memory network that enables the RCL model to capture users'\nlong-term multi-behavior preferences and the underlying evolving cross-type\nbehavior dependencies over time. To obtain robust and informative user\nrepresentations with both commonality and diversity across multi-behavior\ninteractions, we introduce a multi-relational contrastive learning paradigm\nwith heterogeneous short- and long-term interest modeling. Our extensive\nexperimental studies on several real-world datasets demonstrate the superiority\nof the RCL recommender system over various state-of-the-art baselines in terms\nof recommendation accuracy and effectiveness.\n","authors":["Wei Wei","Lianghao Xia","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2309.01103v1.pdf","comment":"This paper has been published as a full paper at RecSys 2023"},{"id":"http://arxiv.org/abs/2210.14309v3","updated":"2023-09-03T06:03:01Z","published":"2022-10-25T20:11:49Z","title":"Empowering Long-tail Item Recommendation through Cross Decoupling\n Network (CDN)","summary":" Industry recommender systems usually suffer from highly-skewed long-tail item\ndistributions where a small fraction of the items receives most of the user\nfeedback. This skew hurts recommender quality especially for the item slices\nwithout much user feedback. While there have been many research advances made\nin academia, deploying these methods in production is very difficult and very\nfew improvements have been made in industry. One challenge is that these\nmethods often hurt overall performance; additionally, they could be complex and\nexpensive to train and serve. 
In this work, we aim to improve tail item\nrecommendations while maintaining the overall performance with less training\nand serving cost. We first find that the predictions of user preferences are\nbiased under long-tail distributions. The bias comes from the differences\nbetween training and serving data in two perspectives: 1) the item\ndistributions, and 2) user's preference given an item. Most existing methods\nmainly attempt to reduce the bias from the item distribution perspective,\nignoring the discrepancy from user preference given an item. This leads to a\nsevere forgetting issue and results in sub-optimal performance.\n To address the problem, we design a novel Cross Decoupling Network (CDN) (i)\ndecouples the learning process of memorization and generalization on the item\nside through a mixture-of-expert architecture; (ii) decouples the user samples\nfrom different distributions through a regularized bilateral branch network.\nFinally, a new adapter is introduced to aggregate the decoupled vectors, and\nsoftly shift the training attention to tail items. Extensive experimental\nresults show that CDN significantly outperforms state-of-the-art approaches on\nbenchmark datasets. We also demonstrate its effectiveness by a case study of\nCDN in a large-scale recommendation system at Google.\n","authors":["Yin Zhang","Ruoxi Wang","Tiansheng Yao","Xinyang Yi","Lichan Hong","James Caverlee","Ed H. Chi","Derek Zhiyuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2210.14309v3.pdf","comment":"Accepted by KDD 2023 Applied Data Science (ADS) track"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.01202v1","updated":"2023-09-03T15:21:47Z","published":"2023-09-03T15:21:47Z","title":"MAGMA: Music Aligned Generative Motion Autodecoder","summary":" Mapping music to dance is a challenging problem that requires spatial and\ntemporal coherence along with a continual synchronization with the music's\nprogression. Taking inspiration from large language models, we introduce a\n2-step approach for generating dance using a Vector Quantized-Variational\nAutoencoder (VQ-VAE) to distill motion into primitives and train a Transformer\ndecoder to learn the correct sequencing of these primitives. We also evaluate\nthe importance of music representations by comparing naive music feature\nextraction using Librosa to deep audio representations generated by\nstate-of-the-art audio compression algorithms. Additionally, we train\nvariations of the motion generator using relative and absolute positional\nencodings to determine the effect on generated motion quality when generating\narbitrarily long sequence lengths. Our proposed approach achieve\nstate-of-the-art results in music-to-motion generation benchmarks and enables\nthe real-time generation of considerably longer motion sequences, the ability\nto chain multiple motion sequences seamlessly, and easy customization of motion\nsequences to meet style requirements.\n","authors":["Sohan Anisetty","Amit Raj","James Hays"],"pdf_url":"https://arxiv.org/pdf/2309.01202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12573v5","updated":"2023-09-03T07:16:17Z","published":"2022-09-26T10:38:39Z","title":"Faked Speech Detection with Zero Knowledge","summary":" Audio is one of the most used ways of human communication, but at the same\ntime it can be easily misused to trick people. With the revolution of AI, the\nrelated technologies are now accessible to almost everyone thus making it\nsimple for the criminals to commit crimes and forgeries. 
In this work, we\nintroduce a neural network method to develop a classifier that will blindly\nclassify an input audio as real or mimicked; the word 'blindly' refers to the\nability to detect mimicked audio without references or real sources. The\nproposed model was trained on a set of important features extracted from a\nlarge dataset of audios to get a classifier that was tested on the same set of\nfeatures from different audios. The data was extracted from two raw datasets,\nespecially composed for this work; an all English dataset and a mixed dataset\n(Arabic plus English). These datasets have been made available, in raw form,\nthrough GitHub for the use of the research community at\nhttps://github.com/SaSs7/Dataset. For the purpose of comparison, the audios\nwere also classified through human inspection with the subjects being the\nnative speakers. The ensued results were interesting and exhibited formidable\naccuracy.\n","authors":["Sahar Al Ajmi","Khizar Hayat","Alaa M. Al Obaidi","Naresh Kumar","Munaf Najmuldeen","Baptiste Magnier"],"pdf_url":"https://arxiv.org/pdf/2209.12573v5.pdf","comment":"14 pages, 4 figures (6 if you count subfigures), 2 tables"},{"id":"http://arxiv.org/abs/2309.01104v1","updated":"2023-09-03T07:01:34Z","published":"2023-09-03T07:01:34Z","title":"Turn Fake into Real: Adversarial Head Turn Attacks Against Deepfake\n Detection","summary":" Malicious use of deepfakes leads to serious public concerns and reduces\npeople's trust in digital media. Although effective deepfake detectors have\nbeen proposed, they are substantially vulnerable to adversarial attacks. To\nevaluate the detector's robustness, recent studies have explored various\nattacks. However, all existing attacks are limited to 2D image perturbations,\nwhich are hard to translate into real-world facial changes. In this paper, we\npropose adversarial head turn (AdvHeat), the first attempt at 3D adversarial\nface views against deepfake detectors, based on face view synthesis from a\nsingle-view fake image. Extensive experiments validate the vulnerability of\nvarious detectors to AdvHeat in realistic, black-box scenarios. For example,\nAdvHeat based on a simple random search yields a high attack success rate of\n96.8% with 360 searching steps. When additional query access is allowed, we can\nfurther reduce the step budget to 50. Additional analyses demonstrate that\nAdvHeat is better than conventional attacks on both the cross-detector\ntransferability and robustness to defenses. The adversarial images generated by\nAdvHeat are also shown to have natural looks. Our code, including that for\ngenerating a multi-view dataset consisting of 360 synthetic views for each of\n1000 IDs from FaceForensics++, is available at\nhttps://github.com/twowwj/AdvHeaT.\n","authors":["Weijie Wang","Zhengyu Zhao","Nicu Sebe","Bruno Lepri"],"pdf_url":"https://arxiv.org/pdf/2309.01104v1.pdf","comment":null}]},"2023-09-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2207.10802v3","updated":"2023-09-02T22:33:09Z","published":"2022-07-14T05:03:56Z","title":"Combing for Credentials: Active Pattern Extraction from Smart Reply","summary":" Pre-trained large language models, such as GPT\\nobreakdash-2 and BERT, are\noften fine-tuned to achieve state-of-the-art performance on a downstream task.\nOne natural example is the ``Smart Reply'' application where a pre-trained\nmodel is tuned to provide suggested responses for a given query message. 
Since\nthe tuning data is often sensitive data such as emails or chat transcripts, it\nis important to understand and mitigate the risk that the model leaks its\ntuning data. We investigate potential information leakage vulnerabilities in a\ntypical Smart Reply pipeline. We consider a realistic setting where the\nadversary can only interact with the underlying model through a front-end\ninterface that constrains what types of queries can be sent to the model.\nPrevious attacks do not work in these settings, but require the ability to send\nunconstrained queries directly to the model. Even when there are no constraints\non the queries, previous attacks typically require thousands, or even millions,\nof queries to extract useful information, while our attacks can extract\nsensitive data in just a handful of queries. We introduce a new type of active\nextraction attack that exploits canonical patterns in text containing sensitive\ndata. We show experimentally that it is possible for an adversary to extract\nsensitive user information present in the training data, even in realistic\nsettings where all interactions with the model must go through a front-end that\nlimits the types of queries. We explore potential mitigation strategies and\ndemonstrate empirically how differential privacy appears to be a reasonably\neffective defense mechanism to such pattern extraction attacks.\n","authors":["Bargav Jayaraman","Esha Ghosh","Melissa Chase","Sambuddha Roy","Wei Dai","David Evans"],"pdf_url":"https://arxiv.org/pdf/2207.10802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01029v1","updated":"2023-09-02T22:14:26Z","published":"2023-09-02T22:14:26Z","title":"Explainability for Large Language Models: A Survey","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\nnatural language processing. However, their internal mechanisms are still\nunclear and this lack of transparency poses unwanted risks for downstream\napplications. Therefore, understanding and explaining these models is crucial\nfor elucidating their behaviors, limitations, and social impacts. In this\npaper, we introduce a taxonomy of explainability techniques and provide a\nstructured overview of methods for explaining Transformer-based language\nmodels. We categorize techniques based on the training paradigms of LLMs:\ntraditional fine-tuning-based paradigm and prompting-based paradigm. For each\nparadigm, we summarize the goals and dominant approaches for generating local\nexplanations of individual predictions and global explanations of overall model\nknowledge. We also discuss metrics for evaluating generated explanations, and\ndiscuss how explanations can be leveraged to debug models and improve\nperformance. Lastly, we examine key challenges and emerging opportunities for\nexplanation techniques in the era of LLMs in comparison to conventional machine\nlearning models.\n","authors":["Haiyan Zhao","Hanjie Chen","Fan Yang","Ninghao Liu","Huiqi Deng","Hengyi Cai","Shuaiqiang Wang","Dawei Yin","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2309.01029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01026v1","updated":"2023-09-02T21:29:53Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. 
We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08142v3","updated":"2023-09-02T20:35:23Z","published":"2022-10-08T22:33:39Z","title":"Semantic Representations of Mathematical Expressions in a Continuous\n Vector Space","summary":" Mathematical notation makes up a large portion of STEM literature, yet\nfinding semantic representations for formulae remains a challenging problem.\nBecause mathematical notation is precise, and its meaning changes significantly\nwith small character shifts, the methods that work for natural text do not\nnecessarily work well for mathematical expressions. This work describes an\napproach for representing mathematical expressions in a continuous vector\nspace. We use the encoder of a sequence-to-sequence architecture, trained on\nvisually different but mathematically equivalent expressions, to generate\nvector representations (or embeddings). We compare this approach with a\nstructural approach that considers visual layout to embed an expression and\nshow that our proposed approach is better at capturing mathematical semantics.\nFinally, to expedite future research, we publish a corpus of equivalent\ntranscendental and algebraic expression pairs.\n","authors":["Neeraj Gangwar","Nickvash Kani"],"pdf_url":"https://arxiv.org/pdf/2211.08142v3.pdf","comment":"Transactions on Machine Learning Research (TMLR), September 2023"},{"id":"http://arxiv.org/abs/2309.00986v1","updated":"2023-09-02T16:50:30Z","published":"2023-09-02T16:50:30Z","title":"ModelScope-Agent: Building Your Customizable Agent System with\n Open-source Large Language Models","summary":" Large language models (LLMs) have recently demonstrated remarkable\ncapabilities to comprehend human intentions, engage in reasoning, and design\nplanning-like behavior. To further unleash the power of LLMs to accomplish\ncomplex tasks, there is a growing trend to build agent framework that equips\nLLMs, such as ChatGPT, with tool-use abilities to connect with massive external\nAPIs. In this work, we introduce ModelScope-Agent, a general and customizable\nagent framework for real-world applications, based on open-source LLMs as\ncontrollers. It provides a user-friendly system library, with customizable\nengine design to support model training on multiple open-source LLMs, while\nalso enabling seamless integration with both model APIs and common APIs in a\nunified way. To equip the LLMs with tool-use abilities, a comprehensive\nframework has been proposed spanning over tool-use data collection, tool\nretrieval, tool registration, memory control, customized model training, and\nevaluation for practical real-world applications. Finally, we showcase\nModelScopeGPT, a real-world intelligent assistant of ModelScope Community based\non the ModelScope-Agent framework, which is able to connect open-source LLMs\nwith more than 1000 public AI models and localized community knowledge in\nModelScope. 
The ModelScope-Agent\nlibrary\\footnote{https://github.com/modelscope/modelscope-agent} and online\ndemo\\footnote{https://modelscope.cn/studios/damo/ModelScopeGPT/summary} are now\npublicly available.\n","authors":["Chenliang Li","Hehong Chen","Ming Yan","Weizhou Shen","Haiyang Xu","Zhikai Wu","Zhicheng Zhang","Wenmeng Zhou","Yingda Chen","Chen Cheng","Hongzhu Shi","Ji Zhang","Fei Huang","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.00986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00952v1","updated":"2023-09-02T14:30:56Z","published":"2023-09-02T14:30:56Z","title":"Bridge Diffusion Model: bridge non-English language-native text-to-image\n diffusion model with English communities","summary":" Text-to-Image generation (TTI) technologies are advancing rapidly, especially\nin the English language communities. However, English-native TTI models\ninherently carry biases from English world centric training data, which creates\na dilemma for development of other language-native TTI models. One common\nchoice is fine-tuning the English-native TTI model with translated samples from\nnon-English communities. It falls short of fully addressing the model bias\nproblem. Alternatively, training non-English language native models from\nscratch can effectively resolve the English world bias, but diverges from the\nEnglish TTI communities, thus not able to utilize the strides continuously\ngaining in the English TTI communities any more. To build non-English language\nnative TTI model meanwhile keep compatability with the English TTI communities,\nwe propose a novel model structure referred as \"Bridge Diffusion Model\" (BDM).\nThe proposed BDM employs a backbone-branch network structure to learn the\nnon-English language semantics while keep the latent space compatible with the\nEnglish-native TTI backbone, in an end-to-end manner. The unique advantages of\nthe proposed BDM are that it's not only adept at generating images that\nprecisely depict non-English language semantics, but also compatible with\nvarious English-native TTI plugins, such as different checkpoints, LoRA,\nControlNet, Dreambooth, and Textual Inversion, etc. Moreover, BDM can\nconcurrently generate content seamlessly combining both non-English native and\nEnglish-native semantics within a single image, fostering cultural interaction.\nWe verify our method by applying BDM to build a Chinese-native TTI model,\nwhereas the method is generic and applicable to any other language.\n","authors":["Shanyuan Liu","Dawei Leng","Yuhui Yin"],"pdf_url":"https://arxiv.org/pdf/2309.00952v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00949v1","updated":"2023-09-02T14:21:22Z","published":"2023-09-02T14:21:22Z","title":"Multilingual Text Representation","summary":" Modern NLP breakthrough includes large multilingual models capable of\nperforming tasks across more than 100 languages. State-of-the-art language\nmodels came a long way, starting from the simple one-hot representation of\nwords capable of performing tasks like natural language understanding,\ncommon-sense reasoning, or question-answering, thus capturing both the syntax\nand semantics of texts. At the same time, language models are expanding beyond\nour known language boundary, even competitively performing over very\nlow-resource dialects of endangered languages. However, there are still\nproblems to solve to ensure an equitable representation of texts through a\nunified modeling space across language and speakers. 
In this survey, we shed\nlight on this iterative progression of multilingual text representation and\ndiscuss the driving factors that ultimately led to the current\nstate-of-the-art. Subsequently, we discuss how the full potential of language\ndemocratization could be obtained, reaching beyond the known limits and what is\nthe scope of improvement in that space.\n","authors":["Fahim Faisal"],"pdf_url":"https://arxiv.org/pdf/2309.00949v1.pdf","comment":"PhD Comprehensive exam report"},{"id":"http://arxiv.org/abs/2307.11772v2","updated":"2023-09-02T14:18:40Z","published":"2023-07-18T04:43:24Z","title":"AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment\n enabled by Large Language Models","summary":" The task of entity alignment between knowledge graphs (KGs) aims to identify\nevery pair of entities from two different KGs that represent the same entity.\nMany machine learning-based methods have been proposed for this task. However,\nto our best knowledge, existing methods all require manually crafted seed\nalignments, which are expensive to obtain. In this paper, we propose the first\nfully automatic alignment method named AutoAlign, which does not require any\nmanually crafted seed alignments. Specifically, for predicate embeddings,\nAutoAlign constructs a predicate-proximity-graph with the help of large\nlanguage models to automatically capture the similarity between predicates\nacross two KGs. For entity embeddings, AutoAlign first computes the entity\nembeddings of each KG independently using TransE, and then shifts the two KGs'\nentity embeddings into the same vector space by computing the similarity\nbetween entities based on their attributes. Thus, both predicate alignment and\nentity alignment can be done without manually crafted seed alignments.\nAutoAlign is not only fully automatic, but also highly effective. Experiments\nusing real-world KGs show that AutoAlign improves the performance of entity\nalignment significantly compared to state-of-the-art methods.\n","authors":["Rui Zhang","Yixin Su","Bayu Distiawan Trisedya","Xiaoyan Zhao","Min Yang","Hong Cheng","Jianzhong Qi"],"pdf_url":"https://arxiv.org/pdf/2307.11772v2.pdf","comment":"14 pages, 5 figures, 4 tables. arXiv admin note: substantial text\n overlap with arXiv:2210.08540"},{"id":"http://arxiv.org/abs/2309.00917v1","updated":"2023-09-02T11:46:41Z","published":"2023-09-02T11:46:41Z","title":"Knowledge Graph Embeddings for Multi-Lingual Structured Representations\n of Radiology Reports","summary":" The way we analyse clinical texts has undergone major changes over the last\nyears. The introduction of language models such as BERT led to adaptations for\nthe (bio)medical domain like PubMedBERT and ClinicalBERT. These models rely on\nlarge databases of archived medical documents. While performing well in terms\nof accuracy, both the lack of interpretability and limitations to transfer\nacross languages limit their use in clinical setting. We introduce a novel\nlight-weight graph-based embedding method specifically catering radiology\nreports. It takes into account the structure and composition of the report,\nwhile also connecting medical terms in the report through the multi-lingual\nSNOMED Clinical Terms knowledge base. The resulting graph embedding uncovers\nthe underlying relationships among clinical terms, achieving a representation\nthat is better understandable for clinicians and clinically more accurate,\nwithout reliance on large pre-training datasets. 
We show the use of this\nembedding on two tasks namely disease classification of X-ray reports and image\nclassification. For disease classification our model is competitive with its\nBERT-based counterparts, while being magnitudes smaller in size and training\ndata requirements. For image classification, we show the effectiveness of the\ngraph embedding leveraging cross-modal knowledge transfer and show how this\nmethod is usable across different languages.\n","authors":["Tom van Sonsbeek","Xiantong Zhen","Marcel Warring"],"pdf_url":"https://arxiv.org/pdf/2309.00917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00916v1","updated":"2023-09-02T11:46:05Z","published":"2023-09-02T11:46:05Z","title":"BLSP: Bootstrapping Language-Speech Pre-training via Behavior Alignment\n of Continuation Writing","summary":" The emergence of large language models (LLMs) has sparked significant\ninterest in extending their remarkable language capabilities to speech.\nHowever, modality alignment between speech and text still remains an open\nproblem. Current solutions can be categorized into two strategies. One is a\ncascaded approach where outputs (tokens or states) of a separately trained\nspeech recognition system are used as inputs for LLMs, which limits their\npotential in modeling alignment between speech and text. The other is an\nend-to-end approach that relies on speech instruction data, which is very\ndifficult to collect in large quantities. In this paper, we address these\nissues and propose the BLSP approach that Bootstraps Language-Speech\nPre-training via behavior alignment of continuation writing. We achieve this by\nlearning a lightweight modality adapter between a frozen speech encoder and an\nLLM, ensuring that the LLM exhibits the same generation behavior regardless of\nthe modality of input: a speech segment or its transcript. The training process\ncan be divided into two steps. The first step prompts an LLM to generate texts\nwith speech transcripts as prefixes, obtaining text continuations. In the\nsecond step, these continuations are used as supervised signals to train the\nmodality adapter in an end-to-end manner. We demonstrate that this\nstraightforward process can extend the capabilities of LLMs to speech, enabling\nspeech recognition, speech translation, spoken language understanding, and\nspeech conversation, even in zero-shot cross-lingual scenarios.\n","authors":["Chen Wang","Minpeng Liao","Zhongqiang Huang","Jinliang Lu","Junhong Wu","Yuchen Liu","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00857v1","updated":"2023-09-02T08:17:29Z","published":"2023-09-02T08:17:29Z","title":"Evaluating Transformer's Ability to Learn Mildly Context-Sensitive\n Languages","summary":" Despite that Transformers perform well in NLP tasks, recent studies suggest\nthat self-attention is theoretically limited in learning even some regular and\ncontext-free languages. These findings motivated us to think about their\nimplications in modeling natural language, which is hypothesized to be mildly\ncontext-sensitive. We test Transformer's ability to learn a variety of mildly\ncontext-sensitive languages of varying complexities, and find that they\ngeneralize well to unseen in-distribution data, but their ability to\nextrapolate to longer strings is worse than that of LSTMs. 
Our analyses show\nthat the learned self-attention patterns and representations modeled dependency\nrelations and demonstrated counting behavior, which may have helped the models\nsolve the languages.\n","authors":["Shunjie Wang","Shane Steinert-Threlkeld"],"pdf_url":"https://arxiv.org/pdf/2309.00857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06954v3","updated":"2023-09-02T07:12:42Z","published":"2023-07-12T20:33:30Z","title":"ACTI at EVALITA 2023: Overview of the Conspiracy Theory Identification\n Task","summary":" Conspiracy Theory Identication task is a new shared task proposed for the\nfirst time at the Evalita 2023. The ACTI challenge, based exclusively on\ncomments published on conspiratorial channels of telegram, is divided into two\nsubtasks: (i) Conspiratorial Content Classification: identifying conspiratorial\ncontent and (ii) Conspiratorial Category Classification about specific\nconspiracy theory classification. A total of fifteen teams participated in the\ntask for a total of 81 submissions. We illustrate the best performing\napproaches were based on the utilization of large language models. We finally\ndraw conclusions about the utilization of these models for counteracting the\nspreading of misinformation in online platforms.\n","authors":["Giuseppe Russo","Niklas Stoehr","Manoel Horta Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2307.06954v3.pdf","comment":"Accepted at the Evalita Workshop 2023"},{"id":"http://arxiv.org/abs/2309.00841v1","updated":"2023-09-02T06:33:18Z","published":"2023-09-02T06:33:18Z","title":"LeanContext: Cost-Efficient Domain-Specific Question Answering Using\n LLMs","summary":" Question-answering (QA) is a significant application of Large Language Models\n(LLMs), shaping chatbot capabilities across healthcare, education, and customer\nservice. However, widespread LLM integration presents a challenge for small\nbusinesses due to the high expenses of LLM API usage. Costs rise rapidly when\ndomain-specific data (context) is used alongside queries for accurate\ndomain-specific LLM responses. One option is to summarize the context by using\nLLMs and reduce the context. However, this can also filter out useful\ninformation that is necessary to answer some domain-specific queries. In this\npaper, we shift from human-oriented summarizers to AI model-friendly summaries.\nOur approach, LeanContext, efficiently extracts $k$ key sentences from the\ncontext that are closely aligned with the query. The choice of $k$ is neither\nstatic nor random; we introduce a reinforcement learning technique that\ndynamically determines $k$ based on the query and context. The rest of the less\nimportant sentences are reduced using a free open source text reduction method.\nWe evaluate LeanContext against several recent query-aware and query-unaware\ncontext reduction approaches on prominent datasets (arxiv papers and BBC news\narticles). Despite cost reductions of $37.29\\%$ to $67.81\\%$, LeanContext's\nROUGE-1 score decreases only by $1.41\\%$ to $2.65\\%$ compared to a baseline\nthat retains the entire context (no summarization). 
Additionally, if free\npretrained LLM-based summarizers are used to reduce context (into human\nconsumable summaries), LeanContext can further modify the reduced context to\nenhance the accuracy (ROUGE-1 score) by $13.22\\%$ to $24.61\\%$.\n","authors":["Md Adnan Arefeen","Biplob Debnath","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.00841v1.pdf","comment":"The paper is under review"},{"id":"http://arxiv.org/abs/2211.10443v2","updated":"2023-09-02T04:40:37Z","published":"2022-11-18T05:27:59Z","title":"Social media mining for toxicovigilance of prescription medications:\n End-to-end pipeline, challenges and future work","summary":" Substance use, substance use disorder, and overdoses related to substance use\nare major public health problems globally and in the United States. A key\naspect of addressing these problems from a public health standpoint is improved\nsurveillance. Traditional surveillance systems are laggy, and social media are\npotentially useful sources of timely data. However, mining knowledge from\nsocial media is challenging, and requires the development of advanced\nartificial intelligence, specifically natural language processing (NLP) and\nmachine learning methods. We developed a sophisticated end-to-end pipeline for\nmining information about nonmedical prescription medication use from social\nmedia, namely Twitter and Reddit. Our pipeline employs supervised machine\nlearning and NLP for filtering out noise and characterizing the chatter. In\nthis paper, we describe our end-to-end pipeline developed over four years. In\naddition to describing our data mining infrastructure, we discuss existing\nchallenges in social media mining for toxicovigilance, and possible future\nresearch directions.\n","authors":["Abeed Sarker"],"pdf_url":"https://arxiv.org/pdf/2211.10443v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11592v2","updated":"2023-09-02T04:28:42Z","published":"2023-08-19T17:32:34Z","title":"UniDoc: A Universal Large Multimodal Model for Simultaneous Text\n Detection, Recognition, Spotting and Understanding","summary":" In the era of Large Language Models (LLMs), tremendous strides have been made\nin the field of multimodal understanding. However, existing advanced algorithms\nare limited to effectively utilizing the immense representation capabilities\nand rich world knowledge inherent to these large pre-trained models, and the\nbeneficial connections among tasks within the context of text-rich scenarios\nhave not been sufficiently explored. In this work, we introduce UniDoc, a novel\nmultimodal model equipped with text detection and recognition capabilities,\nwhich are deficient in existing approaches. Moreover, UniDoc capitalizes on the\nbeneficial interactions among tasks to enhance the performance of each\nindividual task. To implement UniDoc, we perform unified multimodal instruct\ntuning on the contributed large-scale instruction following datasets.\nQuantitative and qualitative experimental results show that UniDoc sets\nstate-of-the-art scores across multiple challenging benchmarks. 
To the best of\nour knowledge, this is the first large multimodal model capable of simultaneous\ntext detection, recognition, spotting, and understanding.\n","authors":["Hao Feng","Zijian Wang","Jingqun Tang","Jinghui Lu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00789v1","updated":"2023-09-02T01:45:27Z","published":"2023-09-02T01:45:27Z","title":"LinkTransformer: A Unified Package for Record Linkage with Transformer\n Language Models","summary":" Linking information across sources is fundamental to a variety of analyses in\nsocial science, business, and government. While large language models (LLMs)\noffer enormous promise for improving record linkage in noisy datasets, in many\ndomains approximate string matching packages in popular softwares such as R and\nStata remain predominant. These packages have clean, simple interfaces and can\nbe easily extended to a diversity of languages. Our open-source package\nLinkTransformer aims to extend the familiarity and ease-of-use of popular\nstring matching methods to deep learning. It is a general purpose package for\nrecord linkage with transformer LLMs that treats record linkage as a text\nretrieval problem. At its core is an off-the-shelf toolkit for applying\ntransformer models to record linkage with four lines of code. LinkTransformer\ncontains a rich repository of pre-trained transformer semantic similarity\nmodels for multiple languages and supports easy integration of any transformer\nlanguage model from Hugging Face or OpenAI. It supports standard functionality\nsuch as blocking and linking on multiple noisy fields. LinkTransformer APIs\nalso perform other common text data processing tasks, e.g., aggregation, noisy\nde-duplication, and translation-free cross-lingual linkage. Importantly,\nLinkTransformer also contains comprehensive tools for efficient model tuning,\nto facilitate different levels of customization when off-the-shelf models do\nnot provide the required accuracy. Finally, to promote reusability,\nreproducibility, and extensibility, LinkTransformer makes it easy for users to\ncontribute their custom-trained models to its model hub. By combining\ntransformer language models with intuitive APIs that will be familiar to many\nusers of popular string matching packages, LinkTransformer aims to democratize\nthe benefits of LLMs among those who may be less familiar with deep learning\nframeworks.\n","authors":["Abhishek Arora","Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2309.00789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00779v1","updated":"2023-09-02T01:24:59Z","published":"2023-09-02T01:24:59Z","title":"Value Kaleidoscope: Engaging AI with Pluralistic Human Values, Rights,\n and Duties","summary":" Human values are crucial to human decision-making. Value pluralism is the\nview that multiple correct values may be held in tension with one another\n(e.g., when considering lying to a friend to protect their feelings, how does\none balance honesty with friendship?). As statistical learners, AI systems fit\nto averages by default, washing out these potentially irreducible value\nconflicts. 
To improve AI systems to better reflect value pluralism, the\nfirst-order challenge is to explore the extent to which AI systems can model\npluralistic human values, rights, and duties as well as their interaction.\n We introduce ValuePrism, a large-scale dataset of 218k values, rights, and\nduties connected to 31k human-written situations. ValuePrism's contextualized\nvalues are generated by GPT-4 and deemed high-quality by human annotators 91%\nof the time. We conduct a large-scale study with annotators across diverse\nsocial and demographic backgrounds to try to understand whose values are\nrepresented.\n With ValuePrism, we build Kaleido, an open, light-weight, and structured\nlanguage-based multi-task model that generates, explains, and assesses the\nrelevance and valence (i.e., support or oppose) of human values, rights, and\nduties within a specific context. Humans prefer the sets of values output by\nour system over the teacher GPT-4, finding them more accurate and with broader\ncoverage. In addition, we demonstrate that Kaleido can help explain variability\nin human decision-making by outputting contrasting values. Finally, we show\nthat Kaleido's representations transfer to other philosophical frameworks and\ndatasets, confirming the benefit of an explicit, modular, and interpretable\napproach to value pluralism. We hope that our work will serve as a step to\nmaking more explicit the implicit values behind human decision-making and to\nsteering AI systems to make decisions that are more in accordance with them.\n","authors":["Taylor Sorensen","Liwei Jiang","Jena Hwang","Sydney Levine","Valentina Pyatkin","Peter West","Nouha Dziri","Ximing Lu","Kavel Rao","Chandra Bhagavatula","Maarten Sap","John Tasioulas","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2309.00779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00770v1","updated":"2023-09-02T00:32:55Z","published":"2023-09-02T00:32:55Z","title":"Bias and Fairness in Large Language Models: A Survey","summary":" Rapid advancements of large language models (LLMs) have enabled the\nprocessing, understanding, and generation of human-like text, with increasing\nintegration into systems that touch our social sphere. Despite this success,\nthese models can learn, perpetuate, and amplify harmful social biases. In this\npaper, we present a comprehensive survey of bias evaluation and mitigation\ntechniques for LLMs. We first consolidate, formalize, and expand notions of\nsocial bias and fairness in natural language processing, defining distinct\nfacets of harm and introducing several desiderata to operationalize fairness\nfor LLMs. We then unify the literature by proposing three intuitive taxonomies,\ntwo for bias evaluation, namely metrics and datasets, and one for mitigation.\nOur first taxonomy of metrics for bias evaluation disambiguates the\nrelationship between metrics and evaluation datasets, and organizes metrics by\nthe different levels at which they operate in a model: embeddings,\nprobabilities, and generated text. Our second taxonomy of datasets for bias\nevaluation categorizes datasets by their structure as counterfactual inputs or\nprompts, and identifies the targeted harms and social groups; we also release a\nconsolidation of publicly-available datasets for improved access. 
Our third\ntaxonomy of techniques for bias mitigation classifies methods by their\nintervention during pre-processing, in-training, intra-processing, and\npost-processing, with granular subcategories that elucidate research trends.\nFinally, we identify open problems and challenges for future work. Synthesizing\na wide range of recent research, we aim to provide a clear guide of the\nexisting literature that empowers researchers and practitioners to better\nunderstand and prevent the propagation of bias in LLMs.\n","authors":["Isabel O. Gallegos","Ryan A. Rossi","Joe Barrow","Md Mehrab Tanjim","Sungchul Kim","Franck Dernoncourt","Tong Yu","Ruiyi Zhang","Nesreen K. Ahmed"],"pdf_url":"https://arxiv.org/pdf/2309.00770v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.01032v1","updated":"2023-09-02T22:34:26Z","published":"2023-09-02T22:34:26Z","title":"Hessian-aware Quantized Node Embeddings for Recommendation","summary":" Graph Neural Networks (GNNs) have achieved state-of-the-art performance in\nrecommender systems. Nevertheless, the process of searching and ranking from a\nlarge item corpus usually requires high latency, which limits the widespread\ndeployment of GNNs in industry-scale applications. To address this issue, many\nmethods compress user/item representations into the binary embedding space to\nreduce space requirements and accelerate inference. Also, they use the\nStraight-through Estimator (STE) to prevent vanishing gradients during\nback-propagation. However, the STE often causes the gradient mismatch problem,\nleading to sub-optimal results.\n In this work, we present the Hessian-aware Quantized GNN (HQ-GNN) as an\neffective solution for discrete representations of users/items that enable fast\nretrieval. HQ-GNN is composed of two components: a GNN encoder for learning\ncontinuous node embeddings and a quantized module for compressing\nfull-precision embeddings into low-bit ones. Consequently, HQ-GNN benefits from\nboth lower memory requirements and faster inference speeds compared to vanilla\nGNNs. To address the gradient mismatch problem in STE, we further consider the\nquantized errors and its second-order derivatives for better stability. The\nexperimental results on several large-scale datasets show that HQ-GNN achieves\na good balance between latency and performance.\n","authors":["Huiyuan Chen","Kaixiong Zhou","Kwei-Herng Lai","Chin-Chia Michael Yeh","Yan Zheng","Xia Hu","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2309.01032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01026v1","updated":"2023-09-02T21:29:53Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. 
We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01015v1","updated":"2023-09-02T20:38:58Z","published":"2023-09-02T20:38:58Z","title":"MPTopic: Improving topic modeling via Masked Permuted pre-training","summary":" Topic modeling is pivotal in discerning hidden semantic structures within\ntexts, thereby generating meaningful descriptive keywords. While innovative\ntechniques like BERTopic and Top2Vec have recently emerged in the forefront,\nthey manifest certain limitations. Our analysis indicates that these methods\nmight not prioritize the refinement of their clustering mechanism, potentially\ncompromising the quality of derived topic clusters. To illustrate, Top2Vec\ndesignates the centroids of clustering results to represent topics, whereas\nBERTopic harnesses C-TF-IDF for its topic extraction.In response to these\nchallenges, we introduce \"TF-RDF\" (Term Frequency - Relative Document\nFrequency), a distinctive approach to assess the relevance of terms within a\ndocument. Building on the strengths of TF-RDF, we present MPTopic, a clustering\nalgorithm intrinsically driven by the insights of TF-RDF. Through comprehensive\nevaluation, it is evident that the topic keywords identified with the synergy\nof MPTopic and TF-RDF outperform those extracted by both BERTopic and Top2Vec.\n","authors":["Xinche Zhang","Evangelos milios"],"pdf_url":"https://arxiv.org/pdf/2309.01015v1.pdf","comment":"12 pages, will submit to ECIR 2024"},{"id":"http://arxiv.org/abs/2309.00976v1","updated":"2023-09-02T16:20:41Z","published":"2023-09-02T16:20:41Z","title":"Pure Message Passing Can Estimate Common Neighbor for Link Prediction","summary":" Message Passing Neural Networks (MPNNs) have emerged as the {\\em de facto}\nstandard in graph representation learning. However, when it comes to link\nprediction, they often struggle, surpassed by simple heuristics such as Common\nNeighbor (CN). This discrepancy stems from a fundamental limitation: while\nMPNNs excel in node-level representation, they stumble with encoding the joint\nstructural features essential to link prediction, like CN. To bridge this gap,\nwe posit that, by harnessing the orthogonality of input vectors, pure\nmessage-passing can indeed capture joint structural features. Specifically, we\nstudy the proficiency of MPNNs in approximating CN heuristics. Based on our\nfindings, we introduce the Message Passing Link Predictor (MPLP), a novel link\nprediction model. MPLP taps into quasi-orthogonal vectors to estimate\nlink-level structural features, all while preserving the node-level\ncomplexities. Moreover, our approach demonstrates that leveraging\nmessage-passing to capture structural features could offset MPNNs'\nexpressiveness limitations at the expense of estimation variance. We conduct\nexperiments on benchmark datasets from various domains, where our method\nconsistently outperforms the baseline methods.\n","authors":["Kaiwen Dong","Zhichun Guo","Nitesh V. 
Chawla"],"pdf_url":"https://arxiv.org/pdf/2309.00976v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2307.11772v2","updated":"2023-09-02T14:18:40Z","published":"2023-07-18T04:43:24Z","title":"AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment\n enabled by Large Language Models","summary":" The task of entity alignment between knowledge graphs (KGs) aims to identify\nevery pair of entities from two different KGs that represent the same entity.\nMany machine learning-based methods have been proposed for this task. However,\nto our best knowledge, existing methods all require manually crafted seed\nalignments, which are expensive to obtain. In this paper, we propose the first\nfully automatic alignment method named AutoAlign, which does not require any\nmanually crafted seed alignments. Specifically, for predicate embeddings,\nAutoAlign constructs a predicate-proximity-graph with the help of large\nlanguage models to automatically capture the similarity between predicates\nacross two KGs. For entity embeddings, AutoAlign first computes the entity\nembeddings of each KG independently using TransE, and then shifts the two KGs'\nentity embeddings into the same vector space by computing the similarity\nbetween entities based on their attributes. Thus, both predicate alignment and\nentity alignment can be done without manually crafted seed alignments.\nAutoAlign is not only fully automatic, but also highly effective. Experiments\nusing real-world KGs show that AutoAlign improves the performance of entity\nalignment significantly compared to state-of-the-art methods.\n","authors":["Rui Zhang","Yixin Su","Bayu Distiawan Trisedya","Xiaoyan Zhao","Min Yang","Hong Cheng","Jianzhong Qi"],"pdf_url":"https://arxiv.org/pdf/2307.11772v2.pdf","comment":"14 pages, 5 figures, 4 tables. arXiv admin note: substantial text\n overlap with arXiv:2210.08540"},{"id":"http://arxiv.org/abs/2309.00946v1","updated":"2023-09-02T13:52:53Z","published":"2023-09-02T13:52:53Z","title":"From Specific to Generic Learned Sorted Set Dictionaries: A\n Theoretically Sound Paradigm Yelding Competitive Data Structural Boosters in\n Practice","summary":" This research concerns Learned Data Structures, a recent area that has\nemerged at the crossroad of Machine Learning and Classic Data Structures. It is\nmethodologically important and with a high practical impact. We focus on\nLearned Indexes, i.e., Learned Sorted Set Dictionaries. The proposals available\nso far are specific in the sense that they can boost, indeed impressively, the\ntime performance of Table Search Procedures with a sorted layout only, e.g.,\nBinary Search. We propose a novel paradigm that, complementing known\nspecialized ones, can produce Learned versions of any Sorted Set Dictionary,\nfor instance, Balanced Binary Search Trees or Binary Search on layouts other\nthat sorted, i.e., Eytzinger. Theoretically, based on it, we obtain several\nresults of interest, such as (a) the first Learned Optimum Binary Search\nForest, with mean access time bounded by the Entropy of the probability\ndistribution of the accesses to the Dictionary; (b) the first Learned Sorted\nSet Dictionary that, in the Dynamic Case and in an amortized analysis setting,\nmatches the same time bounds known for Classic Dictionaries. This latter under\nwidely accepted assumptions regarding the size of the Universe. 
The\nexperimental part, somewhat complex in terms of software development, clearly\nindicates the nonobvious finding that the generalization we propose can yield\neffective and competitive Learned Data Structural Booster, even with respect to\nspecific benchmark models.\n","authors":["Domenico Amato","Giosué Lo Bosco","Raffaele Giancarlo"],"pdf_url":"https://arxiv.org/pdf/2309.00946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00940v1","updated":"2023-09-02T13:35:11Z","published":"2023-09-02T13:35:11Z","title":"Content Prompting: Modeling Content Provider Dynamics to Improve User\n Welfare in Recommender Ecosystems","summary":" Users derive value from a recommender system (RS) only to the extent that it\nis able to surface content (or items) that meet their needs/preferences. While\nRSs often have a comprehensive view of user preferences across the entire user\nbase, content providers, by contrast, generally have only a local view of the\npreferences of users that have interacted with their content. This limits a\nprovider's ability to offer new content to best serve the broader population.\nIn this work, we tackle this information asymmetry with content prompting\npolicies. A content prompt is a hint or suggestion to a provider to make\navailable novel content for which the RS predicts unmet user demand. A\nprompting policy is a sequence of such prompts that is responsive to the\ndynamics of a provider's beliefs, skills and incentives. We aim to determine a\njoint prompting policy that induces a set of providers to make content\navailable that optimizes user social welfare in equilibrium, while respecting\nthe incentives of the providers themselves. Our contributions include: (i) an\nabstract model of the RS ecosystem, including content provider behaviors, that\nsupports such prompting; (ii) the design and theoretical analysis of sequential\nprompting policies for individual providers; (iii) a mixed integer programming\nformulation for optimal joint prompting using path planning in content space;\nand (iv) simple, proof-of-concept experiments illustrating how such policies\nimprove ecosystem health and user welfare.\n","authors":["Siddharth Prasad","Martin Mladenov","Craig Boutilier"],"pdf_url":"https://arxiv.org/pdf/2309.00940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16761v2","updated":"2023-09-02T07:04:04Z","published":"2023-08-31T14:29:10Z","title":"Co-evolving Vector Quantization for ID-based Recommendation","summary":" Category information plays a crucial role in enhancing the quality and\npersonalization of recommendations. Nevertheless, the availability of item\ncategory information is not consistently present, particularly in the context\nof ID-based recommendations. In this work, we propose an alternative approach\nto automatically learn and generate entity (i.e., user and item) categorical\ninformation at different levels of granularity, specifically for ID-based\nrecommendation. Specifically, we devise a co-evolving vector quantization\nframework, namely COVE, which enables the simultaneous learning and refinement\nof code representation and entity embedding in an end-to-end manner, starting\nfrom the randomly initialized states. With its high adaptability, COVE can be\neasily integrated into existing recommendation models. We validate the\neffectiveness of COVE on various recommendation tasks including list\ncompletion, collaborative filtering, and click-through rate prediction, across\ndifferent recommendation models. 
We will publish the code and data for other\nresearchers to reproduce our work.\n","authors":["Qijiong Liu","Jiaren Xiao","Lu Fan","Jieming Zhu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.16761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00841v1","updated":"2023-09-02T06:33:18Z","published":"2023-09-02T06:33:18Z","title":"LeanContext: Cost-Efficient Domain-Specific Question Answering Using\n LLMs","summary":" Question-answering (QA) is a significant application of Large Language Models\n(LLMs), shaping chatbot capabilities across healthcare, education, and customer\nservice. However, widespread LLM integration presents a challenge for small\nbusinesses due to the high expenses of LLM API usage. Costs rise rapidly when\ndomain-specific data (context) is used alongside queries for accurate\ndomain-specific LLM responses. One option is to summarize the context by using\nLLMs and reduce the context. However, this can also filter out useful\ninformation that is necessary to answer some domain-specific queries. In this\npaper, we shift from human-oriented summarizers to AI model-friendly summaries.\nOur approach, LeanContext, efficiently extracts $k$ key sentences from the\ncontext that are closely aligned with the query. The choice of $k$ is neither\nstatic nor random; we introduce a reinforcement learning technique that\ndynamically determines $k$ based on the query and context. The rest of the less\nimportant sentences are reduced using a free open source text reduction method.\nWe evaluate LeanContext against several recent query-aware and query-unaware\ncontext reduction approaches on prominent datasets (arxiv papers and BBC news\narticles). Despite cost reductions of $37.29\\%$ to $67.81\\%$, LeanContext's\nROUGE-1 score decreases only by $1.41\\%$ to $2.65\\%$ compared to a baseline\nthat retains the entire context (no summarization). Additionally, if free\npretrained LLM-based summarizers are used to reduce context (into human\nconsumable summaries), LeanContext can further modify the reduced context to\nenhance the accuracy (ROUGE-1 score) by $13.22\\%$ to $24.61\\%$.\n","authors":["Md Adnan Arefeen","Biplob Debnath","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.00841v1.pdf","comment":"The paper is under review"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.01026v1","updated":"2023-09-02T21:29:53Z","published":"2023-09-02T21:29:53Z","title":"Zero-Shot Recommendations with Pre-Trained Large Language Models for\n Multimodal Nudging","summary":" We present a method for zero-shot recommendation of multimodal non-stationary\ncontent that leverages recent advancements in the field of generative AI. We\npropose rendering inputs of different modalities as textual descriptions and to\nutilize pre-trained LLMs to obtain their numerical representations by computing\nsemantic embeddings. Once unified representations of all content items are\nobtained, the recommendation can be performed by computing an appropriate\nsimilarity metric between them without any additional learning. 
We demonstrate\nour approach on a synthetic multimodal nudging environment, where the inputs\nconsist of tabular, textual, and visual data.\n","authors":["Rachel Harrison","Anton Dereventsov","Anton Bibin"],"pdf_url":"https://arxiv.org/pdf/2309.01026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09675v2","updated":"2023-09-02T18:05:13Z","published":"2023-06-16T08:13:41Z","title":"Multi-View Class Incremental Learning","summary":" Multi-view learning (MVL) has gained great success in integrating information\nfrom multiple perspectives of a dataset to improve downstream task performance.\nTo make MVL methods more practical in an open-ended environment, this paper\ninvestigates a novel paradigm called multi-view class incremental learning\n(MVCIL), where a single model incrementally classifies new classes from a\ncontinual stream of views, requiring no access to earlier views of data.\nHowever, MVCIL is challenged by the catastrophic forgetting of old information\nand the interference with learning new concepts. To address this, we first\ndevelop a randomization-based representation learning technique serving for\nfeature extraction to guarantee their separate view-optimal working states,\nduring which multiple views belonging to a class are presented sequentially;\nThen, we integrate them one by one in the orthogonality fusion subspace spanned\nby the extracted features; Finally, we introduce selective weight consolidation\nfor learning-without-forgetting decision-making while encountering new classes.\nExtensive experiments on synthetic and real-world datasets validate the\neffectiveness of our approach.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Kenji Kawaguchi","Cheng Lian","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2306.09675v2.pdf","comment":"22 pages,4 figures. Preprint submitted to Information Fusion"},{"id":"http://arxiv.org/abs/2302.05543v2","updated":"2023-09-02T11:39:28Z","published":"2023-02-10T23:12:37Z","title":"Adding Conditional Control to Text-to-Image Diffusion Models","summary":" We present ControlNet, a neural network architecture to add spatial\nconditioning controls to large, pretrained text-to-image diffusion models.\nControlNet locks the production-ready large diffusion models, and reuses their\ndeep and robust encoding layers pretrained with billions of images as a strong\nbackbone to learn a diverse set of conditional controls. The neural\narchitecture is connected with \"zero convolutions\" (zero-initialized\nconvolution layers) that progressively grow the parameters from zero and ensure\nthat no harmful noise could affect the finetuning. We test various conditioning\ncontrols, eg, edges, depth, segmentation, human pose, etc, with Stable\nDiffusion, using single or multiple conditions, with or without prompts. We\nshow that the training of ControlNets is robust with small (<50k) and large\n(>1m) datasets. 
Extensive results show that ControlNet may facilitate wider\napplications to control image diffusion models.\n","authors":["Lvmin Zhang","Anyi Rao","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2302.05543v2.pdf","comment":"Codes and Supplementary Material:\n https://github.com/lllyasviel/ControlNet"},{"id":"http://arxiv.org/abs/2208.00164v2","updated":"2023-09-02T08:04:06Z","published":"2022-07-30T08:19:29Z","title":"Distilled Low Rank Neural Radiance Field with Quantization for Light\n Field Compression","summary":" In this paper, we propose a novel light field compression method based on a\nQuantized Distilled Low Rank Neural Radiance Field (QDLR-NeRF) representation.\nWhile existing compression methods encode the set of light field sub-aperture\nimages, our proposed method instead learns an implicit scene representation in\nthe form of a Neural Radiance Field (NeRF), which also enables view synthesis.\nFor reducing its size, the model is first learned under a Low Rank (LR)\nconstraint using a Tensor Train (TT) decomposition in an Alternating Direction\nMethod of Multipliers (ADMM) optimization framework. To further reduce the\nmodel size, the components of the tensor train decomposition need to be\nquantized. However, performing the optimization of the NeRF model by\nsimultaneously taking the low rank constraint and the rate-constrained weight\nquantization into consideration is challenging. To deal with this difficulty,\nwe introduce a network distillation operation that separates the low rank\napproximation and the weight quantization in the network training. The\ninformation from the initial LR constrained NeRF (LR-NeRF) is distilled to a\nmodel of a much smaller dimension (DLR-NeRF) based on the TT decomposition of\nthe LR-NeRF. An optimized global codebook is then learned to quantize all TT\ncomponents, producing the final QDLRNeRF. Experimental results show that our\nproposed method yields better compression efficiency compared with\nstate-of-the-art methods, and it additionally has the advantage of allowing the\nsynthesis of any light field view with a high quality.\n","authors":["Jinglei Shi","Christine Guillemot"],"pdf_url":"https://arxiv.org/pdf/2208.00164v2.pdf","comment":"The explanation of this paper lacks many details and is not well\n organized, we withdraw it to avoid misleading readers"}]},"2023-09-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.03175v1","updated":"2023-09-06T17:24:06Z","published":"2023-09-06T17:24:06Z","title":"Gender-specific Machine Translation with Large Language Models","summary":" Decoder-only Large Language Models (LLMs) have demonstrated potential in\nmachine translation (MT), albeit with performance slightly lagging behind\ntraditional encoder-decoder Neural Machine Translation (NMT) systems. However,\nLLMs offer a unique advantage: the ability to control the properties of the\noutput through prompts. In this study, we harness this flexibility to explore\nLLaMa's capability to produce gender-specific translations for languages with\ngrammatical gender. Our results indicate that LLaMa can generate\ngender-specific translations with competitive accuracy and gender bias\nmitigation when compared to NLLB, a state-of-the-art multilingual NMT system.\nFurthermore, our experiments reveal that LLaMa's translations are robust,\nshowing significant performance drops when evaluated against opposite-gender\nreferences in gender-ambiguous datasets but maintaining consistency in less\nambiguous contexts. 
This research provides insights into the potential and\nchallenges of using LLMs for gender-specific translations and highlights the\nimportance of in-context learning to elicit new tasks in LLMs.\n","authors":["Eduardo Sánchez","Pierre Andrews","Pontus Stenetorp","Mikel Artetxe","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2309.03175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03079v1","updated":"2023-09-06T17:18:55Z","published":"2023-09-06T17:18:55Z","title":"GPT-InvestAR: Enhancing Stock Investment Strategies through Annual\n Report Analysis with Large Language Models","summary":" Annual Reports of publicly listed companies contain vital information about\ntheir financial health which can help assess the potential impact on Stock\nprice of the firm. These reports are comprehensive in nature, going up to, and\nsometimes exceeding, 100 pages. Analysing these reports is cumbersome even for\na single firm, let alone the whole universe of firms that exist. Over the\nyears, financial experts have become proficient in extracting valuable\ninformation from these documents relatively quickly. However, this requires\nyears of practice and experience. This paper aims to simplify the process of\nassessing Annual Reports of all the firms by leveraging the capabilities of\nLarge Language Models (LLMs). The insights generated by the LLM are compiled in\na Quant styled dataset and augmented by historical stock price data. A Machine\nLearning model is then trained with LLM outputs as features. The walkforward\ntest results show promising outperformance wrt S&P500 returns. This paper\nintends to provide a framework for future work in this direction. To facilitate\nthis, the code has been released as open source.\n","authors":["Udit Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.03079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03164v1","updated":"2023-09-06T17:06:31Z","published":"2023-09-06T17:06:31Z","title":"J-Guard: Journalism Guided Adversarially Robust Detection of\n AI-generated News","summary":" The rapid proliferation of AI-generated text online is profoundly reshaping\nthe information landscape. Among various types of AI-generated text,\nAI-generated news presents a significant threat as it can be a prominent source\nof misinformation online. While several recent efforts have focused on\ndetecting AI-generated text in general, these methods require enhanced\nreliability, given concerns about their vulnerability to simple adversarial\nattacks. Furthermore, due to the eccentricities of news writing, applying these\ndetection methods for AI-generated news can produce false positives,\npotentially damaging the reputation of news organizations. To address these\nchallenges, we leverage the expertise of an interdisciplinary team to develop a\nframework, J-Guard, capable of steering existing supervised AI text detectors\nfor detecting AI-generated news while boosting adversarial robustness. By\nincorporating stylistic cues inspired by the unique journalistic attributes,\nJ-Guard effectively distinguishes between real-world journalism and\nAI-generated news articles. 
Our experiments on news articles generated by a\nvast array of AI models, including ChatGPT (GPT3.5), demonstrate the\neffectiveness of J-Guard in enhancing detection capabilities while maintaining\nan average performance decrease of as low as 7% when faced with adversarial\nattacks.\n","authors":["Tharindu Kumarage","Amrita Bhattacharjee","Djordje Padejski","Kristy Roschke","Dan Gillmor","Scott Ruston","Huan Liu","Joshua Garland"],"pdf_url":"https://arxiv.org/pdf/2309.03164v1.pdf","comment":"This Paper is Accepted to The 13th International Joint Conference on\n Natural Language Processing and the 3rd Conference of the Asia-Pacific\n Chapter of the Association for Computational Linguistics (IJCNLP-AACL 2023)"},{"id":"http://arxiv.org/abs/2309.03126v1","updated":"2023-09-06T16:03:59Z","published":"2023-09-06T16:03:59Z","title":"Everyone Deserves A Reward: Learning Customized Human Preferences","summary":" Reward models (RMs) are crucial in aligning large language models (LLMs) with\nhuman preferences for improving interaction quality. However, the real world is\npluralistic, which leads to diversified human preferences based on different\nreligions, politics, cultures, etc. Moreover, each individual can have their\nown unique preferences on various topics. Neglecting the diversity of human\npreferences, current LLM training processes only use a general reward model,\nwhich is below satisfaction for customized or personalized application\nscenarios. To explore customized preference learning, we collect a\ndomain-specific preference (DSP) dataset, which collects preferred responses to\neach given query from four practical domains. Besides, from the perspective of\ndata efficiency, we proposed a three-stage customized RM learning scheme, whose\neffectiveness is empirically verified on both general preference datasets and\nour DSP set. Furthermore, we test multiple training and data strategies on the\nthree learning stages, and have found several ways to better preserve the\ngeneral preferring ability while training the customized RMs, especially\ngeneral preference enrichment and customized preference imitation learning. The\nDSP dataset and code are available at https://github.com/Linear95/DSP.\n","authors":["Pengyu Cheng","Jiawen Xie","Ke Bai","Yong Dai","Nan Du"],"pdf_url":"https://arxiv.org/pdf/2309.03126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03118v1","updated":"2023-09-06T15:55:01Z","published":"2023-09-06T15:55:01Z","title":"Knowledge Solver: Teaching LLMs to Search for Domain Knowledge from\n Knowledge Graphs","summary":" Large language models (LLMs), such as ChatGPT and GPT-4, are versatile and\ncan solve different tasks due to their emergent ability and generalizability.\nHowever, LLMs sometimes lack domain-specific knowledge to perform tasks, which\nwould also cause hallucination during inference. In some previous works,\nadditional modules like graph neural networks (GNNs) are trained on retrieved\nknowledge from external knowledge bases, aiming to mitigate the problem of\nlacking domain-specific knowledge. However, incorporating additional modules:\n1) would need retraining additional modules when encountering novel domains; 2)\nwould become a bottleneck since LLMs' strong abilities are not fully utilized\nfor retrieval. In this paper, we propose a paradigm, termed Knowledge Solver\n(KSL), to teach LLMs to search for essential knowledge from external knowledge\nbases by harnessing their own strong generalizability. 
Specifically, we design\na simple yet effective prompt to transform retrieval into a multi-hop decision\nsequence, which empowers LLMs with searching knowledge ability in zero-shot\nmanner. Additionally, KSL is able to provide complete retrieval paths and\ntherefore increase explainability of LLMs' reasoning processes. We conduct\nexperiments on three datasets: CommonsenseQA, OpenbookQA, and MedQA-USMLE, and\nfound that our approach improves LLM baseline performance by a relatively large\nmargin.\n","authors":["Chao Feng","Xinyu Zhang","Zichu Fei"],"pdf_url":"https://arxiv.org/pdf/2309.03118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03103v1","updated":"2023-09-06T15:41:38Z","published":"2023-09-06T15:41:38Z","title":"ContrastWSD: Enhancing Metaphor Detection with Word Sense Disambiguation\n Following the Metaphor Identification Procedure","summary":" This paper presents ContrastWSD, a RoBERTa-based metaphor detection model\nthat integrates the Metaphor Identification Procedure (MIP) and Word Sense\nDisambiguation (WSD) to extract and contrast the contextual meaning with the\nbasic meaning of a word to determine whether it is used metaphorically in a\nsentence. By utilizing the word senses derived from a WSD model, our model\nenhances the metaphor detection process and outperforms other methods that rely\nsolely on contextual embeddings or integrate only the basic definitions and\nother external knowledge. We evaluate our approach on various benchmark\ndatasets and compare it with strong baselines, indicating the effectiveness in\nadvancing metaphor detection.\n","authors":["Mohamad Elzohbi","Richard Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.03103v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.01940v2","updated":"2023-09-06T15:36:11Z","published":"2023-09-05T04:12:01Z","title":"CodeApex: A Bilingual Programming Evaluation Benchmark for Large\n Language Models","summary":" With the emergence of Large Language Models (LLMs), there has been a\nsignificant improvement in the programming capabilities of models, attracting\ngrowing attention from researchers. We propose CodeApex, a bilingual benchmark\ndataset focusing on the programming comprehension and code generation abilities\nof LLMs. CodeApex comprises three types of multiple-choice questions:\nconceptual understanding, commonsense reasoning, and multi-hop reasoning,\ndesigned to evaluate LLMs on programming comprehension tasks. Additionally,\nCodeApex utilizes algorithmic questions and corresponding test cases to assess\nthe code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs,\nincluding both general-purpose and specialized models. GPT exhibits the best\nprogramming capabilities, achieving approximate accuracies of 50% and 56% on\nthe two tasks, respectively. There is still significant room for improvement in\nprogramming tasks. We hope that CodeApex can serve as a reference for\nevaluating the coding capabilities of LLMs, further promoting their development\nand growth. 
Datasets are released at https://github.com/APEXLAB/CodeApex.git.\nCodeApex submission website is https://apex.sjtu.edu.cn/codeapex/.\n","authors":["Lingyue Fu","Huacan Chai","Shuang Luo","Kounianhua Du","Weiming Zhang","Longteng Fan","Jiayi Lei","Renting Rui","Jianghao Lin","Yuchen Fang","Yifan Liu","Jingkuan Wang","Siyuan Qi","Kangning Zhang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.01940v2.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2309.03064v1","updated":"2023-09-06T15:07:23Z","published":"2023-09-06T15:07:23Z","title":"A Multimodal Analysis of Influencer Content on Twitter","summary":" Influencer marketing involves a wide range of strategies in which brands\ncollaborate with popular content creators (i.e., influencers) to leverage their\nreach, trust, and impact on their audience to promote and endorse products or\nservices. Because followers of influencers are more likely to buy a product\nafter receiving an authentic product endorsement rather than an explicit direct\nproduct promotion, the line between personal opinions and commercial content\npromotion is frequently blurred. This makes automatic detection of regulatory\ncompliance breaches related to influencer advertising (e.g., misleading\nadvertising or hidden sponsorships) particularly difficult. In this work, we\n(1) introduce a new Twitter (now X) dataset consisting of 15,998 influencer\nposts mapped into commercial and non-commercial categories for assisting in the\nautomatic detection of commercial influencer content; (2) experiment with an\nextensive set of predictive models that combine text and visual information\nshowing that our proposed cross-attention approach outperforms state-of-the-art\nmultimodal models; and (3) conduct a thorough analysis of strengths and\nlimitations of our models. We show that multimodal modeling is useful for\nidentifying commercial posts, reducing the amount of false positives, and\ncapturing relevant context that aids in the discovery of undisclosed commercial\nposts.\n","authors":["Danae Sánchez Villegas","Catalina Goanta","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2309.03064v1.pdf","comment":"Accepted at AACL 2023"},{"id":"http://arxiv.org/abs/2308.04566v4","updated":"2023-09-06T14:29:54Z","published":"2023-08-08T20:29:13Z","title":"Single-Sentence Reader: A Novel Approach for Addressing Answer Position\n Bias","summary":" Machine Reading Comprehension (MRC) models tend to take advantage of spurious\ncorrelations (also known as dataset bias or annotation artifacts in the\nresearch community). Consequently, these models may perform the MRC task\nwithout fully comprehending the given context and question, which is\nundesirable since it may result in low robustness against distribution shift.\nThe main focus of this paper is answer-position bias, where a significant\npercentage of training questions have answers located solely in the first\nsentence of the context. We propose a Single-Sentence Reader as a new approach\nfor addressing answer position bias in MRC. 
Remarkably, in our experiments with\nsix different models, our proposed Single-Sentence Readers trained on biased\ndataset achieve results that nearly match those of models trained on normal\ndataset, proving their effectiveness in addressing the answer position bias.\nOur study also discusses several challenges our Single-Sentence Readers\nencounter and proposes a potential solution.\n","authors":["Son Quoc Tran","Matt Kretchmar"],"pdf_url":"https://arxiv.org/pdf/2308.04566v4.pdf","comment":"10 pages, 5 tables, 2 figures"},{"id":"http://arxiv.org/abs/2309.00424v2","updated":"2023-09-06T14:27:40Z","published":"2023-09-01T12:35:43Z","title":"Learning Speech Representation From Contrastive Token-Acoustic\n Pretraining","summary":" For fine-grained generation and recognition tasks such as\nminimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic\nspeech recognition (ASR), the intermediate representations extracted from\nspeech should serve as a \"bridge\" between text and acoustic information,\ncontaining information from both modalities. The semantic content is\nemphasized, while the paralinguistic information such as speaker identity and\nacoustic details should be de-emphasized. However, existing methods for\nextracting fine-grained intermediate representations from speech suffer from\nissues of excessive redundancy and dimension explosion. Contrastive learning is\na good method for modeling intermediate representations from two modalities.\nHowever, existing contrastive learning methods in the audio field focus on\nextracting global descriptive information for downstream audio classification\ntasks, making them unsuitable for TTS, VC, and ASR tasks. To address these\nissues, we propose a method named \"Contrastive Token-Acoustic Pretraining\n(CTAP)\", which uses two encoders to bring phoneme and speech into a joint\nmultimodal space, learning how to connect phoneme and speech at the frame\nlevel. The CTAP model is trained on 210k speech and phoneme text pairs,\nachieving minimally-supervised TTS, VC, and ASR. The proposed CTAP method\noffers a promising solution for fine-grained generation and recognition\ndownstream tasks in speech processing.\n","authors":["Chunyu Qiang","Hao Li","Yixin Tian","Ruibo Fu","Tao Wang","Longbiao Wang","Jianwu Dang"],"pdf_url":"https://arxiv.org/pdf/2309.00424v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01860v2","updated":"2023-09-06T13:06:45Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. 
On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v2.pdf","comment":"This version has some errors. Our schedule is packed, so we don't\n have enough time to correct it. We will share another work when we have time\n to fix this"},{"id":"http://arxiv.org/abs/2309.02915v1","updated":"2023-09-06T11:20:41Z","published":"2023-09-06T11:20:41Z","title":"Persona-aware Generative Model for Code-mixed Language","summary":" Code-mixing and script-mixing are prevalent across online social networks and\nmultilingual societies. However, a user's preference toward code-mixing depends\non the socioeconomic status, demographics of the user, and the local context,\nwhich existing generative models mostly ignore while generating code-mixed\ntexts. In this work, we make a pioneering attempt to develop a persona-aware\ngenerative model to generate texts resembling real-life code-mixed texts of\nindividuals. We propose a Persona-aware Generative Model for Code-mixed\nGeneration, PARADOX, a novel Transformer-based encoder-decoder model that\nencodes an utterance conditioned on a user's persona and generates code-mixed\ntexts without monolingual reference data. We propose an alignment module that\nre-calibrates the generated sequence to resemble real-life code-mixed texts.\nPARADOX generates code-mixed texts that are semantically more meaningful and\nlinguistically more valid. To evaluate the personification capabilities of\nPARADOX, we propose four new metrics -- CM BLEU, CM Rouge-1, CM Rouge-L and CM\nKS. On average, PARADOX achieves 1.6 points better CM BLEU, 47% better\nperplexity and 32% better semantic coherence than the non-persona-based\ncounterparts.\n","authors":["Ayan Sengupta","Md Shad Akhtar","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2309.02915v1.pdf","comment":"4 tables, 4 figures"},{"id":"http://arxiv.org/abs/2309.02914v1","updated":"2023-09-06T11:20:02Z","published":"2023-09-06T11:20:02Z","title":"Leave no Place Behind: Improved Geolocation in Humanitarian Documents","summary":" Geographical location is a crucial element of humanitarian response,\noutlining vulnerable populations, ongoing events, and available resources.\nLatest developments in Natural Language Processing may help in extracting vital\ninformation from the deluge of reports and documents produced by the\nhumanitarian sector. However, the performance and biases of existing\nstate-of-the-art information extraction tools are unknown. In this work, we\ndevelop annotated resources to fine-tune the popular Named Entity Recognition\n(NER) tools Spacy and roBERTa to perform geotagging of humanitarian texts. We\nthen propose a geocoding method FeatureRank which links the candidate locations\nto the GeoNames database. We find that not only does the humanitarian-domain\ndata improves the performance of the classifiers (up to F1 = 0.92), but it also\nalleviates some of the bias of the existing tools, which erroneously favor\nlocations in the Western countries. Thus, we conclude that more resources from\nnon-Western documents are necessary to ensure that off-the-shelf NER systems\nare suitable for the deployment in the humanitarian sector.\n","authors":["Enrico M. 
Belliardo","Kyriaki Kalimeri","Yelena Mejova"],"pdf_url":"https://arxiv.org/pdf/2309.02914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02912v1","updated":"2023-09-06T11:15:47Z","published":"2023-09-06T11:15:47Z","title":"On the Challenges of Building Datasets for Hate Speech Detection","summary":" Detection of hate speech has been formulated as a standalone application of\nNLP and different approaches have been adopted for identifying the target\ngroups, obtaining raw data, defining the labeling process, choosing the\ndetection algorithm, and evaluating the performance in the desired setting.\nHowever, unlike other downstream tasks, hate speech suffers from the lack of\nlarge-sized, carefully curated, generalizable datasets owing to the highly\nsubjective nature of the task. In this paper, we first analyze the issues\nsurrounding hate speech detection through a data-centric lens. We then outline\na holistic framework to encapsulate the data creation pipeline across seven\nbroad dimensions by taking the specific example of hate speech towards sexual\nminorities. We posit that practitioners would benefit from following this\nframework as a form of best practice when creating hate speech datasets in the\nfuture.\n","authors":["Vitthal Bhandari"],"pdf_url":"https://arxiv.org/pdf/2309.02912v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.02902v1","updated":"2023-09-06T10:51:34Z","published":"2023-09-06T10:51:34Z","title":"ViCGCN: Graph Convolutional Network with Contextualized Language Models\n for Social Media Mining in Vietnamese","summary":" Social media processing is a fundamental task in natural language processing\nwith numerous applications. As Vietnamese social media and information science\nhave grown rapidly, the necessity of information-based mining on Vietnamese\nsocial media has become crucial. However, state-of-the-art research faces\nseveral significant drawbacks, including imbalanced data and noisy data on\nsocial media platforms. Imbalanced and noisy are two essential issues that need\nto be addressed in Vietnamese social media texts. Graph Convolutional Networks\ncan address the problems of imbalanced and noisy data in text classification on\nsocial media by taking advantage of the graph structure of the data. This study\npresents a novel approach based on contextualized language model (PhoBERT) and\ngraph-based method (Graph Convolutional Networks). In particular, the proposed\napproach, ViCGCN, jointly trained the power of Contextualized embeddings with\nthe ability of Graph Convolutional Networks, GCN, to capture more syntactic and\nsemantic dependencies to address those drawbacks. Extensive experiments on\nvarious Vietnamese benchmark datasets were conducted to verify our approach.\nThe observation shows that applying GCN to BERTology models as the final layer\nsignificantly improves performance. Moreover, the experiments demonstrate that\nViCGCN outperforms 13 powerful baseline models, including BERTology models,\nfusion BERTology and GCN models, other baselines, and SOTA on three benchmark\nsocial media datasets. Our proposed ViCGCN approach demonstrates a significant\nimprovement of up to 6.21%, 4.61%, and 2.63% over the best Contextualized\nLanguage Models, including multilingual and monolingual, on three benchmark\ndatasets, UIT-VSMEC, UIT-ViCTSD, and UIT-VSFC, respectively. 
Additionally, our\nintegrated model ViCGCN achieves the best performance compared to other\nBERTology integrated with GCN models.\n","authors":["Chau-Thang Phan","Quoc-Nam Nguyen","Chi-Thanh Dang","Trong-Hop Do","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.02902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02887v1","updated":"2023-09-06T10:20:59Z","published":"2023-09-06T10:20:59Z","title":"A deep Natural Language Inference predictor without language-specific\n training data","summary":" In this paper we present a technique of NLP to tackle the problem of\ninference relation (NLI) between pairs of sentences in a target language of\nchoice without a language-specific training dataset. We exploit a generic\ntranslation dataset, manually translated, along with two instances of the same\npre-trained model - the first to generate sentence embeddings for the source\nlanguage, and the second fine-tuned over the target language to mimic the\nfirst. This technique is known as Knowledge Distillation. The model has been\nevaluated over machine translated Stanford NLI test dataset, machine translated\nMulti-Genre NLI test dataset, and manually translated RTE3-ITA test dataset. We\nalso test the proposed architecture over different tasks to empirically\ndemonstrate the generality of the NLI task. The model has been evaluated over\nthe native Italian ABSITA dataset, on the tasks of Sentiment Analysis,\nAspect-Based Sentiment Analysis, and Topic Recognition. We emphasise the\ngenerality and exploitability of the Knowledge Distillation technique that\noutperforms other methodologies based on machine translation, even though the\nformer was not directly trained on the data it was tested over.\n","authors":["Lorenzo Corradi","Alessandro Manenti","Francesca Del Bonifro","Francesco Setti","Dario Del Sorbo"],"pdf_url":"https://arxiv.org/pdf/2309.02887v1.pdf","comment":"Conference: ICIAP2023"},{"id":"http://arxiv.org/abs/2309.02884v1","updated":"2023-09-06T10:20:06Z","published":"2023-09-06T10:20:06Z","title":"Aligning Large Language Models for Clinical Tasks","summary":" Large Language Models (LLMs) have demonstrated remarkable adaptability,\nshowcasing their capacity to excel in tasks for which they were not explicitly\ntrained. However, despite their impressive natural language processing (NLP)\ncapabilities, effective alignment of LLMs remains a crucial challenge when\ndeploying them for specific clinical applications. The ability to generate\nresponses with factually accurate content and to engage in non-trivial\nreasoning steps are crucial for the LLMs to be eligible for applications in\nclinical medicine. Employing a combination of techniques including\ninstruction-tuning and in-prompt strategies like few-shot and chain of thought\nprompting has significantly enhanced the performance of LLMs. Our proposed\nalignment strategy for medical question-answering, known as\n'expand-guess-refine', offers a parameter and data-efficient solution. 
A\npreliminary analysis of this method demonstrated outstanding performance,\nachieving a score of 70.63% on a subset of questions sourced from the USMLE\ndataset.\n","authors":["Supun Manathunga","Isuru Hettigoda"],"pdf_url":"https://arxiv.org/pdf/2309.02884v1.pdf","comment":"10 papers, 3 figures"},{"id":"http://arxiv.org/abs/2309.01740v2","updated":"2023-09-06T09:34:53Z","published":"2023-09-04T17:58:01Z","title":"An Empirical Analysis for Zero-Shot Multi-Label Classification on\n COVID-19 CT Scans and Uncurated Reports","summary":" The pandemic resulted in vast repositories of unstructured data, including\nradiology reports, due to increased medical examinations. Previous research on\nautomated diagnosis of COVID-19 primarily focuses on X-ray images, despite\ntheir lower precision compared to computed tomography (CT) scans. In this work,\nwe leverage unstructured data from a hospital and harness the fine-grained\ndetails offered by CT scans to perform zero-shot multi-label classification\nbased on contrastive visual language learning. In collaboration with human\nexperts, we investigate the effectiveness of multiple zero-shot models that aid\nradiologists in detecting pulmonary embolisms and identifying intricate lung\ndetails like ground glass opacities and consolidations. Our empirical analysis\nprovides an overview of the possible solutions to target such fine-grained\ntasks, so far overlooked in the medical multimodal pretraining literature. Our\ninvestigation promises future advancements in the medical image analysis\ncommunity by addressing some challenges associated with unstructured data and\nfine-grained multi-label classification.\n","authors":["Ethan Dack","Lorenzo Brigato","Matthew McMurray","Matthias Fontanellaz","Thomas Frauenfelder","Hanno Hoppe","Aristomenis Exadaktylos","Thomas Geiser","Manuela Funke-Chambour","Andreas Christe","Lukas Ebner","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01740v2.pdf","comment":"Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV) Workshops 2023"},{"id":"http://arxiv.org/abs/2309.02823v1","updated":"2023-09-06T08:11:39Z","published":"2023-09-06T08:11:39Z","title":"Promoting Open-domain Dialogue Generation through Learning Pattern\n Information between Contexts and Responses","summary":" Recently, utilizing deep neural networks to build the opendomain dialogue\nmodels has become a hot topic. However, the responses generated by these models\nsuffer from many problems such as responses not being contextualized and tend\nto generate generic responses that lack information content, damaging the\nuser's experience seriously. Therefore, many studies try introducing more\ninformation into the dialogue models to make the generated responses more vivid\nand informative. Unlike them, this paper improves the quality of generated\nresponses by learning the implicit pattern information between contexts and\nresponses in the training samples. In this paper, we first build an open-domain\ndialogue model based on the pre-trained language model (i.e., GPT-2). And then,\nan improved scheduled sampling method is proposed for pre-trained models, by\nwhich the responses can be used to guide the response generation in the\ntraining phase while avoiding the exposure bias problem. More importantly, we\ndesign a response-aware mechanism for mining the implicit pattern information\nbetween contexts and responses so that the generated replies are more diverse\nand approximate to human replies. 
Finally, we evaluate the proposed model (RAD)\non the Persona-Chat and DailyDialog datasets; and the experimental results show\nthat our model outperforms the baselines on most automatic and manual metrics.\n","authors":["Mengjuan Liu","Chenyang Liu","Yunfan Yang","Jiang Liu","Mohan Jing"],"pdf_url":"https://arxiv.org/pdf/2309.02823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02812v1","updated":"2023-09-06T08:00:17Z","published":"2023-09-06T08:00:17Z","title":"Agent-based simulation of pedestrians' earthquake evacuation;\n application to Beirut, Lebanon","summary":" Most seismic risk assessment methods focus on estimating the damages to the\nbuilt environment and the consequent socioeconomic losses without fully taking\ninto account the social aspect of risk. Yet, human behaviour is a key element\nin predicting the human impact of an earthquake, therefore, it is important to\ninclude it in quantitative risk assessment studies. In this study, an\ninterdisciplinary approach simulating pedestrians' evacuation during\nearthquakes at the city scale is developed using an agent-based model. The\nmodel integrates the seismic hazard, the physical vulnerability as well as\nindividuals' behaviours and mobility. The simulator is applied to the case of\nBeirut, Lebanon. Lebanon is at the heart of the Levant fault system that has\ngenerated several Mw>7 earthquakes, the latest being in 1759. It is one of the\ncountries with the highest seismic risk in the Mediterranean region. This is\ndue to the high seismic vulnerability of the buildings due to the absence of\nmandatory seismic regulation until 2012, the high level of urbanization, and\nthe lack of adequate spatial planning and risk prevention policies. Beirut as\nthe main residential, economic and institutional hub of Lebanon is densely\npopulated. To accommodate the growing need for urban development, constructions\nhave almost taken over all of the green areas of the city; squares and gardens\nare disappearing to give place to skyscrapers. However, open spaces are safe\nplaces to shelter, away from debris, and therefore play an essential role in\nearthquake evacuation. Despite the massive urbanization, there are a few open\nspaces but locked gates and other types of anthropogenic barriers often limit\ntheir access. To simulate this complex context, pedestrians' evacuation\nsimulations are run in a highly realistic spatial environment implemented in\nGAMA [1]. Previous data concerning soil and buildings in Beirut [2, 3] are\ncomplemented by new geographic data extracted from high-resolution Pleiades\nsatellite images. The seismic loading is defined as a peak ground acceleration\nof 0.3g, as stated in Lebanese seismic regulations. Building damages are\nestimated using an artificial neural network trained to predict the mean damage\n[4] based on the seismic loading as well as the soil and building vibrational\nproperties [5]. Moreover, the quantity and the footprint of the generated\ndebris around each building are also estimated and included in the model. We\nsimulate how topography, buildings, debris, and access to open spaces, affect\nindividuals' mobility. Two city configurations are implemented: 1. Open spaces\nare accessible without any barriers; 2. Access to some open spaces is blocked.\nThe first simulation results show that while 52% of the population is able to\narrive to an open space within 5 minutes after an earthquake, this number is\nreduced to 39% when one of the open spaces is locked. 
These results show that\nthe presence of accessible open spaces in a city and their proximity to the\nresidential buildings is a crucial factor for ensuring people's safety when an\nearthquake occurs.\n","authors":["Rouba Iskandar","Kamel Allaw","Julie Dugdale","Elise Beck","Jocelyne Adjizian-Gérard","Cécile Cornou","Jacques Harb","Pascal Lacroix","Nada Badaro-Saliba","Stéphane Cartier","Rita Zaarour"],"pdf_url":"https://arxiv.org/pdf/2309.02812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02784v1","updated":"2023-09-06T06:51:15Z","published":"2023-09-06T06:51:15Z","title":"Norm Tweaking: High-performance Low-bit Quantization of Large Language\n Models","summary":" As the size of large language models (LLMs) continues to grow, model\ncompression without sacrificing accuracy has become a crucial challenge for\ndeployment. While some quantization methods, such as GPTQ, have made progress\nin achieving acceptable 4-bit weight-only quantization, attempts at lower bit\nquantization often result in severe performance degradation. In this paper, we\nintroduce a technique called norm tweaking, which can be used as a plugin in\ncurrent PTQ methods to achieve high precision while being cost-efficient. Our\napproach is inspired by the observation that rectifying the quantized\nactivation distribution to match its float counterpart can readily restore\naccuracy for LLMs. To achieve this, we carefully design a tweaking strategy\nthat includes calibration data generation and channel-wise distance constraint\nto update the weights of normalization layers for better generalization. We\nconduct extensive experiments on various datasets using several open-sourced\nLLMs. Our method demonstrates significant improvements in both weight-only\nquantization and joint quantization of weights and activations, surpassing\nexisting PTQ methods. On GLM-130B and OPT-66B, our method even achieves the\nsame level of accuracy at 2-bit quantization as their float ones. Our simple\nand effective approach makes it more practical for real-world applications.\n","authors":["Liang Li","Qingyuan Li","Bo Zhang","Xiangxiang Chu"],"pdf_url":"https://arxiv.org/pdf/2309.02784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02780v1","updated":"2023-09-06T06:44:26Z","published":"2023-09-06T06:44:26Z","title":"GRASS: Unified Generation Model for Speech Semantic Understanding","summary":" This paper explores the instruction fine-tuning technique for speech semantic\nunderstanding by introducing a unified end-to-end (E2E) framework that\ngenerates semantic labels conditioned on a task-related prompt for audio data.\nWe pre-train the model using large and diverse data, where instruction-speech\npairs are constructed via a text-to-speech (TTS) system. Extensive experiments\ndemonstrate that our proposed model significantly outperforms state-of-the-art\n(SOTA) models after fine-tuning downstream tasks. Furthermore, the proposed\nmodel achieves competitive performance in zero-shot and few-shot scenarios. 
To\nfacilitate future work on instruction fine-tuning for speech-to-semantic tasks,\nwe release our instruction dataset and code.\n","authors":["Aobo Xia","Shuyu Lei","Yushu Yang","Xiang Guo","Hua Chai"],"pdf_url":"https://arxiv.org/pdf/2309.02780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02772v1","updated":"2023-09-06T06:27:33Z","published":"2023-09-06T06:27:33Z","title":"Improving Code Generation by Dynamic Temperature Sampling","summary":" Recently, Large Language Models (LLMs) have shown impressive results in code\ngeneration. However, existing decoding strategies are designed for Natural\nLanguage (NL) generation, overlooking the differences between NL and\nprogramming languages (PL). Due to this oversight, a better decoding strategy\nfor code generation remains an open question. In this paper, we conduct the\nfirst systematic study to explore a decoding strategy specialized in code\ngeneration. With an analysis of loss distributions of code tokens, we find that\ncode tokens can be divided into two categories: challenging tokens that are\ndifficult to predict and confident tokens that can be easily inferred. Among\nthem, the challenging tokens mainly appear at the beginning of a code block.\nInspired by the above findings, we propose a simple yet effective method:\nAdaptive Temperature (AdapT) sampling, which dynamically adjusts the\ntemperature coefficient when decoding different tokens. We apply a larger\ntemperature when sampling for challenging tokens, allowing LLMs to explore\ndiverse choices. We employ a smaller temperature for confident tokens avoiding\nthe influence of tail randomness noises. We apply AdapT sampling to LLMs with\ndifferent sizes and conduct evaluations on two popular datasets. Results show\nthat AdapT sampling significantly outperforms state-of-the-art decoding\nstrategy.\n","authors":["Yuqi Zhu","Jia Allen Li","Ge Li","YunFei Zhao","Jia Li","Zhi Jin","Hong Mei"],"pdf_url":"https://arxiv.org/pdf/2309.02772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02740v1","updated":"2023-09-06T05:51:19Z","published":"2023-09-06T05:51:19Z","title":"Rubric-Specific Approach to Automated Essay Scoring with Augmentation\n Training","summary":" Neural based approaches to automatic evaluation of subjective responses have\nshown superior performance and efficiency compared to traditional rule-based\nand feature engineering oriented solutions. However, it remains unclear whether\nthe suggested neural solutions are sufficient replacements of human raters as\nwe find recent works do not properly account for rubric items that are\nessential for automated essay scoring during model training and validation. 
In\nthis paper, we propose a series of data augmentation operations that train and\ntest an automated scoring model to learn features and functions overlooked by\nprevious works while still achieving state-of-the-art performance in the\nAutomated Student Assessment Prize dataset.\n","authors":["Brian Cho","Youngbin Jang","Jaewoong Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.02740v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2309.02731v1","updated":"2023-09-06T05:33:57Z","published":"2023-09-06T05:33:57Z","title":"HC3 Plus: A Semantic-Invariant Human ChatGPT Comparison Corpus","summary":" ChatGPT has gained significant interest due to its impressive performance,\nbut people are increasingly concerned about its potential risks, particularly\naround the detection of AI-generated content (AIGC), which is often difficult\nfor untrained humans to identify. Current datasets utilized for detecting\nChatGPT-generated text primarily center around question-answering, yet they\ntend to disregard tasks that possess semantic-invariant properties, such as\nsummarization, translation, and paraphrasing. Our primary studies demonstrate\nthat detecting model-generated text on semantic-invariant tasks is more\ndifficult. To fill this gap, we introduce a more extensive and comprehensive\ndataset that considers more types of tasks than previous work, including\nsemantic-invariant tasks. In addition, the model after a large number of task\ninstruction fine-tuning shows a strong powerful performance. Owing to its\nprevious success, we further instruct fine-tuning Tk-instruct and built a more\npowerful detection system. Experimental results show that our proposed detector\noutperforms the previous state-of-the-art RoBERTa-based detector.\n","authors":["Zhenpeng Su","Xing Wu","Wei Zhou","Guangyuan Ma","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2309.02731v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02726v1","updated":"2023-09-06T05:19:41Z","published":"2023-09-06T05:19:41Z","title":"Large Language Models for Automated Open-domain Scientific Hypotheses\n Discovery","summary":" Hypothetical induction is recognized as the main reasoning type when\nscientists make observations about the world and try to propose hypotheses to\nexplain those observations. Past research on hypothetical induction has a\nlimited setting that (1) the observation annotations of the dataset are not raw\nweb corpus but are manually selected sentences (resulting in a close-domain\nsetting); and (2) the ground truth hypotheses annotations are mostly\ncommonsense knowledge, making the task less challenging. In this work, we\npropose the first NLP dataset for social science academic hypotheses discovery,\nconsisting of 50 recent papers published in top social science journals. Raw\nweb corpora that are necessary for developing hypotheses in the published\npapers are also collected in the dataset, with the final goal of creating a\nsystem that automatically generates valid, novel, and helpful (to human\nresearchers) hypotheses, given only a pile of raw web corpora. The new dataset\ncan tackle the previous problems because it requires to (1) use raw web corpora\nas observations; and (2) propose hypotheses even new to humanity. A\nmulti-module framework is developed for the task, as well as three different\nfeedback mechanisms that empirically show performance gain over the base\nframework. 
Finally, our framework exhibits high performance in terms of both\nGPT-4 based evaluation and social science expert evaluation.\n","authors":["Zonglin Yang","Xinya Du","Junxian Li","Jie Zheng","Soujanya Poria","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2309.02726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02724v1","updated":"2023-09-06T05:18:43Z","published":"2023-09-06T05:18:43Z","title":"Offensive Hebrew Corpus and Detection using BERT","summary":" Offensive language detection has been well studied in many languages, but it\nis lagging behind in low-resource languages, such as Hebrew. In this paper, we\npresent a new offensive language corpus in Hebrew. A total of 15,881 tweets\nwere retrieved from Twitter. Each was labeled with one or more of five classes\n(abusive, hate, violence, pornographic, or none offensive) by Arabic-Hebrew\nbilingual speakers. The annotation process was challenging as each annotator is\nexpected to be familiar with the Israeli culture, politics, and practices to\nunderstand the context of each tweet. We fine-tuned two Hebrew BERT models,\nHeBERT and AlephBERT, using our proposed dataset and another published dataset.\nWe observed that our data boosts HeBERT performance by 2% when combined with\nD_OLaH. Fine-tuning AlephBERT on our data and testing on D_OLaH yields 69%\naccuracy, while fine-tuning on D_OLaH and testing on our data yields 57%\naccuracy, which may be an indication to the generalizability our data offers.\nOur dataset and fine-tuned models are available on GitHub and Huggingface.\n","authors":["Nagham Hamad","Mustafa Jarrar","Mohammad Khalilia","Nadim Nashif"],"pdf_url":"https://arxiv.org/pdf/2309.02724v1.pdf","comment":"8 pages, 1 figure, The 20th ACS/IEEE International Conference on\n Computer Systems and Applications (AICCSA)"},{"id":"http://arxiv.org/abs/2302.06132v2","updated":"2023-09-06T04:53:24Z","published":"2023-02-13T06:38:25Z","title":"NNKGC: Improving Knowledge Graph Completion with Node Neighborhoods","summary":" Knowledge graph completion (KGC) aims to discover missing relations of query\nentities. Current text-based models utilize the entity name and description to\ninfer the tail entity given the head entity and a certain relation. Existing\napproaches also consider the neighborhood of the head entity. However, these\nmethods tend to model the neighborhood using a flat structure and are only\nrestricted to 1-hop neighbors. In this work, we propose a node\nneighborhood-enhanced framework for knowledge graph completion. It models the\nhead entity neighborhood from multiple hops using graph neural networks to\nenrich the head node information. Moreover, we introduce an additional edge\nlink prediction task to improve KGC. Evaluation on two public datasets shows\nthat this framework is simple yet effective. The case study also shows that the\nmodel is able to predict explainable predictions.\n","authors":["Irene Li","Boming Yang"],"pdf_url":"https://arxiv.org/pdf/2302.06132v2.pdf","comment":"DL4KG Workshop, ISWC 2023"},{"id":"http://arxiv.org/abs/2309.02706v1","updated":"2023-09-06T04:38:16Z","published":"2023-09-06T04:38:16Z","title":"HAE-RAE Bench: Evaluation of Korean Knowledge in Language Models","summary":" Large Language Models (LLMs) pretrained on massive corpora exhibit remarkable\ncapabilities across a wide range of tasks, however, the attention given to\nnon-English languages has been limited in this field of research. 
To address\nthis gap and assess the proficiency of language models in the Korean language\nand culture, we present HAE-RAE Bench, covering 6 tasks including vocabulary,\nhistory, and general knowledge. Our evaluation of language models on this\nbenchmark highlights the potential advantages of employing Large\nLanguage-Specific Models(LLSMs) over a comprehensive, universal model like\nGPT-3.5. Remarkably, our study reveals that models approximately 13 times\nsmaller than GPT-3.5 can exhibit similar performance levels in terms of\nlanguage-specific knowledge retrieval. This observation underscores the\nimportance of homogeneous corpora for training professional-level\nlanguage-specific models. On the contrary, we also observe a perplexing\nperformance dip in these smaller LMs when they are tasked to generate\nstructured answers.\n","authors":["Guijin Son","Hanwool Lee","Suwan Kim","Jaecheol Lee","Je Won Yeom","Jihyu Jung","Jung Woo Kim","Songseong Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02705v1","updated":"2023-09-06T04:37:20Z","published":"2023-09-06T04:37:20Z","title":"Certifying LLM Safety against Adversarial Prompting","summary":" Large language models (LLMs) released for public use incorporate guardrails\nto ensure their output is safe, often referred to as \"model alignment.\" An\naligned language model should decline a user's request to produce harmful\ncontent. However, such safety measures are vulnerable to adversarial prompts,\nwhich contain maliciously designed token sequences to circumvent the model's\nsafety guards and cause it to produce harmful content. In this work, we\nintroduce erase-and-check, the first framework to defend against adversarial\nprompts with verifiable safety guarantees. We erase tokens individually and\ninspect the resulting subsequences using a safety filter. Our procedure labels\nthe input prompt as harmful if any subsequences or the input prompt are\ndetected as harmful by the filter. This guarantees that any adversarial\nmodification of a harmful prompt up to a certain size is also labeled harmful.\nWe defend against three attack modes: i) adversarial suffix, which appends an\nadversarial sequence at the end of the prompt; ii) adversarial insertion, where\nthe adversarial sequence is inserted anywhere in the middle of the prompt; and\niii) adversarial infusion, where adversarial tokens are inserted at arbitrary\npositions in the prompt, not necessarily as a contiguous block. Empirical\nresults demonstrate that our technique obtains strong certified safety\nguarantees on harmful prompts while maintaining good performance on safe\nprompts. For example, against adversarial suffixes of length 20, it certifiably\ndetects 93% of the harmful prompts and labels 94% of the safe prompts as safe\nusing the open source language model Llama 2 as the safety filter.\n","authors":["Aounon Kumar","Chirag Agarwal","Suraj Srinivas","Soheil Feizi","Hima Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2309.02705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01824v2","updated":"2023-09-06T04:08:08Z","published":"2022-09-05T08:19:26Z","title":"A Survey on Measuring and Mitigating Reasoning Shortcuts in Machine\n Reading Comprehension","summary":" The issue of shortcut learning is widely known in NLP and has been an\nimportant research focus in recent years. 
Unintended correlations in the data\nenable models to easily solve tasks that were meant to exhibit advanced\nlanguage understanding and reasoning capabilities. In this survey paper, we\nfocus on the field of machine reading comprehension (MRC), an important task\nfor showcasing high-level language understanding that also suffers from a range\nof shortcuts. We summarize the available techniques for measuring and\nmitigating shortcuts and conclude with suggestions for further progress in\nshortcut research. Importantly, we highlight two concerns for shortcut\nmitigation in MRC: (1) the lack of public challenge sets, a necessary component\nfor effective and reusable evaluation, and (2) the lack of certain mitigation\ntechniques that are prominent in other areas.\n","authors":["Xanh Ho","Johannes Mario Meissner","Saku Sugawara","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2209.01824v2.pdf","comment":"18 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.02403v2","updated":"2023-09-06T04:02:55Z","published":"2023-09-05T17:33:59Z","title":"Substitution-based Semantic Change Detection using Contextual Embeddings","summary":" Measuring semantic change has thus far remained a task where methods using\ncontextual embeddings have struggled to improve upon simpler techniques relying\nonly on static word vectors. Moreover, many of the previously proposed\napproaches suffer from downsides related to scalability and ease of\ninterpretation. We present a simplified approach to measuring semantic change\nusing contextual embeddings, relying only on the most probable substitutes for\nmasked terms. Not only is this approach directly interpretable, it is also far\nmore efficient in terms of storage, achieves superior average performance\nacross the most frequently cited datasets for this task, and allows for more\nnuanced investigation of change than is possible with static word vectors.\n","authors":["Dallas Card"],"pdf_url":"https://arxiv.org/pdf/2309.02403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02691v1","updated":"2023-09-06T03:54:57Z","published":"2023-09-06T03:54:57Z","title":"A Joint Study of Phrase Grounding and Task Performance in Vision and\n Language Models","summary":" Key to tasks that require reasoning about natural language in visual contexts\nis grounding words and phrases to image regions. However, observing this\ngrounding in contemporary models is complex, even if it is generally expected\nto take place if the task is addressed in a way that is conductive to\ngeneralization. We propose a framework to jointly study task performance and\nphrase grounding, and propose three benchmarks to study the relation between\nthe two. Our results show that contemporary models demonstrate inconsistency\nbetween their ability to ground phrases and solve tasks. We show how this can\nbe addressed through brute-force training on ground phrasing annotations, and\nanalyze the dynamics it creates. Code and at available at\nhttps://github.com/lil-lab/phrase_grounding.\n","authors":["Noriyuki Kojima","Hadar Averbuch-Elor","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2309.02691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00526v3","updated":"2023-09-06T03:30:14Z","published":"2023-06-01T10:28:12Z","title":"Layout and Task Aware Instruction Prompt for Zero-shot Document Image\n Question Answering","summary":" The pre-training-fine-tuning paradigm based on layout-aware multimodal\npre-trained models has achieved significant progress on document image question\nanswering. 
However, domain pre-training and task fine-tuning for additional\nvisual, layout, and task modules prevent them from directly utilizing\noff-the-shelf instruction-tuning language foundation models, which have\nrecently shown promising potential in zero-shot learning. Contrary to aligning\nlanguage models to the domain of document image question answering, we align\ndocument image question answering to off-the-shell instruction-tuning language\nfoundation models to utilize their zero-shot capability. Specifically, we\npropose layout and task aware instruction prompt called LATIN-Prompt, which\nconsists of layout-aware document content and task-aware descriptions. The\nformer recovers the layout information among text segments from OCR tools by\nappropriate spaces and line breaks. The latter ensures that the model generates\nanswers that meet the requirements, especially format requirements, through a\ndetailed description of task. Experimental results on three benchmarks show\nthat LATIN-Prompt can improve the zero-shot performance of instruction-tuning\nlanguage foundation models on document image question answering and help them\nachieve comparable levels to SOTAs based on the pre-training-fine-tuning\nparadigm. Quantitative analysis and qualitative analysis demonstrate the\neffectiveness of LATIN-Prompt. We provide the code in supplementary and will\nrelease the code to facilitate future research.\n","authors":["Wenjin Wang","Yunhao Li","Yixin Ou","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.00526v3.pdf","comment":"Add the LATIN-Tuning for Alapca. Code is available at\n https://github.com/WenjinW/LATIN-Prompt"},{"id":"http://arxiv.org/abs/2309.02654v1","updated":"2023-09-06T01:57:36Z","published":"2023-09-06T01:57:36Z","title":"Zero-Resource Hallucination Prevention for Large Language Models","summary":" The prevalent use of large language models (LLMs) in various domains has\ndrawn attention to the issue of \"hallucination,\" which refers to instances\nwhere LLMs generate factually inaccurate or ungrounded information. Existing\ntechniques for hallucination detection in language assistants rely on intricate\nfuzzy, specific free-language-based chain of thought (CoT) techniques or\nparameter-based methods that suffer from interpretability issues. Additionally,\nthe methods that identify hallucinations post-generation could not prevent\ntheir occurrence and suffer from inconsistent performance due to the influence\nof the instruction format and model style. In this paper, we introduce a novel\npre-detection self-evaluation technique, referred to as {\\method}, which\nfocuses on evaluating the model's familiarity with the concepts present in the\ninput instruction and withholding the generation of response in case of\nunfamiliar concepts. This approach emulates the human ability to refrain from\nresponding to unfamiliar topics, thus reducing hallucinations. We validate\n{\\method} across four different large language models, demonstrating\nconsistently superior performance compared to existing techniques. 
Our findings\npropose a significant shift towards preemptive strategies for hallucination\nmitigation in LLM assistants, promising improvements in reliability,\napplicability, and interpretability.\n","authors":["Junyu Luo","Cao Xiao","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2309.02654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02640v1","updated":"2023-09-06T00:59:27Z","published":"2023-09-06T00:59:27Z","title":"Epi-Curriculum: Episodic Curriculum Learning for Low-Resource Domain\n Adaptation in Neural Machine Translation","summary":" Neural Machine Translation (NMT) models have become successful, but their\nperformance remains poor when translating on new domains with a limited number\nof data. In this paper, we present a novel approach Epi-Curriculum to address\nlow-resource domain adaptation (DA), which contains a new episodic training\nframework along with denoised curriculum learning. Our episodic training\nframework enhances the model's robustness to domain shift by episodically\nexposing the encoder/decoder to an inexperienced decoder/encoder. The denoised\ncurriculum learning filters the noised data and further improves the model's\nadaptability by gradually guiding the learning process from easy to more\ndifficult tasks. Experiments on English-German and English-Romanian translation\nshow that: (i) Epi-Curriculum improves both model's robustness and adaptability\nin seen and unseen domains; (ii) Our episodic training framework enhances the\nencoder and decoder's robustness to domain shift.\n","authors":["Keyu Chen","Di Zhuang","Mingchen Li","J. Morris Chang"],"pdf_url":"https://arxiv.org/pdf/2309.02640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10410v2","updated":"2023-09-06T00:03:11Z","published":"2023-08-21T01:32:45Z","title":"Large Language Models on Wikipedia-Style Survey Generation: an\n Evaluation in NLP Concepts","summary":" Large Language Models (LLMs) have achieved significant success across various\nnatural language processing (NLP) tasks, encompassing question-answering,\nsummarization, and machine translation, among others. While LLMs excel in\ngeneral tasks, their efficacy in domain-specific applications remains under\nexploration. Additionally, LLM-generated text sometimes exhibits issues like\nhallucination and disinformation. In this study, we assess LLMs' capability of\nproducing concise survey articles within the computer science-NLP domain,\nfocusing on 20 chosen topics. Automated evaluations indicate that GPT-4\noutperforms GPT-3.5 when benchmarked against the ground truth. Furthermore,\nfour human evaluators provide insights from six perspectives across four model\nconfigurations. Through case studies, we demonstrate that while GPT often\nyields commendable results, there are instances of shortcomings, such as\nincomplete information and the exhibition of lapses in factual accuracy.\n","authors":["Fan Gao","Hang Jiang","Moritz Blum","Jinghui Lu","Dairui Liu","Yuang Jiang","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2308.10410v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02110v2","updated":"2023-09-06T23:23:49Z","published":"2023-09-05T10:38:53Z","title":"Wordle: A Microcosm of Life. Luck, Skill, Cheating, Loyalty, and\n Influence!","summary":" Wordle is a popular, online word game offered by the New York Times\n(nytimes.com). Currently there are some 2 million players of the English\nversion worldwide. 
Players have 6 attempts to guess the daily word (target\nword) and after each attempt, the player receives color-coded information about\nthe correctness and position of each letter in the guess. After either a\nsuccessful completion of the puzzle or the final unsuccessful attempt, software\ncan assess the player's luck and skill using Information Theory and can display\ndata for the first, second, ..., sixth guesses of a random sample of all\nplayers. Recently, I discovered that the latter data is presented in a format\nthat can easily be copied and pasted into a spreadsheet. I compiled data on\nWordle players' first guesses from May 2023 - August 2023 and inferred some\ninteresting information about Wordle players. A) Every day, about 0.2-0.5% of\nplayers solve the puzzle in one attempt. Because the odds of guessing the one\nof 2,315 possible target words at random is 0.043%, this implies that 4,000 -\n10,000 players cheat by obtaining the target word outside of playing the game!\nB) At least 1/3 of the players have a favorite starting word, or cycle through\nseveral. And even though players should be aware that target words are never\nrepeated, most players appear to remain loyal to their starting word even after\nits appearance as a target word. C) On August 15, 2023, about 30,000 players\nabruptly changed their starting word, presumably based on a crossword puzzle\nclue! Wordle players can be influenced! This study goes beyond social media\npostings, surveys, and Google Trends to provide solid, quantitative evidence\nabout cheating in Wordle.\n","authors":["James P. Dilger"],"pdf_url":"https://arxiv.org/pdf/2309.02110v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04014v2","updated":"2023-09-06T23:13:07Z","published":"2023-08-08T03:18:18Z","title":"Continual Pre-Training of Large Language Models: How to (re)warm your\n model?","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to restart the process over again once new data becomes available. A much\ncheaper and more efficient solution would be to enable the continual\npre-training of these models, i.e. updating pre-trained models with new data\ninstead of re-training them from scratch. However, the distribution shift\ninduced by novel data typically results in degraded performance on past data.\nTaking a step towards efficient continual pre-training, in this work, we\nexamine the effect of different warm-up strategies. Our hypothesis is that the\nlearning rate must be re-increased to improve compute efficiency when training\non a new dataset. We study the warmup phase of models pre-trained on the Pile\n(upstream data, 300B tokens) as we continue to pre-train on SlimPajama\n(downstream data, 297B tokens), following a linear warmup and cosine decay\nschedule. We conduct all experiments on the Pythia 410M language model\narchitecture and evaluate performance through validation perplexity. We\nexperiment with different pre-training checkpoints, various maximum learning\nrates, and various warmup lengths. Our results show that while rewarming models\nfirst increases the loss on upstream and downstream data, in the longer run it\nimproves the downstream performance, outperforming models trained from\nscratch$\\unicode{x2013}$even for a large downstream dataset.\n","authors":["Kshitij Gupta","Benjamin Thérien","Adam Ibrahim","Mats L. 
Richter","Quentin Anthony","Eugene Belilovsky","Irina Rish","Timothée Lesort"],"pdf_url":"https://arxiv.org/pdf/2308.04014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03378v1","updated":"2023-09-06T21:56:24Z","published":"2023-09-06T21:56:24Z","title":"RoDia: A New Dataset for Romanian Dialect Identification from Speech","summary":" Dialect identification is a critical task in speech processing and language\ntechnology, enhancing various applications such as speech recognition, speaker\nverification, and many others. While most research studies have been dedicated\nto dialect identification in widely spoken languages, limited attention has\nbeen given to dialect identification in low-resource languages, such as\nRomanian. To address this research gap, we introduce RoDia, the first dataset\nfor Romanian dialect identification from speech. The RoDia dataset includes a\nvaried compilation of speech samples from five distinct regions of Romania,\ncovering both urban and rural environments, totaling 2 hours of manually\nannotated speech data. Along with our dataset, we introduce a set of\ncompetitive models to be used as baselines for future research. The top scoring\nmodel achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%,\nindicating that the task is challenging. We thus believe that RoDia is a\nvaluable resource that will stimulate research aiming to address the challenges\nof Romanian dialect identification. We publicly release our dataset and code at\nhttps://github.com/codrut2/RoDia.\n","authors":["Codrut Rotaru","Nicolae-Catalin Ristea","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2309.03378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12191v4","updated":"2023-09-06T19:58:27Z","published":"2022-01-28T15:45:13Z","title":"Kernelized Concept Erasure","summary":" The representation space of neural models for textual data emerges in an\nunsupervised manner during training. Understanding how those representations\nencode human-interpretable concepts is a fundamental problem. One prominent\napproach for the identification of concepts in neural representations is\nsearching for a linear subspace whose erasure prevents the prediction of the\nconcept from the representations. However, while many linear erasure algorithms\nare tractable and interpretable, neural networks do not necessarily represent\nconcepts in a linear manner. To identify non-linearly encoded concepts, we\npropose a kernelization of a linear minimax game for concept erasure. We\ndemonstrate that it is possible to prevent specific non-linear adversaries from\npredicting the concept. However, the protection does not transfer to different\nnonlinear adversaries. Therefore, exhaustively erasing a non-linearly encoded\nconcept remains an open problem.\n","authors":["Shauli Ravfogel","Francisco Vargas","Yoav Goldberg","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2201.12191v4.pdf","comment":"Accepted as a long paper in EMNLP22"},{"id":"http://arxiv.org/abs/2309.03340v1","updated":"2023-09-06T19:42:52Z","published":"2023-09-06T19:42:52Z","title":"Parameter Efficient Audio Captioning With Faithful Guidance Using\n Audio-text Shared Latent Representation","summary":" There has been significant research on developing pretrained transformer\narchitectures for multimodal-to-text generation tasks. Albeit performance\nimprovements, such models are frequently overparameterized, hence suffer from\nhallucination and large memory footprint making them challenging to deploy on\nedge devices. 
In this paper, we address both these issues for the application\nof automated audio captioning. First, we propose a data augmentation technique\nfor generating hallucinated audio captions and show that similarity based on an\naudio-text shared latent space is suitable for detecting hallucination. Then,\nwe propose a parameter efficient inference time faithful decoding algorithm\nthat enables smaller audio captioning models with performance equivalent to\nlarger models trained with more data. During the beam decoding step, the\nsmaller model utilizes an audio-text shared latent representation to\nsemantically align the generated text with corresponding input audio. Faithful\nguidance is introduced into the beam probability by incorporating the cosine\nsimilarity between latent representation projections of greedy rolled out\nintermediate beams and audio clip. We show the efficacy of our algorithm on\nbenchmark datasets and evaluate the proposed scheme against baselines using\nconventional audio captioning and semantic similarity metrics while\nillustrating tradeoffs between performance and complexity.\n","authors":["Arvind Krishna Sridhar","Yinyi Guo","Erik Visser","Rehana Mahfuz"],"pdf_url":"https://arxiv.org/pdf/2309.03340v1.pdf","comment":"5 pages, 5 tables, 1 figure"},{"id":"http://arxiv.org/abs/2309.00237v2","updated":"2023-09-06T18:11:15Z","published":"2023-09-01T04:01:20Z","title":"Publicly Shareable Clinical Large Language Model Built on Synthetic\n Clinical Notes","summary":" The development of large language models tailored for handling patients'\nclinical notes is often hindered by the limited accessibility and usability of\nthese notes due to strict privacy regulations. To address these challenges, we\nfirst create synthetic large-scale clinical notes using publicly available case\nreports extracted from biomedical literature. We then use these synthetic notes\nto train our specialized clinical large language model, Asclepius. While\nAsclepius is trained on synthetic data, we assess its potential performance in\nreal-world applications by evaluating it using real clinical notes. We\nbenchmark Asclepius against several other large language models, including\nGPT-3.5-turbo and other open-source alternatives. To further validate our\napproach using synthetic notes, we also compare Asclepius with its variants\ntrained on real clinical notes. Our findings convincingly demonstrate that\nsynthetic clinical notes can serve as viable substitutes for real ones when\nconstructing high-performing clinical language models. This conclusion is\nsupported by detailed evaluations conducted by both GPT-4 and medical\nprofessionals. 
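The audio-captioning entry above (arXiv:2309.03340) biases beam scores toward partial captions whose embedding lies close to the audio clip's embedding in a shared latent space. The toy sketch below shows that re-scoring step only; the embedding functions and the weight `alpha` are stand-ins, not the paper's models or values.

```python
# Toy sketch of cosine-similarity "faithful guidance" during beam search.
# embed_text and the audio embedding are placeholders for real encoders.
import numpy as np

def cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))

def rescore_beams(beams, audio_embedding, embed_text, alpha=0.5):
    """beams: list of (partial_caption, log_prob). Re-rank by
    log_prob + alpha * cos(text_embedding, audio_embedding)."""
    rescored = []
    for caption, log_prob in beams:
        faithfulness = cosine(embed_text(caption), audio_embedding)
        rescored.append((caption, log_prob + alpha * faithfulness))
    return sorted(rescored, key=lambda x: x[1], reverse=True)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    fake_audio = rng.normal(size=16)
    fake_embed = lambda text: rng.normal(size=16)   # placeholder text encoder
    beams = [("a dog barks twice", -1.2), ("music plays loudly", -1.0)]
    print(rescore_beams(beams, fake_audio, fake_embed))
```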
All resources including weights, codes, and data used in the\ndevelopment of Asclepius are made publicly accessible for future research.\n","authors":["Sunjun Kweon","Junu Kim","Jiyoun Kim","Sujeong Im","Eunbyeol Cho","Seongsu Bae","Jungwoo Oh","Gyubok Lee","Jong Hak Moon","Seng Chan You","Seungjin Baek","Chang Hoon Han","Yoon Bin Jung","Yohan Jo","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2309.00237v2.pdf","comment":"https://github.com/starmpcc/Asclepius"},{"id":"http://arxiv.org/abs/2309.03241v1","updated":"2023-09-06T06:18:16Z","published":"2023-09-06T06:18:16Z","title":"GPT Can Solve Mathematical Problems Without a Calculator","summary":" Previous studies have typically assumed that large language models are unable\nto accurately perform arithmetic operations, particularly multiplication of >8\ndigits, and operations involving decimals and fractions, without the use of\ncalculator tools. This paper aims to challenge this misconception. With\nsufficient training data, a 2 billion-parameter language model can accurately\nperform multi-digit arithmetic operations with almost 100% accuracy without\ndata leakage, significantly surpassing GPT-4 (whose multi-digit multiplication\naccuracy is only 4.3%). We also demonstrate that our MathGLM, fine-tuned from\nGLM-10B on a dataset with additional multi-step arithmetic operations and math\nproblems described in text, achieves similar performance to GPT-4 on a\n5,000-samples Chinese math problem test set.\n","authors":["Zhen Yang","Ming Ding","Qingsong Lv","Zhihuan Jiang","Zehai He","Yuyi Guo","Jinfeng Bai","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03241v1.pdf","comment":"26pages,14figures"},{"id":"http://arxiv.org/abs/2309.03238v1","updated":"2023-09-06T02:45:42Z","published":"2023-09-06T02:45:42Z","title":"Implicit Design Choices and Their Impact on Emotion Recognition Model\n Development and Evaluation","summary":" Emotion recognition is a complex task due to the inherent subjectivity in\nboth the perception and production of emotions. The subjectivity of emotions\nposes significant challenges in developing accurate and robust computational\nmodels. This thesis examines critical facets of emotion recognition, beginning\nwith the collection of diverse datasets that account for psychological factors\nin emotion production.\n To handle the challenge of non-representative training data, this work\ncollects the Multimodal Stressed Emotion dataset, which introduces controlled\nstressors during data collection to better represent real-world influences on\nemotion production. To address issues with label subjectivity, this research\ncomprehensively analyzes how data augmentation techniques and annotation\nschemes impact emotion perception and annotator labels. It further handles\nnatural confounding variables and variations by employing adversarial networks\nto isolate key factors like stress from learned emotion representations during\nmodel training. For tackling concerns about leakage of sensitive demographic\nvariables, this work leverages adversarial learning to strip sensitive\ndemographic information from multimodal encodings. Additionally, it proposes\noptimized sociological evaluation metrics aligned with cost-effective,\nreal-world needs for model testing.\n This research advances robust, practical emotion recognition through\nmultifaceted studies of challenges in datasets, labels, modeling, demographic\nand membership variable encoding in representations, and evaluation. 
The\ngroundwork has been laid for cost-effective, generalizable emotion recognition\nmodels that are less likely to encode sensitive demographic information.\n","authors":["Mimansa Jaiswal"],"pdf_url":"https://arxiv.org/pdf/2309.03238v1.pdf","comment":"PhD Thesis"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.03198v1","updated":"2023-09-06T17:59:47Z","published":"2023-09-06T17:59:47Z","title":"My Art My Choice: Adversarial Protection Against Unruly AI","summary":" Generative AI is on the rise, enabling everyone to produce realistic content\nvia publicly available interfaces. Especially for guided image generation,\ndiffusion models are changing the creator economy by producing high quality low\ncost content. In parallel, artists are rising against unruly AI, since their\nartwork are leveraged, distributed, and dissimulated by large generative\nmodels. Our approach, My Art My Choice (MAMC), aims to empower content owners\nby protecting their copyrighted materials from being utilized by diffusion\nmodels in an adversarial fashion. MAMC learns to generate adversarially\nperturbed \"protected\" versions of images which can in turn \"break\" diffusion\nmodels. The perturbation amount is decided by the artist to balance distortion\nvs. protection of the content. MAMC is designed with a simple UNet-based\ngenerator, attacking black box diffusion models, combining several losses to\ncreate adversarial twins of the original artwork. We experiment on three\ndatasets for various image-to-image tasks, with different user control values.\nBoth protected image and diffusion output results are evaluated in visual,\nnoise, structure, pixel, and generative spaces to validate our claims. We\nbelieve that MAMC is a crucial step for preserving ownership information for AI\ngenerated content in a flawless, based-on-need, and human-centric way.\n","authors":["Anthony Rhodes","Ram Bhagat","Umur Aybars Ciftci","Ilke Demir"],"pdf_url":"https://arxiv.org/pdf/2309.03198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03185v1","updated":"2023-09-06T17:44:34Z","published":"2023-09-06T17:44:34Z","title":"Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields","summary":" Neural Radiance Fields (NeRFs) have shown promise in applications like view\nsynthesis and depth estimation, but learning from multiview images faces\ninherent uncertainties. Current methods to quantify them are either heuristic\nor computationally demanding. We introduce BayesRays, a post-hoc framework to\nevaluate uncertainty in any pre-trained NeRF without modifying the training\nprocess. Our method establishes a volumetric uncertainty field using spatial\nperturbations and a Bayesian Laplace approximation. We derive our algorithm\nstatistically and show its superior performance in key metrics and\napplications. Additional results available at: https://bayesrays.github.io.\n","authors":["Lily Goli","Cody Reading","Silvia Selllán","Alec Jacobson","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2309.03185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.12587v4","updated":"2023-09-06T17:43:41Z","published":"2023-04-25T05:44:50Z","title":"MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table","summary":" Neural radiance field (NeRF) has shown remarkable performance in generating\nphoto-realistic novel views. 
Among recent NeRF related research, the approaches\nthat involve the utilization of explicit structures like grids to manage\nfeatures achieve exceptionally fast training by reducing the complexity of\nmultilayer perceptron (MLP) networks. However, storing features in dense grids\ndemands a substantial amount of memory space, resulting in a notable memory\nbottleneck within computer system. Consequently, it leads to a significant\nincrease in training times without prior hyper-parameter tuning. To address\nthis issue, in this work, we are the first to propose MF-NeRF, a\nmemory-efficient NeRF framework that employs a Mixed-Feature hash table to\nimprove memory efficiency and reduce training time while maintaining\nreconstruction quality. Specifically, we first design a mixed-feature hash\nencoding to adaptively mix part of multi-level feature grids and map it to a\nsingle hash table. Following that, in order to obtain the correct index of a\ngrid point, we further develop an index transformation method that transforms\nindices of an arbitrary level grid to those of a canonical grid. Extensive\nexperiments benchmarking with state-of-the-art Instant-NGP, TensoRF, and DVGO,\nindicate our MF-NeRF could achieve the fastest training time on the same GPU\nhardware with similar or even higher reconstruction quality.\n","authors":["Yongjae Lee","Li Yang","Deliang Fan"],"pdf_url":"https://arxiv.org/pdf/2304.12587v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03183v1","updated":"2023-09-06T17:42:18Z","published":"2023-09-06T17:42:18Z","title":"3D Transformer based on deformable patch location for differential\n diagnosis between Alzheimer's disease and Frontotemporal dementia","summary":" Alzheimer's disease and Frontotemporal dementia are common types of\nneurodegenerative disorders that present overlapping clinical symptoms, making\ntheir differential diagnosis very challenging. Numerous efforts have been done\nfor the diagnosis of each disease but the problem of multi-class differential\ndiagnosis has not been actively explored. In recent years, transformer-based\nmodels have demonstrated remarkable success in various computer vision tasks.\nHowever, their use in disease diagnostic is uncommon due to the limited amount\nof 3D medical data given the large size of such models. In this paper, we\npresent a novel 3D transformer-based architecture using a deformable patch\nlocation module to improve the differential diagnosis of Alzheimer's disease\nand Frontotemporal dementia. Moreover, to overcome the problem of data\nscarcity, we propose an efficient combination of various data augmentation\ntechniques, adapted for training transformer-based models on 3D structural\nmagnetic resonance imaging data. Finally, we propose to combine our\ntransformer-based model with a traditional machine learning model using brain\nstructure volumes to better exploit the available data. Our experiments\ndemonstrate the effectiveness of the proposed approach, showing competitive\nresults compared to state-of-the-art methods. 
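For background on the MF-NeRF entry above (arXiv:2304.12587), grid-based NeRFs index per-vertex features through a spatial hash table. The sketch below shows a single-level, nearest-vertex lookup with the XOR-of-primes hash commonly used in Instant-NGP-style encodings; the table size, feature dimension, and single-level simplification are assumptions, and this is not MF-NeRF's mixed-feature table or index transformation itself.

```python
# Simplified spatial-hash feature lookup, as used by hash-grid NeRF variants.
# Table size, resolution, and feature dimension are illustrative only.
import numpy as np

PRIMES = (1, 2654435761, 805459861)          # common per-dimension hashing primes

def hash_index(ix, iy, iz, table_size):
    h = (ix * PRIMES[0]) ^ (iy * PRIMES[1]) ^ (iz * PRIMES[2])
    return h % table_size

class HashGrid:
    def __init__(self, table_size=2**14, feat_dim=2, resolution=64, seed=0):
        rng = np.random.default_rng(seed)
        self.table = rng.normal(scale=1e-2, size=(table_size, feat_dim))
        self.table_size = table_size
        self.resolution = resolution

    def lookup(self, xyz):
        """Nearest-vertex feature lookup for points xyz in [0, 1]^3."""
        idx = np.floor(np.asarray(xyz) * self.resolution).astype(np.int64)
        rows = [hash_index(ix, iy, iz, self.table_size) for ix, iy, iz in idx]
        return self.table[rows]

if __name__ == "__main__":
    grid = HashGrid()
    print(grid.lookup([[0.1, 0.2, 0.3], [0.9, 0.9, 0.9]]).shape)  # (2, 2)
```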
Moreover, the deformable patch\nlocations can be visualized, revealing the most relevant brain regions used to\nestablish the diagnosis of each disease.\n","authors":["Huy-Dung Nguyen","Michaël Clément","Boris Mansencal","Pierrick Coupé"],"pdf_url":"https://arxiv.org/pdf/2309.03183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03179v1","updated":"2023-09-06T17:39:05Z","published":"2023-09-06T17:39:05Z","title":"SLiMe: Segment Like Me","summary":" Significant strides have been made using large vision-language models, like\nStable Diffusion (SD), for a variety of downstream tasks, including image\nediting, image correspondence, and 3D shape generation. Inspired by these\nadvancements, we explore leveraging these extensive vision-language models for\nsegmenting images at any desired granularity using as few as one annotated\nsample by proposing SLiMe. SLiMe frames this problem as an optimization task.\nSpecifically, given a single training image and its segmentation mask, we first\nextract attention maps, including our novel \"weighted accumulated\nself-attention map\" from the SD prior. Then, using the extracted attention\nmaps, the text embeddings of Stable Diffusion are optimized such that, each of\nthem, learn about a single segmented region from the training image. These\nlearned embeddings then highlight the segmented region in the attention maps,\nwhich in turn can then be used to derive the segmentation map. This enables\nSLiMe to segment any real-world image during inference with the granularity of\nthe segmented region in the training image, using just one example. Moreover,\nleveraging additional training data when available, i.e. few-shot, improves the\nperformance of SLiMe. We carried out a knowledge-rich set of experiments\nexamining various design factors and showed that SLiMe outperforms other\nexisting one-shot and few-shot segmentation methods.\n","authors":["Aliasghar Khani","Saeid Asgari Taghanaki","Aditya Sanghi","Ali Mahdavi Amiri","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2309.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13093v2","updated":"2023-09-06T17:31:17Z","published":"2023-08-24T21:36:11Z","title":"EgoBlur: Responsible Innovation in Aria","summary":" Project Aria pushes the frontiers of Egocentric AI with large-scale\nreal-world data collection using purposely designed glasses with privacy first\napproach. To protect the privacy of bystanders being recorded by the glasses,\nour research protocols are designed to ensure recorded video is processed by an\nAI anonymization model that removes bystander faces and vehicle license plates.\nDetected face and license plate regions are processed with a Gaussian blur such\nthat these personal identification information (PII) regions are obscured. This\nprocess helps to ensure that anonymized versions of the video is retained for\nresearch purposes. In Project Aria, we have developed a state-of-the-art\nanonymization system EgoBlur. 
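The EgoBlur entry above (arXiv:2308.13093) obscures detected face and license-plate regions with a Gaussian blur. The sketch below shows only that blurring step for already-detected boxes; the box format and kernel size are illustrative, and the detector that produces the boxes is not shown.

```python
# Minimal sketch: Gaussian-blur detected PII regions in a frame.
# Box format (x, y, w, h) and kernel size are illustrative choices.
import cv2
import numpy as np

def blur_regions(image, boxes, ksize=(51, 51)):
    """Blur each (x, y, w, h) region of `image` in place and return it."""
    for x, y, w, h in boxes:
        roi = image[y:y + h, x:x + w]
        image[y:y + h, x:x + w] = cv2.GaussianBlur(roi, ksize, 0)
    return image

if __name__ == "__main__":
    frame = np.random.randint(0, 255, (240, 320, 3), dtype=np.uint8)  # stand-in frame
    out = blur_regions(frame, [(50, 40, 60, 60)])
    print(out.shape)
```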
In this paper, we present extensive analysis of\nEgoBlur on challenging datasets comparing its performance with other\nstate-of-the-art systems from industry and academia including extensive\nResponsible AI analysis on recently released Casual Conversations V2 dataset.\n","authors":["Nikhil Raina","Guruprasad Somasundaram","Kang Zheng","Sagar Miglani","Steve Saarinen","Jeff Meissner","Mark Schwesinger","Luis Pesqueira","Ishita Prasad","Edward Miller","Prince Gupta","Mingfei Yan","Richard Newcombe","Carl Ren","Omkar M Parkhi"],"pdf_url":"https://arxiv.org/pdf/2308.13093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03177v1","updated":"2023-09-06T17:30:26Z","published":"2023-09-06T17:30:26Z","title":"3D Object Positioning Using Differentiable Multimodal Learning","summary":" This article describes a multi-modal method using simulated Lidar data via\nray tracing and image pixel loss with differentiable rendering to optimize an\nobject's position with respect to an observer or some referential objects in a\ncomputer graphics scene. Object position optimization is completed using\ngradient descent with the loss function being influenced by both modalities.\nTypical object placement optimization is done using image pixel loss with\ndifferentiable rendering only, this work shows the use of a second modality\n(Lidar) leads to faster convergence. This method of fusing sensor input\npresents a potential usefulness for autonomous vehicles, as these methods can\nbe used to establish the locations of multiple actors in a scene. This article\nalso presents a method for the simulation of multiple types of data to be used\nin the training of autonomous vehicles.\n","authors":["Sean Zanyk-McLean","Krishna Kumar","Paul Navratil"],"pdf_url":"https://arxiv.org/pdf/2309.03177v1.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.03173v1","updated":"2023-09-06T17:19:29Z","published":"2023-09-06T17:19:29Z","title":"PDiscoNet: Semantically consistent part discovery for fine-grained\n recognition","summary":" Fine-grained classification often requires recognizing specific object parts,\nsuch as beak shape and wing patterns for birds. Encouraging a fine-grained\nclassification model to first detect such parts and then using them to infer\nthe class could help us gauge whether the model is indeed looking at the right\ndetails better than with interpretability methods that provide a single\nattribution map. We propose PDiscoNet to discover object parts by using only\nimage-level class labels along with priors encouraging the parts to be:\ndiscriminative, compact, distinct from each other, equivariant to rigid\ntransforms, and active in at least some of the images. In addition to using the\nappropriate losses to encode these priors, we propose to use part-dropout,\nwhere full part feature vectors are dropped at once to prevent a single part\nfrom dominating in the classification, and part feature vector modulation,\nwhich makes the information coming from each part distinct from the perspective\nof the classifier. Our results on CUB, CelebA, and PartImageNet show that the\nproposed method provides substantially better part discovery performance than\nprevious methods while not requiring any additional hyper-parameter tuning and\nwithout penalizing the classification performance. The code is available at\nhttps://github.com/robertdvdk/part_detection.\n","authors":["Robert van der Klis","Stephan Alaniz","Massimiliano Mancini","Cassio F. 
Dantas","Dino Ienco","Zeynep Akata","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2309.03173v1.pdf","comment":"9 pages, 8 figures, ICCV"},{"id":"http://arxiv.org/abs/2309.03160v1","updated":"2023-09-06T16:59:36Z","published":"2023-09-06T16:59:36Z","title":"ResFields: Residual Neural Fields for Spatiotemporal Signals","summary":" Neural fields, a category of neural networks trained to represent\nhigh-frequency signals, have gained significant attention in recent years due\nto their impressive performance in modeling complex 3D data, especially large\nneural signed distance (SDFs) or radiance fields (NeRFs) via a single\nmulti-layer perceptron (MLP). However, despite the power and simplicity of\nrepresenting signals with an MLP, these methods still face challenges when\nmodeling large and complex temporal signals due to the limited capacity of\nMLPs. In this paper, we propose an effective approach to address this\nlimitation by incorporating temporal residual layers into neural fields, dubbed\nResFields, a novel class of networks specifically designed to effectively\nrepresent complex temporal signals. We conduct a comprehensive analysis of the\nproperties of ResFields and propose a matrix factorization technique to reduce\nthe number of trainable parameters and enhance generalization capabilities.\nImportantly, our formulation seamlessly integrates with existing techniques and\nconsistently improves results across various challenging tasks: 2D video\napproximation, dynamic shape modeling via temporal SDFs, and dynamic NeRF\nreconstruction. Lastly, we demonstrate the practical utility of ResFields by\nshowcasing its effectiveness in capturing dynamic 3D scenes from sparse sensory\ninputs of a lightweight capture system.\n","authors":["Marko Mihajlovic","Sergey Prokudin","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03160v1.pdf","comment":"Project page and code at https://markomih.github.io/ResFields/"},{"id":"http://arxiv.org/abs/2307.02694v2","updated":"2023-09-06T16:53:24Z","published":"2023-07-05T23:53:55Z","title":"Loss Functions and Metrics in Deep Learning","summary":" One of the essential components of deep learning is the choice of the loss\nfunction and performance metrics used to train and evaluate models. This paper\nreviews the most prevalent loss functions and performance measurements in deep\nlearning. We examine the benefits and limits of each technique and illustrate\ntheir application to various deep-learning problems. Our review aims to give a\ncomprehensive picture of the different loss functions and performance\nindicators used in the most common deep learning tasks and help practitioners\nchoose the best method for their specific task.\n","authors":["Juan Terven","Diana M. Cordova-Esparza","Alfonso Ramirez-Pedraza","Edgar A. Chavez-Urbiola"],"pdf_url":"https://arxiv.org/pdf/2307.02694v2.pdf","comment":"53 pages, 5 figures, 7 tables, 86 equations"},{"id":"http://arxiv.org/abs/2211.13398v2","updated":"2023-09-06T16:47:31Z","published":"2022-11-24T03:27:00Z","title":"CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote\n Aggregation","summary":" Object pose estimation constitutes a critical area within the domain of 3D\nvision. While contemporary state-of-the-art methods that leverage real-world\npose annotations have demonstrated commendable performance, the procurement of\nsuch real-world training data incurs substantial costs. 
This paper focuses on a\nspecific setting wherein only 3D CAD models are utilized as a priori knowledge,\ndevoid of any background or clutter information. We introduce a novel method,\nCPPF++, designed for sim-to-real pose estimation. This method builds upon the\nfoundational point-pair voting scheme of CPPF, reconceptualizing it through a\nprobabilistic lens. To address the challenge of voting collision, we model\nvoting uncertainty by estimating the probabilistic distribution of each point\npair within the canonical space. This approach is further augmented by\niterative noise filtering, employed to eradicate votes associated with\nbackgrounds or clutters. Additionally, we enhance the context provided by each\nvoting unit by introducing $N$-point tuples. In conjunction with this\nmethodological contribution, we present a new category-level pose estimation\ndataset, DiversePose 300. This dataset is specifically crafted to facilitate a\nmore rigorous evaluation of current state-of-the-art methods, encompassing a\nbroader and more challenging array of real-world scenarios. Empirical results\nsubstantiate the efficacy of our proposed method, revealing a significant\nreduction in the disparity between simulation and real-world performance.\n","authors":["Yang You","Wenhao He","Jin Liu","Hongkai Xiong","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2211.13398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04512v3","updated":"2023-09-06T16:05:50Z","published":"2023-04-10T11:05:20Z","title":"Defense-Prefix for Preventing Typographic Attacks on CLIP","summary":" Vision-language pre-training models (VLPs) have exhibited revolutionary\nimprovements in various vision-language tasks. In VLP, some adversarial attacks\nfool a model into false or absurd classifications. Previous studies addressed\nthese attacks by fine-tuning the model or changing its architecture. However,\nthese methods risk losing the original model's performance and are difficult to\napply to downstream tasks. In particular, their applicability to other tasks\nhas not been considered. In this study, we addressed the reduction of the\nimpact of typographic attacks on CLIP without changing the model parameters. To\nachieve this, we expand the idea of \"prefix learning\" and introduce our simple\nyet effective method: Defense-Prefix (DP), which inserts the DP token before a\nclass name to make words \"robust\" against typographic attacks. Our method can\nbe easily applied to downstream tasks, such as object detection, because the\nproposed method is independent of the model parameters. Our method\nsignificantly improves the accuracy of classification tasks for typographic\nattack datasets, while maintaining the zero-shot capabilities of the model. In\naddition, we leverage our proposed method for object detection, demonstrating\nits high applicability and effectiveness. 
The codes and datasets are available\nat https://github.com/azuma164/Defense-Prefix.\n","authors":["Hiroki Azuma","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2304.04512v3.pdf","comment":"ICCV2023 Workshop"},{"id":"http://arxiv.org/abs/2309.03113v1","updated":"2023-09-06T15:52:55Z","published":"2023-09-06T15:52:55Z","title":"Detecting Manufacturing Defects in PCBs via Data-Centric Machine\n Learning on Solder Paste Inspection Features","summary":" Automated detection of defects in Printed Circuit Board (PCB) manufacturing\nusing Solder Paste Inspection (SPI) and Automated Optical Inspection (AOI)\nmachines can help improve operational efficiency and significantly reduce the\nneed for manual intervention. In this paper, using SPI-extracted features of 6\nmillion pins, we demonstrate a data-centric approach to train Machine Learning\n(ML) models to detect PCB defects at three stages of PCB manufacturing. The 6\nmillion PCB pins correspond to 2 million components that belong to 15,387 PCBs.\nUsing a base extreme gradient boosting (XGBoost) ML model, we iterate on the\ndata pre-processing step to improve detection performance. Combining pin-level\nSPI features using component and PCB IDs, we developed training instances also\nat the component and PCB level. This allows the ML model to capture any\ninter-pin, inter-component, or spatial effects that may not be apparent at the\npin level. Models are trained at the pin, component, and PCB levels, and the\ndetection results from the different models are combined to identify defective\ncomponents.\n","authors":["Jubilee Prasad-Rao","Roohollah Heidary","Jesse Williams"],"pdf_url":"https://arxiv.org/pdf/2309.03113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03110v1","updated":"2023-09-06T15:47:33Z","published":"2023-09-06T15:47:33Z","title":"Do We Still Need Non-Maximum Suppression? Accurate Confidence Estimates\n and Implicit Duplication Modeling with IoU-Aware Calibration","summary":" Object detectors are at the heart of many semi- and fully autonomous decision\nsystems and are poised to become even more indispensable. They are, however,\nstill lacking in accessibility and can sometimes produce unreliable\npredictions. Especially concerning in this regard are the -- essentially\nhand-crafted -- non-maximum suppression algorithms that lead to an obfuscated\nprediction process and biased confidence estimates. We show that we can\neliminate classic NMS-style post-processing by using IoU-aware calibration.\nIoU-aware calibration is a conditional Beta calibration; this makes it\nparallelizable with no hyper-parameters. Instead of arbitrary cutoffs or\ndiscounts, it implicitly accounts for the likelihood of each detection being a\nduplicate and adjusts the confidence score accordingly, resulting in\nempirically based precision estimates for each detection. Our extensive\nexperiments on diverse detection architectures show that the proposed IoU-aware\ncalibration can successfully model duplicate detections and improve\ncalibration. Compared to the standard sequential NMS and calibration approach,\nour joint modeling can deliver performance gains over the best NMS-based\nalternative while producing consistently better-calibrated confidence\npredictions with less complexity. 
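The PCB defect-detection entry above (arXiv:2309.03113) builds component- and PCB-level training instances by combining pin-level SPI features through component and PCB IDs. The sketch below shows that roll-up with a pandas groupby; the column names and chosen aggregations are illustrative, not the paper's feature set.

```python
# Sketch: aggregate pin-level SPI features into component-level instances.
# Column names and aggregation functions are placeholders.
import pandas as pd

pins = pd.DataFrame({
    "pcb_id":       [1, 1, 1, 1],
    "component_id": ["C1", "C1", "C2", "C2"],
    "paste_volume": [98.5, 101.2, 65.0, 130.4],
    "offset_x":     [0.01, -0.02, 0.15, 0.12],
})

# Summary statistics over each component's pins, so a model can pick up
# inter-pin effects that are invisible at the pin level.
components = pins.groupby(["pcb_id", "component_id"]).agg(
    paste_volume_mean=("paste_volume", "mean"),
    paste_volume_std=("paste_volume", "std"),
    offset_x_max=("offset_x", "max"),
    n_pins=("paste_volume", "size"),
).reset_index()

print(components)
```

The same pattern applied with `pcb_id` alone would yield the PCB-level instances the abstract mentions.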
The\n\\hyperlink{https://github.com/Blueblue4/IoU-AwareCalibration}{code} for all our\nexperiments is publicly available.\n","authors":["Johannes Gilg","Torben Teepe","Fabian Herzog","Philipp Wolters","Gerhard Rigoll"],"pdf_url":"https://arxiv.org/pdf/2309.03110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03100v1","updated":"2023-09-06T15:40:33Z","published":"2023-09-06T15:40:33Z","title":"FArMARe: a Furniture-Aware Multi-task methodology for Recommending\n Apartments based on the user interests","summary":" Nowadays, many people frequently have to search for new accommodation\noptions. Searching for a suitable apartment is a time-consuming process,\nespecially because visiting them is often mandatory to assess the truthfulness\nof the advertisements found on the Web. While this process could be alleviated\nby visiting the apartments in the metaverse, the Web-based recommendation\nplatforms are not suitable for the task. To address this shortcoming, in this\npaper, we define a new problem called text-to-apartment recommendation, which\nrequires ranking the apartments based on their relevance to a textual query\nexpressing the user's interests. To tackle this problem, we introduce FArMARe,\na multi-task approach that supports cross-modal contrastive training with a\nfurniture-aware objective. Since public datasets related to indoor scenes do\nnot contain detailed descriptions of the furniture, we collect and annotate a\ndataset comprising more than 6000 apartments. A thorough experimentation with\nthree different methods and two raw feature extraction procedures reveals the\neffectiveness of FArMARe in dealing with the problem at hand.\n","authors":["Ali Abdari","Alex Falcon","Giuseppe Serra"],"pdf_url":"https://arxiv.org/pdf/2309.03100v1.pdf","comment":"accepted for presentation at the ICCV2023 CV4Metaverse workshop"},{"id":"http://arxiv.org/abs/2309.03072v1","updated":"2023-09-06T15:19:04Z","published":"2023-09-06T15:19:04Z","title":"Character Queries: A Transformer-based Approach to On-Line Handwritten\n Character Segmentation","summary":" On-line handwritten character segmentation is often associated with\nhandwriting recognition and even though recognition models include mechanisms\nto locate relevant positions during the recognition process, it is typically\ninsufficient to produce a precise segmentation. Decoupling the segmentation\nfrom the recognition unlocks the potential to further utilize the result of the\nrecognition. We specifically focus on the scenario where the transcription is\nknown beforehand, in which case the character segmentation becomes an\nassignment problem between sampling points of the stylus trajectory and\ncharacters in the text. Inspired by the $k$-means clustering algorithm, we view\nit from the perspective of cluster assignment and present a Transformer-based\narchitecture where each cluster is formed based on a learned character query in\nthe Transformer decoder block. In order to assess the quality of our approach,\nwe create character segmentation ground truths for two popular on-line\nhandwriting datasets, IAM-OnDB and HANDS-VNOnDB, and evaluate multiple methods\non them, demonstrating that our approach achieves the overall best results.\n","authors":["Michael Jungo","Beat Wolf","Andrii Maksai","Claudiu Musat","Andreas Fischer"],"pdf_url":"https://arxiv.org/pdf/2309.03072v1.pdf","comment":"ICDAR 2023 Best Student Paper Award. 
Code available at\n https://github.com/jungomi/character-queries"},{"id":"http://arxiv.org/abs/2203.14806v2","updated":"2023-09-06T15:13:07Z","published":"2022-03-28T14:44:52Z","title":"Extraction of Visual Information to Predict Crowdfunding Success","summary":" Researchers have increasingly turned to crowdfunding platforms to gain\ninsights into entrepreneurial activity and dynamics. While previous studies\nhave explored various factors influencing crowdfunding success, such as\ntechnology, communication, and marketing strategies, the role of visual\nelements that can be automatically extracted from images has received less\nattention. This is surprising, considering that crowdfunding platforms\nemphasize the importance of attention-grabbing and high-resolution images, and\nprevious research has shown that image characteristics can significantly impact\nproduct evaluations. Indeed, a comprehensive review of empirical articles (n =\n202) that utilized Kickstarter data, focusing on the incorporation of visual\ninformation in their analyses. Our findings reveal that only 29.70% controlled\nfor the number of images, and less than 12% considered any image details. In\nthis manuscript, we review the literature on image processing and its relevance\nto the business domain, highlighting two types of visual variables: visual\ncounts (number of pictures and number of videos) and image details. Building\nupon previous work that discussed the role of color, composition and\nfigure-ground relationships, we introduce visual scene elements that have not\nyet been explored in crowdfunding, including the number of faces, the number of\nconcepts depicted, and the ease of identifying those concepts. To demonstrate\nthe predictive value of visual counts and image details, we analyze Kickstarter\ndata. Our results highlight that visual count features are two of the top three\npredictors of success. Our results also show that simple image detail features\nsuch as color matter a lot, and our proposed measures of visual scene elements\ncan also be useful. We supplement our article with R and Python codes that help\nauthors extract image details (https://osf.io/ujnzp/).\n","authors":["S. J. Blanchard","T. J. Noseworthy","E. Pancer","M. Poole"],"pdf_url":"https://arxiv.org/pdf/2203.14806v2.pdf","comment":"32 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.03064v1","updated":"2023-09-06T15:07:23Z","published":"2023-09-06T15:07:23Z","title":"A Multimodal Analysis of Influencer Content on Twitter","summary":" Influencer marketing involves a wide range of strategies in which brands\ncollaborate with popular content creators (i.e., influencers) to leverage their\nreach, trust, and impact on their audience to promote and endorse products or\nservices. Because followers of influencers are more likely to buy a product\nafter receiving an authentic product endorsement rather than an explicit direct\nproduct promotion, the line between personal opinions and commercial content\npromotion is frequently blurred. This makes automatic detection of regulatory\ncompliance breaches related to influencer advertising (e.g., misleading\nadvertising or hidden sponsorships) particularly difficult. 
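The crowdfunding entry above (arXiv:2203.14806) lists the number of faces as one of the visual scene elements it extracts from campaign images. As a generic illustration of such a count (not the authors' released code, which is linked at their OSF page), the sketch below uses an off-the-shelf OpenCV Haar cascade; the detector choice and image path are placeholders.

```python
# Generic face-count illustration with OpenCV; detector and path are placeholders.
import cv2

def count_faces(image_path):
    cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    image = cv2.imread(image_path)          # image_path is a placeholder
    if image is None:
        raise FileNotFoundError(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    return len(faces)

if __name__ == "__main__":
    print(count_faces("campaign_cover.jpg"))
```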
In this work, we\n(1) introduce a new Twitter (now X) dataset consisting of 15,998 influencer\nposts mapped into commercial and non-commercial categories for assisting in the\nautomatic detection of commercial influencer content; (2) experiment with an\nextensive set of predictive models that combine text and visual information\nshowing that our proposed cross-attention approach outperforms state-of-the-art\nmultimodal models; and (3) conduct a thorough analysis of strengths and\nlimitations of our models. We show that multimodal modeling is useful for\nidentifying commercial posts, reducing the number of false positives, and\ncapturing relevant context that aids in the discovery of undisclosed commercial\nposts.\n","authors":["Danae Sánchez Villegas","Catalina Goanta","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2309.03064v1.pdf","comment":"Accepted at AACL 2023"},{"id":"http://arxiv.org/abs/2309.03063v1","updated":"2023-09-06T15:05:04Z","published":"2023-09-06T15:05:04Z","title":"Prompt-based All-in-One Image Restoration using CNNs and Transformer","summary":" Image restoration aims to recover high-quality images from their degraded\nobservations. Since most existing methods have been dedicated to single\ndegradation removal, they may not yield optimal results on other types of\ndegradations, which limits their applicability in real-world scenarios. In\nthis paper, we propose a novel data ingredient-oriented approach that leverages\nprompt-based learning to enable a single model to efficiently tackle multiple\nimage degradation tasks. Specifically, we utilize an encoder to capture features\nand introduce prompts with degradation-specific information to guide the\ndecoder in adaptively recovering images affected by various degradations. In\norder to model the local invariant properties and non-local information for\nhigh-quality image restoration, we combine CNN operations and Transformers.\nSimultaneously, we make several key design choices in the Transformer blocks\n(multi-head rearranged attention with prompts and simple-gate feed-forward\nnetwork) to reduce computational requirements and selectively determine what\ninformation should be preserved to facilitate efficient recovery of\npotentially sharp images. Furthermore, we incorporate a feature fusion\nmechanism that further explores multi-scale information to improve the\naggregated features. Although the resulting tightly interlinked hierarchical architecture,\nnamed CAPTNet, is designed to handle different types of\ndegradations, extensive experiments demonstrate that it performs\ncompetitively with task-specific algorithms.\n","authors":["Hu Gao","Jing Yang","Ning Wang","Jingfan Yang","Ying Zhang","Depeng Dang"],"pdf_url":"https://arxiv.org/pdf/2309.03063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03049v1","updated":"2023-09-06T14:43:58Z","published":"2023-09-06T14:43:58Z","title":"Adaptive Growth: Real-time CNN Layer Expansion","summary":" Deep Neural Networks (DNNs) have shown unparalleled achievements in numerous\napplications, reflecting their proficiency in managing vast data sets. Yet,\ntheir static structure limits their adaptability in ever-changing environments.\nThis research presents a new algorithm that allows the convolutional layer of a\nConvolutional Neural Network (CNN) to dynamically evolve based on data input,\nwhile still being seamlessly integrated into existing DNNs. 
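The influencer-content entry above (arXiv:2309.03064) reports that a cross-attention model combining text and visual features works best for spotting commercial posts. The sketch below is a generic text-attends-to-image cross-attention block, not the paper's architecture; the dimensions, pooling, and two-way label space are assumptions.

```python
# Generic cross-attention fusion of text tokens and image patches.
# Dimensions and the mean-pooling head are illustrative choices.
import torch
import torch.nn as nn

class TextImageCrossAttention(nn.Module):
    def __init__(self, dim=256, num_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.classifier = nn.Linear(dim, 2)   # commercial vs. non-commercial

    def forward(self, text_tokens, image_tokens):
        # Text tokens attend over image tokens; pool and classify.
        fused, _ = self.attn(query=text_tokens, key=image_tokens, value=image_tokens)
        return self.classifier(fused.mean(dim=1))

if __name__ == "__main__":
    text = torch.randn(8, 32, 256)    # (batch, text tokens, dim)
    image = torch.randn(8, 49, 256)   # (batch, image patches, dim)
    print(TextImageCrossAttention()(text, image).shape)  # torch.Size([8, 2])
```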
Instead of a rigid\narchitecture, our approach iteratively introduces kernels to the convolutional\nlayer, gauging its real-time response to varying data. This process is refined\nby evaluating the layer's capacity to discern image features, guiding its\ngrowth. Remarkably, our unsupervised method has outstripped its supervised\ncounterparts across diverse datasets like MNIST, Fashion-MNIST, CIFAR-10, and\nCIFAR-100. It also showcases enhanced adaptability in transfer learning\nscenarios. By introducing a data-driven model scalability strategy, we are\nfilling a void in deep learning, leading to more flexible and efficient DNNs\nsuited for dynamic settings.\nCode:(https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version).\n","authors":["Yunjie Zhu","Yunhao Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03049v1.pdf","comment":"Code:\n https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version"},{"id":"http://arxiv.org/abs/2309.03048v1","updated":"2023-09-06T14:43:22Z","published":"2023-09-06T14:43:22Z","title":"Exploring Semantic Consistency in Unpaired Image Translation to Generate\n Data for Surgical Applications","summary":" In surgical computer vision applications, obtaining labeled training data is\nchallenging due to data-privacy concerns and the need for expert annotation.\nUnpaired image-to-image translation techniques have been explored to\nautomatically generate large annotated datasets by translating synthetic images\nto the realistic domain. However, preserving the structure and semantic\nconsistency between the input and translated images presents significant\nchallenges, mainly when there is a distributional mismatch in the semantic\ncharacteristics of the domains. This study empirically investigates unpaired\nimage translation methods for generating suitable data in surgical\napplications, explicitly focusing on semantic consistency. We extensively\nevaluate various state-of-the-art image translation models on two challenging\nsurgical datasets and downstream semantic segmentation tasks. We find that a\nsimple combination of structural-similarity loss and contrastive learning\nyields the most promising results. Quantitatively, we show that the data\ngenerated with this approach yields higher semantic consistency and can be used\nmore effectively as training data.\n","authors":["Danush Kumar Venkatesh","Dominik Rivior","Micha Pfeiffer","Fiona Kolbinger","Marius Distler","Jürgen Weitz","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2309.03048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03047v1","updated":"2023-09-06T14:41:55Z","published":"2023-09-06T14:41:55Z","title":"Combining pre-trained Vision Transformers and CIDER for Out Of Domain\n Detection","summary":" Out-of-domain (OOD) detection is a crucial component in industrial\napplications as it helps identify when a model encounters inputs that are\noutside the training distribution. Most industrial pipelines rely on\npre-trained models for downstream tasks such as CNN or Vision Transformers.\nThis paper investigates the performance of those models on the task of\nout-of-domain detection. Our experiments demonstrate that pre-trained\ntransformers models achieve higher detection performance out of the box.\nFurthermore, we show that pre-trained ViT and CNNs can be combined with\nrefinement methods such as CIDER to improve their OOD detection performance\neven more. 
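The Adaptive Growth entry above (arXiv:2309.03049) adds kernels to a convolutional layer at run time based on the data it sees. The sketch below shows only the mechanical part of such growth in PyTorch, appending output channels while keeping the learned kernels; the growth criterion, which is the substance of the paper, is not modeled here.

```python
# Minimal sketch: widen a Conv2d by `extra` output channels, preserving weights.
# When to trigger this growth is the paper's contribution and is not shown.
import torch
import torch.nn as nn

def grow_conv_out_channels(conv: nn.Conv2d, extra: int) -> nn.Conv2d:
    new_conv = nn.Conv2d(conv.in_channels, conv.out_channels + extra,
                         conv.kernel_size, conv.stride, conv.padding,
                         bias=conv.bias is not None)
    with torch.no_grad():
        new_conv.weight[:conv.out_channels] = conv.weight   # keep learned kernels
        if conv.bias is not None:
            new_conv.bias[:conv.out_channels] = conv.bias
    return new_conv

if __name__ == "__main__":
    conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
    conv = grow_conv_out_channels(conv, extra=4)
    print(conv(torch.randn(1, 3, 32, 32)).shape)  # torch.Size([1, 12, 32, 32])
```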
Our results suggest that transformers are a promising approach for\nOOD detection and set a stronger baseline for this task in many contexts\n","authors":["Grégor Jouet","Clément Duhart","Francis Rousseaux","Julio Laborde","Cyril de Runz"],"pdf_url":"https://arxiv.org/pdf/2309.03047v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10293v2","updated":"2023-09-06T14:21:45Z","published":"2023-05-17T15:27:35Z","title":"Infinite Class Mixup","summary":" Mixup is a widely adopted strategy for training deep networks, where\nadditional samples are augmented by interpolating inputs and labels of training\npairs. Mixup has shown to improve classification performance, network\ncalibration, and out-of-distribution generalisation. While effective, a\ncornerstone of Mixup, namely that networks learn linear behaviour patterns\nbetween classes, is only indirectly enforced since the output interpolation is\nperformed at the probability level. This paper seeks to address this limitation\nby mixing the classifiers directly instead of mixing the labels for each mixed\npair. We propose to define the target of each augmented sample as a uniquely\nnew classifier, whose parameters are a linear interpolation of the classifier\nvectors of the input pair. The space of all possible classifiers is continuous\nand spans all interpolations between classifier pairs. To make optimisation\ntractable, we propose a dual-contrastive Infinite Class Mixup loss, where we\ncontrast the classifier of a mixed pair to both the classifiers and the\npredicted outputs of other mixed pairs in a batch. Infinite Class Mixup is\ngeneric in nature and applies to many variants of Mixup. Empirically, we show\nthat it outperforms standard Mixup and variants such as RegMixup and Remix on\nbalanced, long-tailed, and data-constrained benchmarks, highlighting its broad\napplicability.\n","authors":["Thomas Mensink","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2305.10293v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2309.03031v1","updated":"2023-09-06T14:17:49Z","published":"2023-09-06T14:17:49Z","title":"MCM: Multi-condition Motion Synthesis Framework for Multi-scenario","summary":" The objective of the multi-condition human motion synthesis task is to\nincorporate diverse conditional inputs, encompassing various forms like text,\nmusic, speech, and more. This endows the task with the capability to adapt\nacross multiple scenarios, ranging from text-to-motion and music-to-dance,\namong others. While existing research has primarily focused on single\nconditions, the multi-condition human motion generation remains underexplored.\nIn this paper, we address these challenges by introducing MCM, a novel paradigm\nfor motion synthesis that spans multiple scenarios under diverse conditions.\nThe MCM framework is able to integrate with any DDPM-like diffusion model to\naccommodate multi-conditional information input while preserving its generative\ncapabilities. Specifically, MCM employs two-branch architecture consisting of a\nmain branch and a control branch. The control branch shares the same structure\nas the main branch and is initialized with the parameters of the main branch,\neffectively maintaining the generation ability of the main branch and\nsupporting multi-condition input. We also introduce a Transformer-based\ndiffusion model MWNet (DDPM-like) as our main branch that can capture the\nspatial complexity and inter-joint correlations in motion sequences through a\nchannel-dimension self-attention module. 
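The Infinite Class Mixup entry above (arXiv:2305.10293) mixes the classifier vectors of the two classes in a pair rather than mixing their labels. The sketch below shows only that construction, scoring each mixed embedding against its per-sample interpolated classifier; the paper's dual-contrastive loss over a batch of such pairs is omitted, and the dimensions are placeholders.

```python
# Sketch: score embeddings against per-sample mixed classifiers
# w_mix = lam * w_{y_a} + (1 - lam) * w_{y_b}. Loss formulation omitted.
import torch

def mixed_classifier_logit(embeddings, class_weights, y_a, y_b, lam):
    w_mix = lam.unsqueeze(1) * class_weights[y_a] + (1 - lam).unsqueeze(1) * class_weights[y_b]
    return (embeddings * w_mix).sum(dim=1)   # one logit per mixed pair

if __name__ == "__main__":
    torch.manual_seed(0)
    emb = torch.randn(4, 128)                 # embeddings of mixed inputs
    W = torch.randn(10, 128)                  # one classifier vector per class
    y_a, y_b = torch.tensor([0, 1, 2, 3]), torch.tensor([5, 6, 7, 8])
    lam = torch.rand(4)
    print(mixed_classifier_logit(emb, W, y_a, y_b, lam))
```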
Quantitative comparisons demonstrate\nthat our approach achieves SoTA results in both text-to-motion and competitive\nresults in music-to-dance tasks, comparable to task-specific methods.\nFurthermore, the qualitative evaluation shows that MCM not only streamlines the\nadaptation of methodologies originally designed for text-to-motion tasks to\ndomains like music-to-dance and speech-to-gesture, eliminating the need for\nextensive network re-configurations but also enables effective multi-condition\nmodal control, realizing \"once trained is motion need\".\n","authors":["Zeyu Ling","Bo Han","Yongkang Wong","Mohan Kangkanhalli","Weidong Geng"],"pdf_url":"https://arxiv.org/pdf/2309.03031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08049v2","updated":"2023-09-06T14:13:54Z","published":"2022-11-15T11:01:12Z","title":"Forecasting Future Instance Segmentation with Learned Optical Flow and\n Warping","summary":" For an autonomous vehicle it is essential to observe the ongoing dynamics of\na scene and consequently predict imminent future scenarios to ensure safety to\nitself and others. This can be done using different sensors and modalities. In\nthis paper we investigate the usage of optical flow for predicting future\nsemantic segmentations. To do so we propose a model that forecasts flow fields\nautoregressively. Such predictions are then used to guide the inference of a\nlearned warping function that moves instance segmentations on to future frames.\nResults on the Cityscapes dataset demonstrate the effectiveness of optical-flow\nmethods.\n","authors":["Andrea Ciamarra","Federico Becattini","Lorenzo Seidenari","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2211.08049v2.pdf","comment":"Paper published as Poster at ICIAP21"},{"id":"http://arxiv.org/abs/2309.03020v1","updated":"2023-09-06T14:02:55Z","published":"2023-09-06T14:02:55Z","title":"SEAL: A Framework for Systematic Evaluation of Real-World\n Super-Resolution","summary":" Real-world Super-Resolution (real-SR) methods focus on dealing with diverse\nreal-world images and have attracted increasing attention in recent years. The\nkey idea is to use a complex and high-order degradation model to mimic\nreal-world degradations. Although they have achieved impressive results in\nvarious scenarios, they are faced with the obstacle of evaluation. Currently,\nthese methods are only assessed by their average performance on a small set of\ndegradation cases randomly selected from a large space, which fails to provide\na comprehensive understanding of their overall performance and often yields\nbiased results. To overcome the limitation in evaluation, we propose SEAL, a\nframework for systematic evaluation of real-SR. In particular, we cluster the\nextensive degradation space to create a set of representative degradation\ncases, which serves as a comprehensive test set. Next, we propose a\ncoarse-to-fine evaluation protocol to measure the distributed and relative\nperformance of real-SR methods on the test set. The protocol incorporates two\nnew metrics: acceptance rate (AR) and relative performance ratio (RPR), derived\nfrom an acceptance line and an excellence line. Under SEAL, we benchmark\nexisting real-SR methods, obtain new observations and insights into their\nperformance, and develop a new strong baseline. 
We consider SEAL as the first\nstep towards creating an unbiased and comprehensive evaluation platform, which\ncan promote the development of real-SR.\n","authors":["Wenlong Zhang","Xiaohui Li","Xiangyu Chen","Yu Qiao","Xiao-Ming Wu","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.03020v1.pdf","comment":"The source code is available at https://github.com/XPixelGroup/SEAL"},{"id":"http://arxiv.org/abs/2309.03008v1","updated":"2023-09-06T13:54:31Z","published":"2023-09-06T13:54:31Z","title":"Sparse 3D Reconstruction via Object-Centric Ray Sampling","summary":" We propose a novel method for 3D object reconstruction from a sparse set of\nviews captured from a 360-degree calibrated camera rig. We represent the object\nsurface through a hybrid model that uses both an MLP-based neural\nrepresentation and a triangle mesh. A key contribution in our work is a novel\nobject-centric sampling scheme of the neural representation, where rays are\nshared among all views. This efficiently concentrates and reduces the number of\nsamples used to update the neural model at each iteration. This sampling scheme\nrelies on the mesh representation to ensure also that samples are\nwell-distributed along its normals. The rendering is then performed efficiently\nby a differentiable renderer. We demonstrate that this sampling scheme results\nin a more effective training of the neural representation, does not require the\nadditional supervision of segmentation masks, yields state of the art 3D\nreconstructions, and works with sparse views on the Google's Scanned Objects,\nTank and Temples and MVMC Car datasets.\n","authors":["Llukman Cerkezi","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2309.03008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02999v1","updated":"2023-09-06T13:43:27Z","published":"2023-09-06T13:43:27Z","title":"Vote2Cap-DETR++: Decoupling Localization and Describing for End-to-End\n 3D Dense Captioning","summary":" 3D dense captioning requires a model to translate its understanding of an\ninput 3D scene into several captions associated with different object regions.\nExisting methods adopt a sophisticated \"detect-then-describe\" pipeline, which\nbuilds explicit relation modules upon a 3D detector with numerous hand-crafted\ncomponents. While these methods have achieved initial success, the cascade\npipeline tends to accumulate errors because of duplicated and inaccurate box\nestimations and messy 3D scenes. In this paper, we first propose Vote2Cap-DETR,\na simple-yet-effective transformer framework that decouples the decoding\nprocess of caption generation and object localization through parallel\ndecoding. Moreover, we argue that object localization and description\ngeneration require different levels of scene understanding, which could be\nchallenging for a shared set of queries to capture. To this end, we propose an\nadvanced version, Vote2Cap-DETR++, which decouples the queries into\nlocalization and caption queries to capture task-specific features.\nAdditionally, we introduce the iterative spatial refinement strategy to vote\nqueries for faster convergence and better localization performance. We also\ninsert additional spatial information to the caption head for more accurate\ndescriptions. Without bells and whistles, extensive experiments on two commonly\nused datasets, ScanRefer and Nr3D, demonstrate Vote2Cap-DETR and\nVote2Cap-DETR++ surpass conventional \"detect-then-describe\" methods by a large\nmargin. 
Codes will be made available at\nhttps://github.com/ch3cook-fdu/Vote2Cap-DETR.\n","authors":["Sijin Chen","Hongyuan Zhu","Mingsheng Li","Xin Chen","Peng Guo","Yinjie Lei","Gang Yu","Taihao Li","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09060v3","updated":"2023-09-06T13:42:28Z","published":"2022-09-19T14:50:48Z","title":"Deep Metric Learning with Chance Constraints","summary":" Deep metric learning (DML) aims to minimize empirical expected loss of the\npairwise intra-/inter- class proximity violations in the embedding space. We\nrelate DML to feasibility problem of finite chance constraints. We show that\nminimizer of proxy-based DML satisfies certain chance constraints, and that the\nworst case generalization performance of the proxy-based methods can be\ncharacterized by the radius of the smallest ball around a class proxy to cover\nthe entire domain of the corresponding class samples, suggesting multiple\nproxies per class helps performance. To provide a scalable algorithm as well as\nexploiting more proxies, we consider the chance constraints implied by the\nminimizers of proxy-based DML instances and reformulate DML as finding a\nfeasible point in intersection of such constraints, resulting in a problem to\nbe approximately solved by iterative projections. Simply put, we repeatedly\ntrain a regularized proxy-based loss and re-initialize the proxies with the\nembeddings of the deliberately selected new samples. We applied our method with\n4 well-accepted DML losses and show the effectiveness with extensive\nevaluations on 4 popular DML benchmarks. Code is available at:\nhttps://github.com/yetigurbuz/ccp-dml\n","authors":["Yeti Z. Gurbuz","Ogul Can","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2209.09060v3.pdf","comment":"Accepted as a conference paper at IEEE/CVF Winter Conference on\n Applications of Computer Vision (WACV) 2024"},{"id":"http://arxiv.org/abs/2308.10743v2","updated":"2023-09-06T13:39:40Z","published":"2023-08-21T14:16:36Z","title":"Enhancing Adversarial Attacks: The Similar Target Method","summary":" Deep neural networks are vulnerable to adversarial examples, posing a threat\nto the models' applications and raising security concerns. An intriguing\nproperty of adversarial examples is their strong transferability. Several\nmethods have been proposed to enhance transferability, including ensemble\nattacks which have demonstrated their efficacy. However, prior approaches\nsimply average logits, probabilities, or losses for model ensembling, lacking a\ncomprehensive analysis of how and why model ensembling significantly improves\ntransferability. In this paper, we propose a similar targeted attack method\nnamed Similar Target~(ST). By promoting cosine similarity between the gradients\nof each model, our method regularizes the optimization direction to\nsimultaneously attack all surrogate models. This strategy has been proven to\nenhance generalization ability. Experimental results on ImageNet validate the\neffectiveness of our approach in improving adversarial transferability. 
Our\nmethod outperforms state-of-the-art attackers on 18 discriminative classifiers\nand adversarially trained models.\n","authors":["Shuo Zhang","Ziruo Wang","Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02995v1","updated":"2023-09-06T13:36:59Z","published":"2023-09-06T13:36:59Z","title":"Continual Evidential Deep Learning for Out-of-Distribution Detection","summary":" Uncertainty-based deep learning models have attracted a great deal of\ninterest for their ability to provide accurate and reliable predictions.\nEvidential deep learning stands out achieving remarkable performance in\ndetecting out-of-distribution (OOD) data with a single deterministic neural\nnetwork. Motivated by this fact, in this paper we propose the integration of an\nevidential deep learning method into a continual learning framework in order to\nperform simultaneously incremental object classification and OOD detection.\nMoreover, we analyze the ability of vacuity and dissonance to differentiate\nbetween in-distribution data belonging to old classes and OOD data. The\nproposed method, called CEDL, is evaluated on CIFAR-100 considering two\nsettings consisting of 5 and 10 tasks, respectively. From the obtained results,\nwe could appreciate that the proposed method, in addition to provide comparable\nresults in object classification with respect to the baseline, largely\noutperforms OOD detection compared to several posthoc methods on three\nevaluation metrics: AUROC, AUPR and FPR95.\n","authors":["Eduardo Aguilar","Bogdan Raducanu","Petia Radeva","Joost Van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2309.02995v1.pdf","comment":"Accepted at Visual Continual Learning workshop (ICCV2023)"},{"id":"http://arxiv.org/abs/2309.02975v1","updated":"2023-09-06T13:16:41Z","published":"2023-09-06T13:16:41Z","title":"FishMOT: A Simple and Effective Method for Fish Tracking Based on IoU\n Matching","summary":" The tracking of various fish species plays a profoundly significant role in\nunderstanding the behavior of individual fish and their groups. Present\ntracking methods suffer from issues of low accuracy or poor robustness. In\norder to address these concerns, this paper proposes a novel tracking approach,\nnamed FishMOT (Fish Multiple Object Tracking). This method combines object\ndetection techniques with the IoU matching algorithm, thereby achieving\nefficient, precise, and robust fish detection and tracking. Diverging from\nother approaches, this method eliminates the need for multiple feature\nextractions and identity assignments for each individual, instead directly\nutilizing the output results of the detector for tracking, thereby\nsignificantly reducing computational time and storage space. Furthermore, this\nmethod imposes minimal requirements on factors such as video quality and\nvariations in individual appearance. As long as the detector can accurately\nlocate and identify fish, effective tracking can be achieved. This approach\nenhances robustness and generalizability. Moreover, the algorithm employed in\nthis method addresses the issue of missed detections without relying on complex\nfeature matching or graph optimization algorithms. This contributes to improved\naccuracy and reliability. Experimental trials were conducted in the open-source\nvideo dataset provided by idtracker.ai, and comparisons were made with\nstate-of-the-art detector-based multi-object tracking methods. 
Additionally,\ncomparisons were made with idtracker.ai and TRex, two tools that demonstrate\nexceptional performance in the field of animal tracking. The experimental\nresults demonstrate that the proposed method outperforms other approaches in\nvarious evaluation metrics, exhibiting faster speed and lower memory\nrequirements. The source codes and pre-trained models are available at:\nhttps://github.com/gakkistar/FishMOT\n","authors":["Shuo Liu","Lulu Han","Xiaoyang Liu","Junli Ren","Fang Wang","Yuanshan Lin"],"pdf_url":"https://arxiv.org/pdf/2309.02975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01860v2","updated":"2023-09-06T13:06:45Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v2.pdf","comment":"This version has some errors. Our schedule is packed, so we don't\n have enough time to correct it. We will share another work when we have time\n to fix this"},{"id":"http://arxiv.org/abs/2309.02965v1","updated":"2023-09-06T13:00:10Z","published":"2023-09-06T13:00:10Z","title":"Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction","summary":" Reconstructing both objects and hands in 3D from a single RGB image is\ncomplex. Existing methods rely on manually defined hand-object constraints in\nEuclidean space, leading to suboptimal feature learning. Compared with\nEuclidean space, hyperbolic space better preserves the geometric properties of\nmeshes thanks to its exponentially-growing space distance, which amplifies the\ndifferences between the features based on similarity. In this work, we propose\nthe first precise hand-object reconstruction method in hyperbolic space, namely\nDynamic Hyperbolic Attention Network (DHANet), which leverages intrinsic\nproperties of hyperbolic space to learn representative features. Our method\nthat projects mesh and image features into a unified hyperbolic space includes\ntwo modules, ie. dynamic hyperbolic graph convolution and image-attention\nhyperbolic graph convolution. With these two modules, our method learns mesh\nfeatures with rich geometry-image multi-modal information and models better\nhand-object interaction. Our method provides a promising alternative for fine\nhand-object reconstruction in hyperbolic space. 
Extensive experiments on three\npublic datasets demonstrate that our method outperforms most state-of-the-art\nmethods.\n","authors":["Zhiying Leng","Shun-Cheng Wu","Mahdi Saleh","Antonio Montanaro","Hao Yu","Yin Wang","Nassir Navab","Xiaohui Liang","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2309.02965v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02964v1","updated":"2023-09-06T12:59:52Z","published":"2023-09-06T12:59:52Z","title":"Hierarchical-level rain image generative model based on GAN","summary":" Autonomous vehicles are exposed to various weather during operation, which is\nlikely to trigger the performance limitations of the perception system, leading\nto the safety of the intended functionality (SOTIF) problems. To efficiently\ngenerate data for testing the performance of visual perception algorithms under\nvarious weather conditions, a hierarchical-level rain image generative model,\nrain conditional CycleGAN (RCCycleGAN), is constructed. RCCycleGAN is based on\nthe generative adversarial network (GAN) and can generate images of light,\nmedium, and heavy rain. Different rain intensities are introduced as labels in\nconditional GAN (CGAN). Meanwhile, the model structure is optimized and the\ntraining strategy is adjusted to alleviate the problem of mode collapse. In\naddition, natural rain images of different intensities are collected and\nprocessed for model training and validation. Compared with the two baseline\nmodels, CycleGAN and DerainCycleGAN, the peak signal-to-noise ratio (PSNR) of\nRCCycleGAN on the test dataset is improved by 2.58 dB and 0.74 dB, and the\nstructural similarity (SSIM) is improved by 18% and 8%, respectively. The\nablation experiments are also carried out to validate the effectiveness of the\nmodel tuning.\n","authors":["Zhenyuan Liu","Tong Jia","Xingyu Xing","Jianfeng Wu","Junyi Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02961v1","updated":"2023-09-06T12:57:00Z","published":"2023-09-06T12:57:00Z","title":"Indoor Localization Using Radio, Vision and Audio Sensors: Real-Life\n Data Validation and Discussion","summary":" This paper investigates indoor localization methods using radio, vision, and\naudio sensors, respectively, in the same environment. The evaluation is based\non state-of-the-art algorithms and uses a real-life dataset. More specifically,\nwe evaluate a machine learning algorithm for radio-based localization with\nmassive MIMO technology, an ORB-SLAM3 algorithm for vision-based localization\nwith an RGB-D camera, and an SFS2 algorithm for audio-based localization with\nmicrophone arrays. Aspects including localization accuracy, reliability,\ncalibration requirements, and potential system complexity are discussed to\nanalyze the advantages and limitations of using different sensors for indoor\nlocalization tasks. 
The results can serve as a guideline and basis for further\ndevelopment of robust and high-precision multi-sensory localization systems,\ne.g., through sensor fusion and context and environment-aware adaptation.\n","authors":["Ilayda Yaman","Guoda Tian","Erik Tegler","Patrik Persson","Nikhil Challa","Fredrik Tufvesson","Ove Edfors","Kalle Astrom","Steffen Malkowsky","Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02961v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.02959v1","updated":"2023-09-06T12:55:59Z","published":"2023-09-06T12:55:59Z","title":"A Non-Invasive Interpretable NAFLD Diagnostic Method Combining TCM\n Tongue Features","summary":" Non-alcoholic fatty liver disease (NAFLD) is a clinicopathological syndrome\ncharacterized by hepatic steatosis resulting from the exclusion of alcohol and\nother identifiable liver-damaging factors. It has emerged as a leading cause of\nchronic liver disease worldwide. Currently, the conventional methods for NAFLD\ndetection are expensive and not suitable for users to perform daily\ndiagnostics. To address this issue, this study proposes a non-invasive and\ninterpretable NAFLD diagnostic method, the required user-provided indicators\nare only Gender, Age, Height, Weight, Waist Circumference, Hip Circumference,\nand tongue image. This method involves merging patients' physiological\nindicators with tongue features, which are then input into a fusion network\nnamed SelectorNet. SelectorNet combines attention mechanisms with feature\nselection mechanisms, enabling it to autonomously learn the ability to select\nimportant features. The experimental results show that the proposed method\nachieves an accuracy of 77.22\\% using only non-invasive data, and it also\nprovides compelling interpretability matrices. This study contributes to the\nearly diagnosis of NAFLD and the intelligent advancement of TCM tongue\ndiagnosis. The project in this paper is available at:\nhttps://github.com/cshan-github/SelectorNet.\n","authors":["Shan Cao","Qunsheng Ruan","Qingfeng Wu"],"pdf_url":"https://arxiv.org/pdf/2309.02959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02954v1","updated":"2023-09-06T12:43:18Z","published":"2023-09-06T12:43:18Z","title":"M3D-NCA: Robust 3D Segmentation with Built-in Quality Control","summary":" Medical image segmentation relies heavily on large-scale deep learning\nmodels, such as UNet-based architectures. However, the real-world utility of\nsuch models is limited by their high computational requirements, which makes\nthem impractical for resource-constrained environments such as primary care\nfacilities and conflict zones. Furthermore, shifts in the imaging domain can\nrender these models ineffective and even compromise patient safety if such\nerrors go undetected. To address these challenges, we propose M3D-NCA, a novel\nmethodology that leverages Neural Cellular Automata (NCA) segmentation for 3D\nmedical images using n-level patchification. Moreover, we exploit the variance\nin M3D-NCA to develop a novel quality metric which can automatically detect\nerrors in the segmentation process of NCAs. M3D-NCA outperforms the two\nmagnitudes larger UNet models in hippocampus and prostate segmentation by 2%\nDice and can be run on a Raspberry Pi 4 Model B (2GB RAM). 
This highlights the\npotential of M3D-NCA as an effective and efficient alternative for medical\nimage segmentation in resource-constrained environments.\n","authors":["John Kalkhof","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2309.02954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15692v2","updated":"2023-09-06T12:22:24Z","published":"2022-11-28T19:00:02Z","title":"H3WB: Human3.6M 3D WholeBody Dataset and Benchmark","summary":" We present a benchmark for 3D human whole-body pose estimation, which\ninvolves identifying accurate 3D keypoints on the entire human body, including\nface, hands, body, and feet. Currently, the lack of a fully annotated and\naccurate 3D whole-body dataset results in deep networks being trained\nseparately on specific body parts, which are combined during inference. Or they\nrely on pseudo-groundtruth provided by parametric body models which are not as\naccurate as detection based methods. To overcome these issues, we introduce the\nHuman3.6M 3D WholeBody (H3WB) dataset, which provides whole-body annotations\nfor the Human3.6M dataset using the COCO Wholebody layout. H3WB comprises 133\nwhole-body keypoint annotations on 100K images, made possible by our new\nmulti-view pipeline. We also propose three tasks: i) 3D whole-body pose lifting\nfrom 2D complete whole-body pose, ii) 3D whole-body pose lifting from 2D\nincomplete whole-body pose, and iii) 3D whole-body pose estimation from a\nsingle RGB image. Additionally, we report several baselines from popular\nmethods for these tasks. Furthermore, we also provide automated 3D whole-body\nannotations of TotalCapture and experimentally show that when used with H3WB it\nhelps to improve the performance. Code and dataset are available at\nhttps://github.com/wholebody3d/wholebody3d\n","authors":["Yue Zhu","Nermin Samet","David Picard"],"pdf_url":"https://arxiv.org/pdf/2211.15692v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2301.09164v2","updated":"2023-09-06T12:15:06Z","published":"2023-01-22T17:12:58Z","title":"Unifying Synergies between Self-supervised Learning and Dynamic\n Computation","summary":" Computationally expensive training strategies make self-supervised learning\n(SSL) impractical for resource constrained industrial settings. Techniques like\nknowledge distillation (KD), dynamic computation (DC), and pruning are often\nused to obtain a lightweight model, which usually involves multiple epochs of\nfine-tuning (or distilling steps) of a large pre-trained model, making it more\ncomputationally challenging. In this work we present a novel perspective on the\ninterplay between SSL and DC paradigms. In particular, we show that it is\nfeasible to simultaneously learn a dense and gated sub-network from scratch in\na SSL setting without any additional fine-tuning or pruning steps. The\nco-evolution during pre-training of both dense and gated encoder offers a good\naccuracy-efficiency trade-off and therefore yields a generic and multi-purpose\narchitecture for application specific industrial settings. 
Extensive\nexperiments on several image classification benchmarks including CIFAR-10/100,\nSTL-10 and ImageNet-100, demonstrate that the proposed training strategy\nprovides a dense and corresponding gated sub-network that achieves on-par\nperformance compared with the vanilla self-supervised setting, but at a\nsignificant reduction in computation in terms of FLOPs, under a range of target\nbudgets (td ).\n","authors":["Tarun Krishna","Ayush K Rai","Alexandru Drimbarean","Eric Arazo","Paul Albert","Alan F Smeaton","Kevin McGuinness","Noel E O'Connor"],"pdf_url":"https://arxiv.org/pdf/2301.09164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12452v2","updated":"2023-09-06T12:00:04Z","published":"2023-08-23T22:22:20Z","title":"ARF-Plus: Controlling Perceptual Factors in Artistic Radiance Fields for\n 3D Scene Stylization","summary":" The radiance fields style transfer is an emerging field that has recently\ngained popularity as a means of 3D scene stylization, thanks to the outstanding\nperformance of neural radiance fields in 3D reconstruction and view synthesis.\nWe highlight a research gap in radiance fields style transfer, the lack of\nsufficient perceptual controllability, motivated by the existing concept in the\n2D image style transfer. In this paper, we present ARF-Plus, a 3D neural style\ntransfer framework offering manageable control over perceptual factors, to\nsystematically explore the perceptual controllability in 3D scene stylization.\nFour distinct types of controls - color preservation control, (style pattern)\nscale control, spatial (selective stylization area) control, and depth\nenhancement control - are proposed and integrated into this framework. Results\nfrom real-world datasets, both quantitative and qualitative, show that the four\ntypes of controls in our ARF-Plus framework successfully accomplish their\ncorresponding perceptual controls when stylizing 3D scenes. These techniques\nwork well for individual style inputs as well as for the simultaneous\napplication of multiple styles within a scene. This unlocks a realm of\nlimitless possibilities, allowing customized modifications of stylization\neffects and flexible merging of the strengths of different styles, ultimately\nenabling the creation of novel and eye-catching stylistic effects on 3D scenes.\n","authors":["Wenzhao Li","Tianhao Wu","Fangcheng Zhong","Cengiz Oztireli"],"pdf_url":"https://arxiv.org/pdf/2308.12452v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02923v1","updated":"2023-09-06T11:33:25Z","published":"2023-09-06T11:33:25Z","title":"Patched Line Segment Learning for Vector Road Mapping","summary":" This paper presents a novel approach to computing vector road maps from\nsatellite remotely sensed images, building upon a well-defined Patched Line\nSegment (PaLiS) representation for road graphs that holds geometric\nsignificance. Unlike prevailing methods that derive road vector representations\nfrom satellite images using binary masks or keypoints, our method employs line\nsegments. These segments not only convey road locations but also capture their\norientations, making them a robust choice for representation. More precisely,\ngiven an input image, we divide it into non-overlapping patches and predict a\nsuitable line segment within each patch. This strategy enables us to capture\nspatial and structural cues from these patch-based line segments, simplifying\nthe process of constructing the road network graph without the necessity of\nadditional neural networks for connectivity. 
In our experiments, we demonstrate\nhow an effective representation of a road graph significantly enhances the\nperformance of vector road mapping on established benchmarks, without requiring\nextensive modifications to the neural network architecture. Furthermore, our\nmethod achieves state-of-the-art performance with just 6 GPU hours of training,\nleading to a substantial 32-fold reduction in training costs in terms of GPU\nhours.\n","authors":["Jiakun Xu","Bowen Xu","Gui-Song Xia","Liang Dong","Nan Xue"],"pdf_url":"https://arxiv.org/pdf/2309.02923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11126v3","updated":"2023-09-06T11:09:26Z","published":"2023-03-20T14:04:40Z","title":"Robustifying Token Attention for Vision Transformers","summary":" Despite the success of vision transformers (ViTs), they still suffer from\nsignificant drops in accuracy in the presence of common corruptions, such as\nnoise or blur. Interestingly, we observe that the attention mechanism of ViTs\ntends to rely on few important tokens, a phenomenon we call token overfocusing.\nMore critically, these tokens are not robust to corruptions, often leading to\nhighly diverging attention patterns. In this paper, we intend to alleviate this\noverfocusing issue and make attention more stable through two general\ntechniques: First, our Token-aware Average Pooling (TAP) module encourages the\nlocal neighborhood of each token to take part in the attention mechanism.\nSpecifically, TAP learns average pooling schemes for each token such that the\ninformation of potentially important tokens in the neighborhood can adaptively\nbe taken into account. Second, we force the output tokens to aggregate\ninformation from a diverse set of input tokens rather than focusing on just a\nfew by using our Attention Diversification Loss (ADL). We achieve this by\npenalizing high cosine similarity between the attention vectors of different\ntokens. In experiments, we apply our methods to a wide range of transformer\narchitectures and improve robustness significantly. For example, we improve\ncorruption robustness on ImageNet-C by 2.4% while improving accuracy by 0.4%\nbased on state-of-the-art robust architecture FAN. Also, when fine-tuning on\nsemantic segmentation tasks, we improve robustness on CityScapes-C by 2.4% and\nACDC by 3.0%. Our code is available at https://github.com/guoyongcs/TAPADL.\n","authors":["Yong Guo","David Stutz","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2303.11126v3.pdf","comment":"To appear in ICCV 2023"},{"id":"http://arxiv.org/abs/2309.02903v1","updated":"2023-09-06T10:52:57Z","published":"2023-09-06T10:52:57Z","title":"Towards Efficient Training with Negative Samples in Visual Tracking","summary":" Current state-of-the-art (SOTA) methods in visual object tracking often\nrequire extensive computational resources and vast amounts of training data,\nleading to a risk of overfitting. This study introduces a more efficient\ntraining strategy to mitigate overfitting and reduce computational\nrequirements. We balance the training process with a mix of negative and\npositive samples from the outset, named as Joint learning with Negative samples\n(JN). Negative samples refer to scenarios where the object from the template is\nnot present in the search region, which helps to prevent the model from simply\nmemorizing the target, and instead encourages it to use the template for object\nlocation. 
To handle the negative samples effectively, we adopt a\ndistribution-based head, which models the bounding box as a distribution of\ndistances to express uncertainty about the target's location in the presence of\nnegative samples, offering an efficient way to manage the mixed sample\ntraining. Furthermore, our approach introduces a target-indicating token. It\nencapsulates the target's precise location within the template image. This\nmethod provides exact boundary details with negligible computational cost while\nimproving performance. Our model, JN-256, exhibits superior performance on\nchallenging benchmarks, achieving 75.8% AO on GOT-10k and 84.1% AUC on\nTrackingNet. Notably, JN-256 outperforms previous SOTA trackers that utilize\nlarger models and higher input resolutions, even though it is trained with only\nhalf the number of data samples used in those works.\n","authors":["Qingmao Wei","Bi Zeng","Guotian Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.02903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02898v1","updated":"2023-09-06T10:41:30Z","published":"2023-09-06T10:41:30Z","title":"A Unified Framework for Discovering Discrete Symmetries","summary":" We consider the problem of learning a function respecting a symmetry from\namong a class of symmetries. We develop a unified framework that enables\nsymmetry discovery across a broad range of subgroups including locally\nsymmetric, dihedral and cyclic subgroups. At the core of the framework is a\nnovel architecture composed of linear and tensor-valued functions that\nexpresses functions invariant to these subgroups in a principled manner. The\nstructure of the architecture enables us to leverage multi-armed bandit\nalgorithms and gradient descent to efficiently optimize over the linear and the\ntensor-valued functions, respectively, and to infer the symmetry that is\nultimately learnt. We also discuss the necessity of the tensor-valued functions\nin the architecture. Experiments on image-digit sum and polynomial regression\ntasks demonstrate the effectiveness of our approach.\n","authors":["Pavan Karjol","Rohan Kashyap","Aditya Gopalan","Prathosh A. P"],"pdf_url":"https://arxiv.org/pdf/2309.02898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01479v2","updated":"2023-09-06T10:23:59Z","published":"2023-09-04T09:34:33Z","title":"Parameter and Computation Efficient Transfer Learning for\n Vision-Language Pre-trained Models","summary":" With ever increasing parameters and computation, vision-language pre-trained\n(VLP) models exhibit prohibitive expenditure in downstream task adaption.\nRecent endeavors mainly focus on parameter efficient transfer learning (PETL)\nfor VLP models by only updating a small number of parameters. However,\nexcessive computational overhead still plagues the application of VLPs. In this\npaper, we aim at parameter and computation efficient transfer learning (PCETL)\nfor VLP models. In particular, PCETL not only needs to limit the number of\ntrainable parameters in VLP models, but also to reduce the computational\nredundancy during inference, thus enabling a more efficient transfer. To\napproach this target, we propose a novel dynamic architecture skipping (DAS)\napproach towards effective PCETL. 
Instead of directly optimizing the intrinsic\narchitectures of VLP models, DAS first observes the significances of their\nmodules to downstream tasks via a reinforcement learning (RL) based process,\nand then skips the redundant ones with lightweight networks, i.e., adapters,\naccording to the obtained rewards. In this case, the VLP model can well\nmaintain the scale of trainable parameters while speeding up its inference on\ndownstream tasks. To validate DAS, we apply it to two representative VLP\nmodels, namely ViLT and METER, and conduct extensive experiments on a bunch of\nVL tasks. The experimental results not only show the great advantages of DAS in\nreducing computational complexity, e.g. -11.97% FLOPs of METER on VQA2.0, but\nalso confirm its competitiveness against existing PETL methods in terms of\nparameter scale and performance. Our source code is given in our appendix.\n","authors":["Qiong Wu","Wei Yu","Yiyi Zhou","Shubin Huang","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2309.01479v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02875v1","updated":"2023-09-06T09:59:58Z","published":"2023-09-06T09:59:58Z","title":"MAD: Modality Agnostic Distance Measure for Image Registration","summary":" Multi-modal image registration is a crucial pre-processing step in many\nmedical applications. However, it is a challenging task due to the complex\nintensity relationships between different imaging modalities, which can result\nin large discrepancy in image appearance. The success of multi-modal image\nregistration, whether it is conventional or learning based, is predicated upon\nthe choice of an appropriate distance (or similarity) measure. Particularly,\ndeep learning registration algorithms lack in accuracy or even fail completely\nwhen attempting to register data from an \"unseen\" modality. In this work, we\npresent Modality Agnostic Distance (MAD), a deep image distance measure that\nutilises random convolutions to learn the inherent geometry of the images while\nbeing robust to large appearance changes. Random convolutions are\ngeometry-preserving modules which we use to simulate an infinite number of\nsynthetic modalities alleviating the need for aligned paired data during\ntraining. We can therefore train MAD on a mono-modal dataset and successfully\napply it to a multi-modal dataset. We demonstrate that not only can MAD\naffinely register multi-modal images successfully, but it has also a larger\ncapture range than traditional measures such as Mutual Information and\nNormalised Gradient Fields.\n","authors":["Vasiliki Sideri-Lampretsa","Veronika A. Zimmer","Huaqi Qiu","Georgios Kaissis","Daniel Rueckert"],"pdf_url":"https://arxiv.org/pdf/2309.02875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02861v1","updated":"2023-09-06T09:42:16Z","published":"2023-09-06T09:42:16Z","title":"Image Aesthetics Assessment via Learnable Queries","summary":" Image aesthetics assessment (IAA) aims to estimate the aesthetics of images.\nDepending on the content of an image, diverse criteria need to be selected to\nassess its aesthetics. Existing works utilize pre-trained vision backbones\nbased on content knowledge to learn image aesthetics. However, training those\nbackbones is time-consuming and suffers from attention dispersion. Inspired by\nlearnable queries in vision-language alignment, we propose the Image Aesthetics\nAssessment via Learnable Queries (IAA-LQ) approach. 
It adapts learnable queries\nto extract aesthetic features from pre-trained image features obtained from a\nfrozen image encoder. Extensive experiments on real-world data demonstrate the\nadvantages of IAA-LQ, beating the best state-of-the-art method by 2.2% and 2.1%\nin terms of SRCC and PLCC, respectively.\n","authors":["Zhiwei Xiong","Yunfan Zhang","Zhiqi Shen","Peiran Ren","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2309.02861v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.01740v2","updated":"2023-09-06T09:34:53Z","published":"2023-09-04T17:58:01Z","title":"An Empirical Analysis for Zero-Shot Multi-Label Classification on\n COVID-19 CT Scans and Uncurated Reports","summary":" The pandemic resulted in vast repositories of unstructured data, including\nradiology reports, due to increased medical examinations. Previous research on\nautomated diagnosis of COVID-19 primarily focuses on X-ray images, despite\ntheir lower precision compared to computed tomography (CT) scans. In this work,\nwe leverage unstructured data from a hospital and harness the fine-grained\ndetails offered by CT scans to perform zero-shot multi-label classification\nbased on contrastive visual language learning. In collaboration with human\nexperts, we investigate the effectiveness of multiple zero-shot models that aid\nradiologists in detecting pulmonary embolisms and identifying intricate lung\ndetails like ground glass opacities and consolidations. Our empirical analysis\nprovides an overview of the possible solutions to target such fine-grained\ntasks, so far overlooked in the medical multimodal pretraining literature. Our\ninvestigation promises future advancements in the medical image analysis\ncommunity by addressing some challenges associated with unstructured data and\nfine-grained multi-label classification.\n","authors":["Ethan Dack","Lorenzo Brigato","Matthew McMurray","Matthias Fontanellaz","Thomas Frauenfelder","Hanno Hoppe","Aristomenis Exadaktylos","Thomas Geiser","Manuela Funke-Chambour","Andreas Christe","Lukas Ebner","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01740v2.pdf","comment":"Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV) Workshops 2023"},{"id":"http://arxiv.org/abs/2309.02855v1","updated":"2023-09-06T09:31:37Z","published":"2023-09-06T09:31:37Z","title":"Bandwidth-efficient Inference for Neural Image Compression","summary":" With neural networks growing deeper and feature maps growing larger, limited\ncommunication bandwidth with external memory (or DRAM) and power constraints\nbecome a bottleneck in implementing network inference on mobile and edge\ndevices. In this paper, we propose an end-to-end differentiable bandwidth\nefficient neural inference method with the activation compressed by neural data\ncompression method. 
Specifically, we propose a transform-quantization-entropy\ncoding pipeline for activation compression with symmetric exponential Golomb\ncoding and a data-dependent Gaussian entropy model for arithmetic coding.\nOptimized with existing model quantization methods, low-level task of image\ncompression can achieve up to 19x bandwidth reduction with 6.21x energy saving.\n","authors":["Shanzhi Yin","Tongda Xu","Yongsheng Liang","Yuanyuan Wang","Yanghao Li","Yan Wang","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02855v1.pdf","comment":"9 pages, 6 figures, submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2306.11048v2","updated":"2023-09-06T09:17:48Z","published":"2023-06-19T16:26:25Z","title":"UncLe-SLAM: Uncertainty Learning for Dense Neural SLAM","summary":" We present an uncertainty learning framework for dense neural simultaneous\nlocalization and mapping (SLAM). Estimating pixel-wise uncertainties for the\ndepth input of dense SLAM methods allows re-weighing the tracking and mapping\nlosses towards image regions that contain more suitable information that is\nmore reliable for SLAM. To this end, we propose an online framework for sensor\nuncertainty estimation that can be trained in a self-supervised manner from\nonly 2D input data. We further discuss the advantages of the uncertainty\nlearning for the case of multi-sensor input. Extensive analysis,\nexperimentation, and ablations show that our proposed modeling paradigm\nimproves both mapping and tracking accuracy and often performs better than\nalternatives that require ground truth depth or 3D. Our experiments show that\nwe achieve a 38\\% and 27\\% lower absolute trajectory tracking error (ATE) on\nthe 7-Scenes and TUM-RGBD datasets respectively. On the popular Replica dataset\nusing two types of depth sensors, we report an 11\\% F1-score improvement on\nRGBD SLAM compared to the recent state-of-the-art neural implicit approaches.\nSource code: https://github.com/kev-in-ta/UncLe-SLAM.\n","authors":["Erik Sandström","Kevin Ta","Luc Van Gool","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2306.11048v2.pdf","comment":"ICCV 2023 Workshop. 20 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02843v1","updated":"2023-09-06T09:05:03Z","published":"2023-09-06T09:05:03Z","title":"Knowledge Distillation Layer that Lets the Student Decide","summary":" Typical technique in knowledge distillation (KD) is regularizing the learning\nof a limited capacity model (student) by pushing its responses to match a\npowerful model's (teacher). Albeit useful especially in the penultimate layer\nand beyond, its action on student's feature transform is rather implicit,\nlimiting its practice in the intermediate layers. To explicitly embed the\nteacher's knowledge in feature transform, we propose a learnable KD layer for\nthe student which improves KD with two distinct abilities: i) learning how to\nleverage the teacher's knowledge, enabling to discard nuisance information, and\nii) feeding forward the transferred knowledge deeper. Thus, the student enjoys\nthe teacher's knowledge during the inference besides training. Formally, we\nrepurpose 1x1-BN-ReLU-1x1 convolution block to assign a semantic vector to each\nlocal region according to the template (supervised by the teacher) that the\ncorresponding region of the student matches. To facilitate template learning in\nthe intermediate layers, we propose a novel form of supervision based on the\nteacher's decisions. 
Through rigorous experimentation, we demonstrate the\neffectiveness of our approach on 3 popular classification benchmarks. Code is\navailable at: https://github.com/adagorgun/letKD-framework\n","authors":["Ada Gorgun","Yeti Z. Gurbuz","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2309.02843v1.pdf","comment":"Accepted at the British Machine Vision Conference 2023 (BMVC 2023)"},{"id":"http://arxiv.org/abs/2309.02841v1","updated":"2023-09-06T08:59:15Z","published":"2023-09-06T08:59:15Z","title":"Adjacency-hopping de Bruijn Sequences for Non-repetitive Coding","summary":" A special type of cyclic sequences named adjacency-hopping de Bruijn\nsequences is introduced in this paper. It is theoretically proved the existence\nof such sequences, and the number of such sequences is derived. These sequences\nguarantee that all neighboring codes are different while retaining the\nuniqueness of subsequences, which is a significant characteristic of original\nde Bruijn sequences in coding and matching. At last, the adjacency-hopping de\nBruijn sequences are applied to structured light coding, and a color fringe\npattern coded by such a sequence is presented. In summary, the proposed\nsequences demonstrate significant advantages in structured light coding by\nvirtue of the uniqueness of subsequences and the adjacency-hopping\ncharacteristic, and show potential for extension to other fields with similar\nrequirements of non-repetitive coding and efficient matching.\n","authors":["Bin Chen","Zhenglin Liang","Shiqian Wu"],"pdf_url":"https://arxiv.org/pdf/2309.02841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02833v1","updated":"2023-09-06T08:40:01Z","published":"2023-09-06T08:40:01Z","title":"Image-Object-Specific Prompt Learning for Few-Shot Class-Incremental\n Learning","summary":" While many FSCIL studies have been undertaken, achieving satisfactory\nperformance, especially during incremental sessions, has remained challenging.\nOne prominent challenge is that the encoder, trained with an ample base session\ntraining set, often underperforms in incremental sessions. In this study, we\nintroduce a novel training framework for FSCIL, capitalizing on the\ngeneralizability of the Contrastive Language-Image Pre-training (CLIP) model to\nunseen classes. We achieve this by formulating image-object-specific (IOS)\nclassifiers for the input images. Here, an IOS classifier refers to one that\ntargets specific attributes (like wings or wheels) of class objects rather than\nthe image's background. To create these IOS classifiers, we encode a bias\nprompt into the classifiers using our specially designed module, which\nharnesses key-prompt pairs to pinpoint the IOS features of classes in each\nsession. From an FSCIL standpoint, our framework is structured to retain\nprevious knowledge and swiftly adapt to new sessions without forgetting or\noverfitting. This considers the updatability of modules in each session and\nsome tricks empirically found for fast convergence. Our approach consistently\ndemonstrates superior performance compared to state-of-the-art methods across\nthe miniImageNet, CIFAR100, and CUB200 datasets. 
Further, we provide additional\nexperiments to validate our learned model's ability to achieve IOS classifiers.\nWe also conduct ablation studies to analyze the impact of each module within\nthe architecture.\n","authors":["In-Ug Yoon","Tae-Min Choi","Sun-Kyung Lee","Young-Min Kim","Jong-Hwan Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02833v1.pdf","comment":"8 pages, 4 figures, 4 tables"},{"id":"http://arxiv.org/abs/2303.08356v3","updated":"2023-09-06T08:15:03Z","published":"2023-03-15T04:15:57Z","title":"Leveraging TCN and Transformer for effective visual-audio fusion in\n continuous emotion recognition","summary":" Human emotion recognition plays an important role in human-computer\ninteraction. In this paper, we present our approach to the Valence-Arousal (VA)\nEstimation Challenge, Expression (Expr) Classification Challenge, and Action\nUnit (AU) Detection Challenge of the 5th Workshop and Competition on Affective\nBehavior Analysis in-the-wild (ABAW). Specifically, we propose a novel\nmulti-modal fusion model that leverages Temporal Convolutional Networks (TCN)\nand Transformer to enhance the performance of continuous emotion recognition.\nOur model aims to effectively integrate visual and audio information for\nimproved accuracy in recognizing emotions. Our model outperforms the baseline\nand ranks 3 in the Expression Classification challenge.\n","authors":["Weiwei Zhou","Jiada Lu","Zhaolong Xiong","Weifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2303.08356v3.pdf","comment":"2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition\n Workshops (CVPRW)"},{"id":"http://arxiv.org/abs/2307.06751v2","updated":"2023-09-06T08:14:13Z","published":"2023-07-13T13:41:32Z","title":"Watch Where You Head: A View-biased Domain Gap in Gait Recognition and\n Unsupervised Adaptation","summary":" Gait Recognition is a computer vision task aiming to identify people by their\nwalking patterns. Although existing methods often show high performance on\nspecific datasets, they lack the ability to generalize to unseen scenarios.\nUnsupervised Domain Adaptation (UDA) tries to adapt a model, pre-trained in a\nsupervised manner on a source domain, to an unlabelled target domain. There are\nonly a few works on UDA for gait recognition proposing solutions to limited\nscenarios. In this paper, we reveal a fundamental phenomenon in adaptation of\ngait recognition models, caused by the bias in the target domain to viewing\nangle or walking direction. We then suggest a remedy to reduce this bias with a\nnovel triplet selection strategy combined with curriculum learning. To this\nend, we present Gait Orientation-based method for Unsupervised Domain\nAdaptation (GOUDA). We provide extensive experiments on four widely-used gait\ndatasets, CASIA-B, OU-MVLP, GREW, and Gait3D, and on three backbones, GaitSet,\nGaitPart, and GaitGL, justifying the view bias and showing the superiority of\nour proposed method over prior UDA works.\n","authors":["Gavriel Habib","Noa Barzilay","Or Shimshi","Rami Ben-Ari","Nir Darshan"],"pdf_url":"https://arxiv.org/pdf/2307.06751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01439v2","updated":"2023-09-06T07:55:39Z","published":"2023-09-04T08:38:11Z","title":"Large Separable Kernel Attention: Rethinking the Large Kernel Attention\n Design in CNN","summary":" Visual Attention Networks (VAN) with Large Kernel Attention (LKA) modules\nhave been shown to provide remarkable performance, that surpasses Vision\nTransformers (ViTs), on a range of vision-based tasks. 
However, the depth-wise\nconvolutional layer in these LKA modules incurs a quadratic increase in the\ncomputational and memory footprints with increasing convolutional kernel size.\nTo mitigate these problems and to enable the use of extremely large\nconvolutional kernels in the attention modules of VAN, we propose a family of\nLarge Separable Kernel Attention modules, termed LSKA. LSKA decomposes the 2D\nconvolutional kernel of the depth-wise convolutional layer into cascaded\nhorizontal and vertical 1-D kernels. In contrast to the standard LKA design,\nthe proposed decomposition enables the direct use of the depth-wise\nconvolutional layer with large kernels in the attention module, without\nrequiring any extra blocks. We demonstrate that the proposed LSKA module in VAN\ncan achieve comparable performance with the standard LKA module and incur lower\ncomputational complexity and memory footprints. We also find that the proposed\nLSKA design biases the VAN more toward the shape of the object than the texture\nwith increasing kernel size. Additionally, we benchmark the robustness of the\nLKA and LSKA in VAN, ViTs, and the recent ConvNeXt on the five corrupted\nversions of the ImageNet dataset that are largely unexplored in the previous\nworks. Our extensive experimental results show that the proposed LSKA module in\nVAN provides a significant reduction in computational complexity and memory\nfootprints with increasing kernel size while outperforming ViTs, ConvNeXt, and\nproviding similar performance compared to the LKA module in VAN on object\nrecognition, object detection, semantic segmentation, and robustness tests.\n","authors":["Kin Wai Lau","Lai-Man Po","Yasar Abbas Ur Rehman"],"pdf_url":"https://arxiv.org/pdf/2309.01439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02801v1","updated":"2023-09-06T07:39:51Z","published":"2023-09-06T07:39:51Z","title":"3D Trajectory Reconstruction of Drones using a Single Camera","summary":" Drones have been widely utilized in various fields, but the number of drones\nbeing used illegally and for hazardous purposes has increased recently. To\nprevent those illegal drones, in this work, we propose a novel framework for\nreconstructing 3D trajectories of drones using a single camera. By leveraging\ncalibrated cameras, we exploit the relationship between 2D and 3D spaces. We\nautomatically track the drones in 2D images using the drone tracker and\nestimate their 2D rotations. By combining the estimated 2D drone positions with\ntheir actual length information and camera parameters, we geometrically infer\nthe 3D trajectories of the drones. To address the lack of public drone\ndatasets, we also create synthetic 2D and 3D drone datasets. The experimental\nresults show that the proposed methods accurately reconstruct drone\ntrajectories in 3D space, and demonstrate the potential of our framework for\nsingle camera-based surveillance systems.\n","authors":["Seobin Hwang","Hanyoung Kim","Chaeyeon Heo","Youkyoung Na","Cheongeun Lee","Yeongjun Cho"],"pdf_url":"https://arxiv.org/pdf/2309.02801v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2211.10437v3","updated":"2023-09-06T07:28:33Z","published":"2022-11-18T18:59:01Z","title":"A Structure-Guided Diffusion Model for Large-Hole Image Completion","summary":" Image completion techniques have made significant progress in filling missing\nregions (i.e., holes) in images. However, large-hole completion remains\nchallenging due to limited structural information. 
In this paper, we address\nthis problem by integrating explicit structural guidance into diffusion-based\nimage completion, forming our structure-guided diffusion model (SGDM). It\nconsists of two cascaded diffusion probabilistic models: structure and texture\ngenerators. The structure generator generates an edge image representing\nplausible structures within the holes, which is then used for guiding the\ntexture generation process. To train both generators jointly, we devise a novel\nstrategy that leverages optimal Bayesian denoising, which denoises the output\nof the structure generator in a single step and thus allows backpropagation.\nOur diffusion-based approach enables a diversity of plausible completions,\nwhile the editable edges allow for editing parts of an image. Our experiments\non natural scene (Places) and face (CelebA-HQ) datasets demonstrate that our\nmethod achieves a superior or comparable visual quality compared to\nstate-of-the-art approaches. The code is available for research purposes at\nhttps://github.com/UdonDa/Structure_Guided_Diffusion_Model.\n","authors":["Daichi Horita","Jiaolong Yang","Dong Chen","Yuki Koyama","Kiyoharu Aizawa","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2211.10437v3.pdf","comment":"BMVC2023. Code:\n https://github.com/UdonDa/Structure_Guided_Diffusion_Model"},{"id":"http://arxiv.org/abs/2304.06322v4","updated":"2023-09-06T07:28:33Z","published":"2023-04-13T08:02:38Z","title":"Learning-based Spatial and Angular Information Separation for Light\n Field Compression","summary":" Light fields are a type of image data that capture both spatial and angular\nscene information by recording light rays emitted by a scene from different\norientations. In this context, spatial information is defined as features that\nremain static regardless of perspectives, while angular information refers to\nfeatures that vary between viewpoints. We propose a novel neural network that,\nby design, can separate angular and spatial information of a light field. The\nnetwork represents spatial information using spatial kernels shared among all\nSub-Aperture Images (SAIs), and angular information using sets of angular\nkernels for each SAI. To further improve the representation capability of the\nnetwork without increasing parameter number, we also introduce angular kernel\nallocation and kernel tensor decomposition mechanisms. Extensive experiments\ndemonstrate the benefits of information separation: when applied to the\ncompression task, our network outperforms other state-of-the-art methods by a\nlarge margin. And angular information can be easily transferred to other scenes\nfor rendering dense views, showing the successful separation and the potential\nuse case for the view synthesis task. We plan to release the code upon\nacceptance of the paper to encourage further research on this topic.\n","authors":["Jinglei Shi","Yihong Xu","Christine Guillemot"],"pdf_url":"https://arxiv.org/pdf/2304.06322v4.pdf","comment":"The authors would like to withdraw this paper, as it has been\n superseded by arXiv:2307.06143"},{"id":"http://arxiv.org/abs/2309.02197v2","updated":"2023-09-06T07:24:10Z","published":"2023-09-05T12:57:32Z","title":"Delving into Ipsilateral Mammogram Assessment under Multi-View Network","summary":" In many recent years, multi-view mammogram analysis has been focused widely\non AI-based cancer assessment. 
In this work, we aim to explore diverse fusion\nstrategies (average and concatenate) and examine the model's learning behavior\nwith varying individuals and fusion pathways, involving Coarse Layer and Fine\nLayer. The Ipsilateral Multi-View Network, comprising five fusion types (Pre,\nEarly, Middle, Last, and Post Fusion) in ResNet-18, is employed. Notably, the\nMiddle Fusion emerges as the most balanced and effective approach, enhancing\ndeep-learning models' generalization performance by +2.06% (concatenate) and\n+5.29% (average) in VinDr-Mammo dataset and +2.03% (concatenate) and +3%\n(average) in CMMD dataset on macro F1-Score. The paper emphasizes the crucial\nrole of layer assignment in multi-view network extraction with various\nstrategies.\n","authors":["Thai Ngoc Toan Truong","Thanh-Huy Nguyen","Ba Thinh Lam","Vu Minh Duy Nguyen","Hong Phuc Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.02197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.12135v4","updated":"2023-09-06T07:17:01Z","published":"2021-09-24T18:02:49Z","title":"Attentive Contractive Flow with Lipschitz-constrained Self-Attention","summary":" Normalizing flows provide an elegant method for obtaining tractable density\nestimates from distributions by using invertible transformations. The main\nchallenge is to improve the expressivity of the models while keeping the\ninvertibility constraints intact. We propose to do so via the incorporation of\nlocalized self-attention. However, conventional self-attention mechanisms don't\nsatisfy the requirements to obtain invertible flows and can't be naively\nincorporated into normalizing flows. To address this, we introduce a novel\napproach called Attentive Contractive Flow (ACF) which utilizes a special\ncategory of flow-based generative models - contractive flows. We demonstrate\nthat ACF can be introduced into a variety of state of the art flow models in a\nplug-and-play manner. This is demonstrated to not only improve the\nrepresentation power of these models (improving on the bits per dim metric),\nbut also to results in significantly faster convergence in training them.\nQualitative results, including interpolations between test images, demonstrate\nthat samples are more realistic and capture local correlations in the data\nwell. We evaluate the results further by performing perturbation analysis using\nAWGN demonstrating that ACF models (especially the dot-product variant) show\nbetter and more consistent resilience to additive noise.\n","authors":["Avideep Mukherjee","Badri Narayan Patro","Vinay P. Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2109.12135v4.pdf","comment":"10 pages, to be published at BMVC 2023"},{"id":"http://arxiv.org/abs/2212.05315v2","updated":"2023-09-06T06:58:29Z","published":"2022-12-10T14:49:24Z","title":"Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular\n Depth Estimation","summary":" Monocular Depth Estimation (MDE) is a fundamental problem in computer vision\nwith numerous applications. Recently, LIDAR-supervised methods have achieved\nremarkable per-pixel depth accuracy in outdoor scenes. However, significant\nerrors are typically found in the proximity of depth discontinuities, i.e.,\ndepth edges, which often hinder the performance of depth-dependent applications\nthat are sensitive to such inaccuracies, e.g., novel view synthesis and\naugmented reality. 
Since direct supervision for the location of depth edges is\ntypically unavailable in sparse LIDAR-based scenes, encouraging the MDE model\nto produce correct depth edges is not straightforward. To the best of our\nknowledge this paper is the first attempt to address the depth edges issue for\nLIDAR-supervised scenes. In this work we propose to learn to detect the\nlocation of depth edges from densely-supervised synthetic data, and use it to\ngenerate supervision for the depth edges in the MDE training. Despite the\n'domain gap' between synthetic and real data, we show that depth edges that are\nestimated directly are significantly more accurate than the ones that emerge\nindirectly from the MDE training. To quantitatively evaluate our approach, and\ndue to the lack of depth edges ground truth in LIDAR-based scenes, we manually\nannotated subsets of the KITTI and the DDAD datasets with depth edges ground\ntruth. We demonstrate significant gains in the accuracy of the depth edges with\ncomparable per-pixel depth accuracy on several challenging datasets.\n","authors":["Lior Talker","Aviad Cohen","Erez Yosef","Alexandra Dana","Michael Dinerstein"],"pdf_url":"https://arxiv.org/pdf/2212.05315v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09682v3","updated":"2023-09-06T06:53:43Z","published":"2023-06-16T08:26:57Z","title":"OCTScenes: A Versatile Real-World Dataset of Tabletop Scenes for\n Object-Centric Learning","summary":" Humans possess the cognitive ability to comprehend scenes in a compositional\nmanner. To empower AI systems with similar capabilities, object-centric\nlearning aims to acquire representations of individual objects from visual\nscenes without any supervision. Although recent advances in object-centric\nlearning have made remarkable progress on complex synthesis datasets, there is\na huge challenge for application to complex real-world scenes. One of the\nessential reasons is the scarcity of real-world datasets specifically tailored\nto object-centric learning. To address this problem, we propose a versatile\nreal-world dataset of tabletop scenes for object-centric learning called\nOCTScenes, which is meticulously designed to serve as a benchmark for\ncomparing, evaluating, and analyzing object-centric learning methods. OCTScenes\ncontains 5000 tabletop scenes with a total of 15 objects. Each scene is\ncaptured in 60 frames covering a 360-degree perspective. Consequently,\nOCTScenes is a versatile benchmark dataset that can simultaneously satisfy the\nevaluation of object-centric learning methods based on single-image, video, and\nmulti-view. Extensive experiments of representative object-centric learning\nmethods are conducted on OCTScenes. The results demonstrate the shortcomings of\nstate-of-the-art methods for learning meaningful representations from\nreal-world data, despite their impressive performance on complex synthesis\ndatasets. Furthermore, OCTScenes can serve as a catalyst for the advancement of\nexisting methods, inspiring them to adapt to real-world scenes. 
Dataset and\ncode are available at https://huggingface.co/datasets/Yinxuan/OCTScenes.\n","authors":["Yinxuan Huang","Tonglin Chen","Zhimeng Shen","Jinghao Huang","Bin Li","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2306.09682v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02783v1","updated":"2023-09-06T06:49:31Z","published":"2023-09-06T06:49:31Z","title":"Improving diagnosis and prognosis of lung cancer using vision\n transformers: A scoping review","summary":" Vision transformer-based methods are advancing the field of medical\nartificial intelligence and cancer imaging, including lung cancer applications.\nRecently, many researchers have developed vision transformer-based AI methods\nfor lung cancer diagnosis and prognosis. This scoping review aims to identify\nthe recent developments on vision transformer-based AI methods for lung cancer\nimaging applications. It provides key insights into how vision transformers\ncomplemented the performance of AI and deep learning methods for lung cancer.\nFurthermore, the review also identifies the datasets that contributed to\nadvancing the field. Of the 314 retrieved studies, this review included 34\nstudies published from 2020 to 2022. The most commonly addressed task in these\nstudies was the classification of lung cancer types, such as lung squamous cell\ncarcinoma versus lung adenocarcinoma, and identifying benign versus malignant\npulmonary nodules. Other applications included survival prediction of lung\ncancer patients and segmentation of lungs. The studies lacked clear strategies\nfor clinical transformation. SWIN transformer was a popular choice of the\nresearchers; however, many other architectures were also reported where vision\ntransformer was combined with convolutional neural networks or UNet model. It\ncan be concluded that vision transformer-based models are increasingly in\npopularity for developing AI methods for lung cancer applications. However,\ntheir computational complexity and clinical relevance are important factors to\nbe considered for future research work. This review provides valuable insights\nfor researchers in the field of AI and healthcare to advance the\nstate-of-the-art in lung cancer diagnosis and prognosis. We provide an\ninteractive dashboard on lung-cancer.onrender.com/.\n","authors":["Hazrat Ali","Farida Mohsen","Zubair Shah"],"pdf_url":"https://arxiv.org/pdf/2309.02783v1.pdf","comment":"submitted to BMC Medical Imaging journal"},{"id":"http://arxiv.org/abs/2109.04398v4","updated":"2023-09-06T06:47:49Z","published":"2021-09-09T16:37:01Z","title":"Neural-IMLS: Self-supervised Implicit Moving Least-Squares Network for\n Surface Reconstruction","summary":" Surface reconstruction is very challenging when the input point clouds,\nparticularly real scans, are noisy and lack normals. Observing that the\nMultilayer Perceptron (MLP) and the implicit moving least-square function\n(IMLS) provide a dual representation of the underlying surface, we introduce\nNeural-IMLS, a novel approach that directly learns the noise-resistant signed\ndistance function (SDF) from unoriented raw point clouds in a self-supervised\nfashion. We use the IMLS to regularize the distance values reported by the MLP\nwhile using the MLP to regularize the normals of the data points for running\nthe IMLS. We also prove that at the convergence, our neural network, benefiting\nfrom the mutual learning mechanism between the MLP and the IMLS, produces a\nfaithful SDF whose zero-level set approximates the underlying surface. 
We\nconducted extensive experiments on various benchmarks, including synthetic\nscans and real scans. The experimental results show that {\\em Neural-IMLS} can\nreconstruct faithful shapes on various benchmarks with noise and missing parts.\nThe source code can be found at~\\url{https://github.com/bearprin/Neural-IMLS}.\n","authors":["Zixiong Wang","Pengfei Wang","Pengshuai Wang","Qiujie Dong","Junjie Gao","Shuangmin Chen","Shiqing Xin","Changhe Tu","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2109.04398v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02777v1","updated":"2023-09-06T06:41:40Z","published":"2023-09-06T06:41:40Z","title":"LightNeuS: Neural Surface Reconstruction in Endoscopy using Illumination\n Decline","summary":" We propose a new approach to 3D reconstruction from sequences of images\nacquired by monocular endoscopes. It is based on two key insights. First,\nendoluminal cavities are watertight, a property naturally enforced by modeling\nthem in terms of a signed distance function. Second, the scene illumination is\nvariable. It comes from the endoscope's light sources and decays with the\ninverse of the squared distance to the surface. To exploit these insights, we\nbuild on NeuS, a neural implicit surface reconstruction technique with an\noutstanding capability to learn appearance and a SDF surface model from\nmultiple views, but currently limited to scenes with static illumination. To\nremove this limitation and exploit the relation between pixel brightness and\ndepth, we modify the NeuS architecture to explicitly account for it and\nintroduce a calibrated photometric model of the endoscope's camera and light\nsource. Our method is the first one to produce watertight reconstructions of\nwhole colon sections. We demonstrate excellent accuracy on phantom imagery.\nRemarkably, the watertight prior combined with illumination decline, allows to\ncomplete the reconstruction of unseen portions of the surface with acceptable\naccuracy, paving the way to automatic quality assessment of cancer screening\nexplorations, measuring the global percentage of observed mucosa.\n","authors":["Víctor M. Batlle","José M. M. Montiel","Pascal Fua","Juan D. Tardós"],"pdf_url":"https://arxiv.org/pdf/2309.02777v1.pdf","comment":"12 pages, 7 figures, 1 table, submitted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.01793v2","updated":"2023-09-06T06:39:06Z","published":"2023-09-04T20:10:38Z","title":"Neural-Singular-Hessian: Implicit Neural Representation of Unoriented\n Point Clouds by Enforcing Singular Hessian","summary":" Neural implicit representation is a promising approach for reconstructing\nsurfaces from point clouds. Existing methods combine various regularization\nterms, such as the Eikonal and Laplacian energy terms, to enforce the learned\nneural function to possess the properties of a Signed Distance Function (SDF).\nHowever, inferring the actual topology and geometry of the underlying surface\nfrom poor-quality unoriented point clouds remains challenging. In accordance\nwith Differential Geometry, the Hessian of the SDF is singular for points\nwithin the differential thin-shell space surrounding the surface. Our approach\nenforces the Hessian of the neural implicit function to have a zero determinant\nfor points near the surface. This technique aligns the gradients for a\nnear-surface point and its on-surface projection point, producing a rough but\nfaithful shape within just a few iterations. 
By annealing the weight of the\nsingular-Hessian term, our approach ultimately produces a high-fidelity\nreconstruction result. Extensive experimental results demonstrate that our\napproach effectively suppresses ghost geometry and recovers details from\nunoriented point clouds with better expressiveness than existing fitting-based\nmethods.\n","authors":["Zixiong Wang","Yunxiao Zhang","Rui Xu","Fan Zhang","Pengshuai Wang","Shuangmin Chen","Shiqing Xin","Wenping Wang","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2309.01793v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02773v1","updated":"2023-09-06T06:31:08Z","published":"2023-09-06T06:31:08Z","title":"Diffusion Model is Secretly a Training-free Open Vocabulary Semantic\n Segmenter","summary":" Recent research has explored the utilization of pre-trained text-image\ndiscriminative models, such as CLIP, to tackle the challenges associated with\nopen-vocabulary semantic segmentation. However, it is worth noting that the\nalignment process based on contrastive learning employed by these models may\nunintentionally result in the loss of crucial localization information and\nobject completeness, which are essential for achieving accurate semantic\nsegmentation. More recently, there has been an emerging interest in extending\nthe application of diffusion models beyond text-to-image generation tasks,\nparticularly in the domain of semantic segmentation. These approaches utilize\ndiffusion models either for generating annotated data or for extracting\nfeatures to facilitate semantic segmentation. This typically involves training\nsegmentation models by generating a considerable amount of synthetic data or\nincorporating additional mask annotations. To this end, we uncover the\npotential of generative text-to-image conditional diffusion models as highly\nefficient open-vocabulary semantic segmenters, and introduce a novel\ntraining-free approach named DiffSegmenter. Specifically, by feeding an input\nimage and candidate classes into an off-the-shelf pre-trained conditional\nlatent diffusion model, the cross-attention maps produced by the denoising\nU-Net are directly used as segmentation scores, which are further refined and\ncompleted by the followed self-attention maps. Additionally, we carefully\ndesign effective textual prompts and a category filtering mechanism to further\nenhance the segmentation results. Extensive experiments on three benchmark\ndatasets show that the proposed DiffSegmenter achieves impressive results for\nopen-vocabulary semantic segmentation.\n","authors":["Jinglong Wang","Xiawei Li","Jing Zhang","Qingyuan Xu","Qin Zhou","Qian Yu","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2309.02773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01961v2","updated":"2023-09-06T06:13:34Z","published":"2023-09-05T05:32:19Z","title":"NICE 2023 Zero-shot Image Captioning Challenge","summary":" In this report, we introduce NICE\nproject\\footnote{\\url{https://nice.lgresearch.ai/}} and share the results and\noutcomes of NICE challenge 2023. This project is designed to challenge the\ncomputer vision community to develop robust image captioning models that\nadvance the state-of-the-art both in terms of accuracy and fairness. 
Through\nthe challenge, the image captioning models were tested using a new evaluation\ndataset that includes a large variety of visual concepts from many domains.\nThere was no specific training data provided for the challenge, and therefore\nthe challenge entries were required to adapt to new types of image descriptions\nthat had not been seen during training. This report includes information on the\nnewly proposed NICE dataset, evaluation methods, challenge results, and\ntechnical details of top-ranking entries. We expect that the outcomes of the\nchallenge will contribute to the improvement of AI models on various\nvision-language tasks.\n","authors":["Taehoon Kim","Pyunghwan Ahn","Sangyun Kim","Sihaeng Lee","Mark Marsden","Alessandra Sala","Seung Hwan Kim","Bohyung Han","Kyoung Mu Lee","Honglak Lee","Kyounghoon Bae","Xiangyu Wu","Yi Gao","Hailiang Zhang","Yang Yang","Weili Guo","Jianfeng Lu","Youngtaek Oh","Jae Won Cho","Dong-jin Kim","In So Kweon","Junmo Kim","Wooyoung Kang","Won Young Jhoo","Byungseok Roh","Jonghwan Mun","Solgil Oh","Kenan Emir Ak","Gwang-Gook Lee","Yan Xu","Mingwei Shen","Kyomin Hwang","Wonsik Shin","Kamin Lee","Wonhark Park","Dongkwan Lee","Nojun Kwak","Yujin Wang","Yimu Wang","Tiancheng Gu","Xingchang Lv","Mingmao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01961v2.pdf","comment":"Tech report, project page https://nice.lgresearch.ai/"},{"id":"http://arxiv.org/abs/2309.02742v1","updated":"2023-09-06T05:56:30Z","published":"2023-09-06T05:56:30Z","title":"MLN-net: A multi-source medical image segmentation method for clustered\n microcalcifications using multiple layer normalization","summary":" Accurate segmentation of clustered microcalcifications in mammography is\ncrucial for the diagnosis and treatment of breast cancer. Despite exhibiting\nexpert-level accuracy, recent deep learning advancements in medical image\nsegmentation provide insufficient contribution to practical applications, due\nto the domain shift resulting from differences in patient postures, individual\ngland density, and imaging modalities of mammography etc. In this paper, a\nnovel framework named MLN-net, which can accurately segment multi-source images\nusing only single source images, is proposed for clustered microcalcification\nsegmentation. We first propose a source domain image augmentation method to\ngenerate multi-source images, leading to improved generalization. And a\nstructure of multiple layer normalization (LN) layers is used to construct the\nsegmentation network, which can be found efficient for clustered\nmicrocalcification segmentation in different domains. Additionally, a branch\nselection strategy is designed for measuring the similarity of the source\ndomain data and the target domain data. To validate the proposed MLN-net,\nextensive analyses including ablation experiments are performed, comparison of\n12 baseline methods. Extensive experiments validate the effectiveness of\nMLN-net in segmenting clustered microcalcifications from different domains and\nthe its segmentation accuracy surpasses state-of-the-art methods. 
Code will be\navailable at https://github.com/yezanting/MLN-NET-VERSON1.\n","authors":["Ke Wang","Zanting Ye","Xiang Xie","Haidong Cui","Tao Chen","Banteng Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02742v1.pdf","comment":"17 pages, 9 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.02719v1","updated":"2023-09-06T05:08:51Z","published":"2023-09-06T05:08:51Z","title":"DMKD: Improving Feature-based Knowledge Distillation for Object\n Detection Via Dual Masking Augmentation","summary":" Recent mainstream masked distillation methods function by reconstructing\nselectively masked areas of a student network from the feature map of its\nteacher counterpart. In these methods, the masked regions need to be properly\nselected, such that reconstructed features encode sufficient discrimination and\nrepresentation capability like the teacher feature. However, previous masked\ndistillation methods only focus on spatial masking, making the resulting masked\nareas biased towards spatial importance without encoding informative channel\nclues. In this study, we devise a Dual Masked Knowledge Distillation (DMKD)\nframework which can capture both spatially important and channel-wise\ninformative clues for comprehensive masked feature reconstruction. More\nspecifically, we employ dual attention mechanism for guiding the respective\nmasking branches, leading to reconstructed feature encoding dual significance.\nFurthermore, fusing the reconstructed features is achieved by self-adjustable\nweighting strategy for effective feature distillation. Our experiments on\nobject detection task demonstrate that the student networks achieve performance\ngains of 4.1% and 4.3% with the help of our method when RetinaNet and Cascade\nMask R-CNN are respectively used as the teacher networks, while outperforming\nthe other state-of-the-art distillation methods.\n","authors":["Guang Yang1","Yin Tang2","Zhijian Wu","Jun Li1","Jianhua Xu","Xili Wan"],"pdf_url":"https://arxiv.org/pdf/2309.02719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02713v1","updated":"2023-09-06T04:52:02Z","published":"2023-09-06T04:52:02Z","title":"SlAction: Non-intrusive, Lightweight Obstructive Sleep Apnea Detection\n using Infrared Video","summary":" Obstructive sleep apnea (OSA) is a prevalent sleep disorder affecting\napproximately one billion people world-wide. The current gold standard for\ndiagnosing OSA, Polysomnography (PSG), involves an overnight hospital stay with\nmultiple attached sensors, leading to potential inaccuracies due to the\nfirst-night effect. To address this, we present SlAction, a non-intrusive OSA\ndetection system for daily sleep environments using infrared videos.\nRecognizing that sleep videos exhibit minimal motion, this work investigates\nthe fundamental question: \"Are respiratory events adequately reflected in human\nmotions during sleep?\" Analyzing the largest sleep video dataset of 5,098\nhours, we establish correlations between OSA events and human motions during\nsleep. Our approach uses a low frame rate (2.5 FPS), a large size (60 seconds)\nand step (30 seconds) for sliding window analysis to capture slow and long-term\nmotions related to OSA. Furthermore, we utilize a lightweight deep neural\nnetwork for resource-constrained devices, ensuring all video streams are\nprocessed locally without compromising privacy. Evaluations show that SlAction\nachieves an average F1 score of 87.6% in detecting OSA across various\nenvironments. 
Implementing SlAction on NVIDIA Jetson Nano enables real-time\ninference (~3 seconds for a 60-second video clip), highlighting its potential\nfor early detection and personalized treatment of OSA.\n","authors":["You Rim Choi","Gyeongseon Eo","Wonhyuck Youn","Hyojin Lee","Haemin Jang","Dongyoon Kim","Hyunwoo Shin","Hyung-Sin Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02713v1.pdf","comment":"Accepted to ICCV CVAMD 2023, poster"},{"id":"http://arxiv.org/abs/2306.03424v3","updated":"2023-09-06T04:36:33Z","published":"2023-06-06T05:51:50Z","title":"GCD-DDPM: A Generative Change Detection Model Based on\n Difference-Feature Guided DDPM","summary":" Deep learning (DL)-based methods have recently shown great promise in\nbitemporal change detection (CD). However, most existing methods are\nineffective in simultaneously capturing long-range dependencies and exploiting\nlocal spatial information, resulting in inaccurate CD maps with discerning\nedges. To overcome these obstacles, a novel Denoising Diffusion Probabilistic\nModel (DDPM)-based generative CD approach called GCD-DDPM is proposed for\nremote sensing data. More specifically, GCD-DDPM is designed to directly\ngenerate CD maps by leveraging variational inference, which enables GCD-DDPM to\naccurately distinguish subtle and irregular buildings or natural scenes from\nthe background. Furthermore, an adaptive calibration conditional difference\nencoding technique is proposed for GCD-DDPM to enhance the CD map through\nguided sampling of the differences among multi-level features. Finally, a noise\nsuppression-based semantic enhancer (NSSE) is devised to cope with the\nhigh-frequency noise incurred in the CD map by capitalizing on the prior\nknowledge derived from the current step. Extensive experiments on four CD\ndatasets, namely CDD, WHU, Levier and GVLM, confirm the good performance of the\nproposed GCD-DDPM.\n","authors":["Yihan Wen","Xianping Ma","Xiaokang Zhang","Man-On Pun"],"pdf_url":"https://arxiv.org/pdf/2306.03424v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04205v3","updated":"2023-09-06T04:33:41Z","published":"2023-05-07T07:03:40Z","title":"Bi-Mapper: Holistic BEV Semantic Mapping for Autonomous Driving","summary":" A semantic map of the road scene, covering fundamental road elements, is an\nessential ingredient in autonomous driving systems. It provides important\nperception foundations for positioning and planning when rendered in the\nBird's-Eye-View (BEV). Currently, the prior knowledge of hypothetical depth can\nguide the learning of translating front perspective views into BEV directly\nwith the help of calibration parameters. However, it suffers from geometric\ndistortions in the representation of distant objects. In addition, another\nstream of methods without prior knowledge can learn the transformation between\nfront perspective views and BEV implicitly with a global view. Considering that\nthe fusion of different learning methods may bring surprising beneficial\neffects, we propose a Bi-Mapper framework for top-down road-scene semantic\nunderstanding, which incorporates a global view and local prior knowledge. To\nenhance reliable interaction between them, an asynchronous mutual learning\nstrategy is proposed. At the same time, an Across-Space Loss (ASL) is designed\nto mitigate the negative impact of geometric distortions. Extensive results on\nnuScenes and Cam2BEV datasets verify the consistent effectiveness of each\nmodule in the proposed Bi-Mapper framework. 
Compared with exiting road mapping\nnetworks, the proposed Bi-Mapper achieves 2.1% higher IoU on the nuScenes\ndataset. Moreover, we verify the generalization performance of Bi-Mapper in a\nreal-world driving scenario. The source code is publicly available at\nhttps://github.com/lynn-yu/Bi-Mapper.\n","authors":["Siyu Li","Kailun Yang","Hao Shi","Jiaming Zhang","Jiacheng Lin","Zhifeng Teng","Zhiyong Li"],"pdf_url":"https://arxiv.org/pdf/2305.04205v3.pdf","comment":"Accepted to IEEE Robotics and Automation Letters (RA-L). The source\n code is publicly available at https://github.com/lynn-yu/Bi-Mapper"},{"id":"http://arxiv.org/abs/2309.02702v1","updated":"2023-09-06T04:30:15Z","published":"2023-09-06T04:30:15Z","title":"Gene-induced Multimodal Pre-training for Image-omic Classification","summary":" Histology analysis of the tumor micro-environment integrated with genomic\nassays is the gold standard for most cancers in modern medicine. This paper\nproposes a Gene-induced Multimodal Pre-training (GiMP) framework, which jointly\nincorporates genomics and Whole Slide Images (WSIs) for classification tasks.\nOur work aims at dealing with the main challenges of multi-modality image-omic\nclassification w.r.t. (1) the patient-level feature extraction difficulties\nfrom gigapixel WSIs and tens of thousands of genes, and (2) effective fusion\nconsidering high-order relevance modeling. Concretely, we first propose a group\nmulti-head self-attention gene encoder to capture global structured features in\ngene expression cohorts. We design a masked patch modeling paradigm (MPM) to\ncapture the latent pathological characteristics of different tissues. The mask\nstrategy is randomly masking a fixed-length contiguous subsequence of patch\nembeddings of a WSI. Finally, we combine the classification tokens of paired\nmodalities and propose a triplet learning module to learn high-order relevance\nand discriminative patient-level information.After pre-training, a simple\nfine-tuning can be adopted to obtain the classification results. Experimental\nresults on the TCGA dataset show the superiority of our network architectures\nand our pre-training framework, achieving 99.47% in accuracy for image-omic\nclassification. The code is publicly available at\nhttps://github.com/huangwudiduan/GIMP.\n","authors":["Ting Jin","Xingran Xie","Renjie Wan","Qingli Li","Yan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06200v2","updated":"2023-09-06T04:16:17Z","published":"2023-05-10T14:32:21Z","title":"Learning in a Single Domain for Non-Stationary Multi-Texture Synthesis","summary":" This paper aims for a new generation task: non-stationary multi-texture\nsynthesis, which unifies synthesizing multiple non-stationary textures in a\nsingle model. Most non-stationary textures have large scale variance and can\nhardly be synthesized through one model. To combat this, we propose a\nmulti-scale generator to capture structural patterns of various scales and\neffectively synthesize textures with a minor cost. However, it is still hard to\nhandle textures of different categories with different texture patterns.\nTherefore, we present a category-specific training strategy to focus on\nlearning texture pattern of a specific domain. Interestingly, once trained, our\nmodel is able to produce multi-pattern generations with dynamic variations\nwithout the need to finetune the model for different styles. 
Moreover, an\nobjective evaluation metric is designed for evaluating the quality of texture\nexpansion and global structure consistency. To our knowledge, ours is the first\nscheme for this challenging task, including model, training, and evaluation.\nExperimental results demonstrate the proposed method achieves superior\nperformance and time efficiency. The code will be available after the\npublication.\n","authors":["Xudong Xie","Zhen Zhu","Zijie Wu","Zhiliang Xu","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.06200v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04027v4","updated":"2023-09-06T04:13:42Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\nin a 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims at not only truthfully\nsimulating PX, but also facilitates the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by generation module to produce high-quality\noutputs. Experiments show that our method outperforms prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. Unlike prior\nmethods, Our method does not require any prior information such as the shape of\ndental arches, nor the matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v4.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2309.01539v2","updated":"2023-09-06T04:12:35Z","published":"2023-09-04T11:39:14Z","title":"TSTTC: A Large-Scale Dataset for Time-to-Contact Estimation in Driving\n Scenarios","summary":" Time-to-Contact (TTC) estimation is a critical task for assessing collision\nrisk and is widely used in various driver assistance and autonomous driving\nsystems. The past few decades have witnessed development of related theories\nand algorithms. The prevalent learning-based methods call for a large-scale TTC\ndataset in real-world scenarios. In this work, we present a large-scale object\noriented TTC dataset in the driving scene for promoting the TTC estimation by a\nmonocular camera. To collect valuable samples and make data with different TTC\nvalues relatively balanced, we go through thousands of hours of driving data\nand select over 200K sequences with a preset data distribution. To augment the\nquantity of small TTC cases, we also generate clips using the latest Neural\nrendering methods. 
Additionally, we provide several simple yet effective TTC\nestimation baselines and evaluate them extensively on the proposed dataset to\ndemonstrate their effectiveness. The proposed dataset is publicly available at\nhttps://open-dataset.tusen.ai/TSTTC.\n","authors":["Yuheng Shi","Zehao Huang","Yan Yan","Naiyan Wang","Xiaojie Guo"],"pdf_url":"https://arxiv.org/pdf/2309.01539v2.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02691v1","updated":"2023-09-06T03:54:57Z","published":"2023-09-06T03:54:57Z","title":"A Joint Study of Phrase Grounding and Task Performance in Vision and\n Language Models","summary":" Key to tasks that require reasoning about natural language in visual contexts\nis grounding words and phrases to image regions. However, observing this\ngrounding in contemporary models is complex, even if it is generally expected\nto take place if the task is addressed in a way that is conductive to\ngeneralization. We propose a framework to jointly study task performance and\nphrase grounding, and propose three benchmarks to study the relation between\nthe two. Our results show that contemporary models demonstrate inconsistency\nbetween their ability to ground phrases and solve tasks. We show how this can\nbe addressed through brute-force training on ground phrasing annotations, and\nanalyze the dynamics it creates. Code and at available at\nhttps://github.com/lil-lab/phrase_grounding.\n","authors":["Noriyuki Kojima","Hadar Averbuch-Elor","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2309.02691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00526v3","updated":"2023-09-06T03:30:14Z","published":"2023-06-01T10:28:12Z","title":"Layout and Task Aware Instruction Prompt for Zero-shot Document Image\n Question Answering","summary":" The pre-training-fine-tuning paradigm based on layout-aware multimodal\npre-trained models has achieved significant progress on document image question\nanswering. However, domain pre-training and task fine-tuning for additional\nvisual, layout, and task modules prevent them from directly utilizing\noff-the-shelf instruction-tuning language foundation models, which have\nrecently shown promising potential in zero-shot learning. Contrary to aligning\nlanguage models to the domain of document image question answering, we align\ndocument image question answering to off-the-shell instruction-tuning language\nfoundation models to utilize their zero-shot capability. Specifically, we\npropose layout and task aware instruction prompt called LATIN-Prompt, which\nconsists of layout-aware document content and task-aware descriptions. The\nformer recovers the layout information among text segments from OCR tools by\nappropriate spaces and line breaks. The latter ensures that the model generates\nanswers that meet the requirements, especially format requirements, through a\ndetailed description of task. Experimental results on three benchmarks show\nthat LATIN-Prompt can improve the zero-shot performance of instruction-tuning\nlanguage foundation models on document image question answering and help them\nachieve comparable levels to SOTAs based on the pre-training-fine-tuning\nparadigm. Quantitative analysis and qualitative analysis demonstrate the\neffectiveness of LATIN-Prompt. We provide the code in supplementary and will\nrelease the code to facilitate future research.\n","authors":["Wenjin Wang","Yunhao Li","Yixin Ou","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.00526v3.pdf","comment":"Add the LATIN-Tuning for Alapca. 
Code is available at\n https://github.com/WenjinW/LATIN-Prompt"},{"id":"http://arxiv.org/abs/2309.02681v1","updated":"2023-09-06T03:26:24Z","published":"2023-09-06T03:26:24Z","title":"Improving Image Classification of Knee Radiographs: An Automated Image\n Labeling Approach","summary":" Large numbers of radiographic images are available in knee radiology\npractices which could be used for training of deep learning models for\ndiagnosis of knee abnormalities. However, those images do not typically contain\nreadily available labels due to limitations of human annotations. The purpose\nof our study was to develop an automated labeling approach that improves the\nimage classification model to distinguish normal knee images from those with\nabnormalities or prior arthroplasty. The automated labeler was trained on a\nsmall set of labeled data to automatically label a much larger set of unlabeled\ndata, further improving the image classification performance for knee\nradiographic diagnosis. We developed our approach using 7,382 patients and\nvalidated it on a separate set of 637 patients. The final image classification\nmodel, trained using both manually labeled and pseudo-labeled data, had the\nhigher weighted average AUC (WAUC: 0.903) value and higher AUC-ROC values among\nall classes (normal AUC-ROC: 0.894; abnormal AUC-ROC: 0.896, arthroplasty\nAUC-ROC: 0.990) compared to the baseline model (WAUC=0.857; normal AUC-ROC:\n0.842; abnormal AUC-ROC: 0.848, arthroplasty AUC-ROC: 0.987), trained using\nonly manually labeled data. DeLong tests show that the improvement is\nsignificant on normal (p-value<0.002) and abnormal (p-value<0.001) images. Our\nfindings demonstrated that the proposed automated labeling approach\nsignificantly improves the performance of image classification for radiographic\nknee diagnosis, allowing for facilitating patient care and curation of large\nknee datasets.\n","authors":["Jikai Zhang","Carlos Santos","Christine Park","Maciej Mazurowski","Roy Colglazier"],"pdf_url":"https://arxiv.org/pdf/2309.02681v1.pdf","comment":"This is the preprint version"},{"id":"http://arxiv.org/abs/2309.02676v1","updated":"2023-09-06T03:07:43Z","published":"2023-09-06T03:07:43Z","title":"Efficient Training for Visual Tracking with Deformable Transformer","summary":" Recent Transformer-based visual tracking models have showcased superior\nperformance. Nevertheless, prior works have been resource-intensive, requiring\nprolonged GPU training hours and incurring high GFLOPs during inference due to\ninefficient training methods and convolution-based target heads. This intensive\nresource use renders them unsuitable for real-world applications. In this\npaper, we present DETRack, a streamlined end-to-end visual object tracking\nframework. Our framework utilizes an efficient encoder-decoder structure where\nthe deformable transformer decoder acting as a target head, achieves higher\nsparsity than traditional convolution heads, resulting in decreased GFLOPs. For\ntraining, we introduce a novel one-to-many label assignment and an auxiliary\ndenoising technique, significantly accelerating model's convergence.\nComprehensive experiments affirm the effectiveness and efficiency of our\nproposed method. 
For instance, DETRack achieves 72.9% AO on challenging GOT-10k\nbenchmarks using only 20% of the training epochs required by the baseline, and\nruns with lower GFLOPs than all the transformer-based trackers.\n","authors":["Qingmao Wei","Guotian Zeng","Bi Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.02676v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.16580 by other authors"},{"id":"http://arxiv.org/abs/2210.12971v2","updated":"2023-09-06T03:06:11Z","published":"2022-10-24T06:39:32Z","title":"Holistically-Attracted Wireframe Parsing: From Supervised to\n Self-Supervised Learning","summary":" This article presents Holistically-Attracted Wireframe Parsing (HAWP), a\nmethod for geometric analysis of 2D images containing wireframes formed by line\nsegments and junctions. HAWP utilizes a parsimonious Holistic Attraction (HAT)\nfield representation that encodes line segments using a closed-form 4D\ngeometric vector field. The proposed HAWP consists of three sequential\ncomponents empowered by end-to-end and HAT-driven designs: (1) generating a\ndense set of line segments from HAT fields and endpoint proposals from\nheatmaps, (2) binding the dense line segments to sparse endpoint proposals to\nproduce initial wireframes, and (3) filtering false positive proposals through\na novel endpoint-decoupled line-of-interest aligning (EPD LOIAlign) module that\ncaptures the co-occurrence between endpoint proposals and HAT fields for better\nverification. Thanks to our novel designs, HAWPv2 shows strong performance in\nfully supervised learning, while HAWPv3 excels in self-supervised learning,\nachieving superior repeatability scores and efficient training (24 GPU hours on\na single GPU). Furthermore, HAWPv3 exhibits a promising potential for wireframe\nparsing in out-of-distribution images without providing ground truth labels of\nwireframes.\n","authors":["Nan Xue","Tianfu Wu","Song Bai","Fu-Dong Wang","Gui-Song Xia","Liangpei Zhang","Philip H. S. Torr"],"pdf_url":"https://arxiv.org/pdf/2210.12971v2.pdf","comment":"Journal extension of arXiv:2003.01663; Accepted by IEEE TPAMI; Code\n is available at https://github.com/cherubicxn/hawp"},{"id":"http://arxiv.org/abs/2309.02670v1","updated":"2023-09-06T02:39:35Z","published":"2023-09-06T02:39:35Z","title":"Progressive Attention Guidance for Whole Slide Vulvovaginal Candidiasis\n Screening","summary":" Vulvovaginal candidiasis (VVC) is the most prevalent human candidal\ninfection, estimated to afflict approximately 75% of all women at least once in\ntheir lifetime. It will lead to several symptoms including pruritus, vaginal\nsoreness, and so on. Automatic whole slide image (WSI) classification is highly\ndemanded, for the huge burden of disease control and prevention. However, the\nWSI-based computer-aided VCC screening method is still vacant due to the scarce\nlabeled data and unique properties of candida. Candida in WSI is challenging to\nbe captured by conventional classification models due to its distinctive\nelongated shape, the small proportion of their spatial distribution, and the\nstyle gap from WSIs. To make the model focus on the candida easier, we propose\nan attention-guided method, which can obtain a robust diagnosis classification\nmodel. Specifically, we first use a pre-trained detection model as prior\ninstruction to initialize the classification model. Then we design a Skip\nSelf-Attention module to refine the attention onto the fined-grained features\nof candida. 
Finally, we use a contrastive learning method to alleviate the\noverfitting caused by the style gap of WSIs and suppress the attention to false\npositive regions. Our experimental results demonstrate that our framework\nachieves state-of-the-art performance. Code and example data are available at\nhttps://github.com/cjdbehumble/MICCAI2023-VVC-Screening.\n","authors":["Jiangdong Cai","Honglin Xiong","Maosong Cao","Luyan Liu","Lichi Zhang","Qian Wang"],"pdf_url":"https://arxiv.org/pdf/2309.02670v1.pdf","comment":"Accepted in the main conference MICCAI 2023"},{"id":"http://arxiv.org/abs/2208.05318v2","updated":"2023-09-06T02:29:01Z","published":"2022-08-10T12:55:56Z","title":"Generative Action Description Prompts for Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has recently received considerable\nattention. Current approaches to skeleton-based action recognition are\ntypically formulated as one-hot classification tasks and do not fully exploit\nthe semantic relations between actions. For example, \"make victory sign\" and\n\"thumb up\" are two actions of hand gestures, whose major difference lies in the\nmovement of hands. This information is agnostic from the categorical one-hot\nencoding of action classes but could be unveiled from the action description.\nTherefore, utilizing action description in training could potentially benefit\nrepresentation learning. In this work, we propose a Generative\nAction-description Prompts (GAP) approach for skeleton-based action\nrecognition. More specifically, we employ a pre-trained large-scale language\nmodel as the knowledge engine to automatically generate text descriptions for\nbody parts movements of actions, and propose a multi-modal training scheme by\nutilizing the text encoder to generate feature vectors for different body parts\nand supervise the skeleton encoder for action representation learning.\nExperiments show that our proposed GAP method achieves noticeable improvements\nover various baseline models without extra computation cost at inference. GAP\nachieves new state-of-the-arts on popular skeleton-based action recognition\nbenchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is\navailable at https://github.com/MartinXM/GAP.\n","authors":["Wangmeng Xiang","Chao Li","Yuxuan Zhou","Biao Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.05318v2.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2309.02666v1","updated":"2023-09-06T02:25:36Z","published":"2023-09-06T02:25:36Z","title":"Fast and Resource-Efficient Object Tracking on Edge Devices: A\n Measurement Study","summary":" Object tracking is an important functionality of edge video analytic systems\nand services. Multi-object tracking (MOT) detects the moving objects and tracks\ntheir locations frame by frame as real scenes are being captured into a video.\nHowever, it is well known that real time object tracking on the edge poses\ncritical technical challenges, especially with edge devices of heterogeneous\ncomputing resources. This paper examines the performance issues and\nedge-specific optimization opportunities for object tracking. We will show that\neven the well trained and optimized MOT model may still suffer from random\nframe dropping problems when edge devices have insufficient computation\nresources. 
We present several edge specific performance optimization\nstrategies, collectively coined as EMO, to speed up the real time object\ntracking, ranging from window-based optimization to similarity based\noptimization. Extensive experiments on popular MOT benchmarks demonstrate that\nour EMO approach is competitive with respect to the representative methods for\non-device object tracking techniques in terms of run-time performance and\ntracking accuracy. EMO is released on Github at\nhttps://github.com/git-disl/EMO.\n","authors":["Sanjana Vijay Ganesh","Yanzhao Wu","Gaowen Liu","Ramana Kompella","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01365v2","updated":"2023-09-06T02:18:23Z","published":"2023-09-04T05:25:10Z","title":"Refined Temporal Pyramidal Compression-and-Amplification Transformer for\n 3D Human Pose Estimation","summary":" Accurately estimating the 3D pose of humans in video sequences requires both\naccuracy and a well-structured architecture. With the success of transformers,\nwe introduce the Refined Temporal Pyramidal Compression-and-Amplification\n(RTPCA) transformer. Exploiting the temporal dimension, RTPCA extends\nintra-block temporal modeling via its Temporal Pyramidal\nCompression-and-Amplification (TPCA) structure and refines inter-block feature\ninteraction with a Cross-Layer Refinement (XLR) module. In particular, TPCA\nblock exploits a temporal pyramid paradigm, reinforcing key and value\nrepresentation capabilities and seamlessly extracting spatial semantics from\nmotion sequences. We stitch these TPCA blocks with XLR that promotes rich\nsemantic representation through continuous interaction of queries, keys, and\nvalues. This strategy embodies early-stage information with current flows,\naddressing typical deficits in detail and stability seen in other\ntransformer-based methods. We demonstrate the effectiveness of RTPCA by\nachieving state-of-the-art results on Human3.6M, HumanEva-I, and MPI-INF-3DHP\nbenchmarks with minimal computational overhead. The source code is available at\nhttps://github.com/hbing-l/RTPCA.\n","authors":["Hanbing Liu","Wangmeng Xiang","Jun-Yan He","Zhi-Qi Cheng","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2309.01365v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.01811v2","updated":"2023-09-06T02:10:37Z","published":"2023-09-04T21:01:55Z","title":"Instant Continual Learning of Neural Radiance Fields","summary":" Neural radiance fields (NeRFs) have emerged as an effective method for\nnovel-view synthesis and 3D scene reconstruction. However, conventional\ntraining methods require access to all training views during scene\noptimization. This assumption may be prohibitive in continual learning\nscenarios, where new data is acquired in a sequential manner and a continuous\nupdate of the NeRF is desired, as in automotive or remote sensing applications.\nWhen naively trained in such a continual setting, traditional scene\nrepresentation frameworks suffer from catastrophic forgetting, where previously\nlearned knowledge is corrupted after training on new data. Prior works in\nalleviating forgetting with NeRFs suffer from low reconstruction quality and\nhigh latency, making them impractical for real-world application. We propose a\ncontinual learning framework for training NeRFs that leverages replay-based\nmethods combined with a hybrid explicit--implicit scene representation. 
Our\nmethod outperforms previous methods in reconstruction quality when trained in a\ncontinual setting, while having the additional benefit of being an order of\nmagnitude faster.\n","authors":["Ryan Po","Zhengyang Dong","Alexander W. Bergman","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2309.01811v2.pdf","comment":"For project page please visit https://ryanpo.com/icngp/"},{"id":"http://arxiv.org/abs/2303.00973v2","updated":"2023-09-06T01:48:56Z","published":"2023-03-02T05:10:57Z","title":"Image Labels Are All You Need for Coarse Seagrass Segmentation","summary":" Seagrass meadows serve as critical carbon sinks, but estimating the amount of\ncarbon they store requires knowledge of the seagrass species present.\nUnderwater and surface vehicles equipped with machine learning algorithms can\nhelp to accurately estimate the composition and extent of seagrass meadows at\nscale. However, previous approaches for seagrass detection and classification\nhave required supervision from patch-level labels. In this paper, we reframe\nseagrass classification as a weakly supervised coarse segmentation problem\nwhere image-level labels are used during training (25 times fewer labels\ncompared to patch-level labeling) and patch-level outputs are obtained at\ninference time. To this end, we introduce SeaFeats, an architecture that uses\nunsupervised contrastive pre-training and feature similarity, and SeaCLIP, a\nmodel that showcases the effectiveness of large language models as a\nsupervisory signal in domain-specific applications. We demonstrate that an\nensemble of SeaFeats and SeaCLIP leads to highly robust performance. Our method\noutperforms previous approaches that require patch-level labels on the\nmulti-species 'DeepSeagrass' dataset by 6.8% (absolute) for the class-weighted\nF1 score, and by 12.1% (absolute) for the seagrass presence/absence F1 score on\nthe 'Global Wetlands' dataset. We also present two case studies for real-world\ndeployment: outlier detection on the Global Wetlands dataset, and application\nof our method on imagery collected by the FloatyBoat autonomous surface\nvehicle.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2303.00973v2.pdf","comment":"10 pages, 4 figures, additional 3 pages of supplementary material"},{"id":"http://arxiv.org/abs/2309.02636v1","updated":"2023-09-06T00:56:24Z","published":"2023-09-06T00:56:24Z","title":"Multiclass Alignment of Confidence and Certainty for Network Calibration","summary":" Deep neural networks (DNNs) have made great strides in pushing the\nstate-of-the-art in several challenging domains. Recent studies reveal that\nthey are prone to making overconfident predictions. This greatly reduces the\noverall trust in model predictions, especially in safety-critical applications.\nEarly work in improving model calibration employs post-processing techniques\nwhich rely on limited parameters and require a hold-out set. Some recent\ntrain-time calibration methods, which involve all model parameters, can\noutperform the postprocessing methods. To this end, we propose a new train-time\ncalibration method, which features a simple, plug-and-play auxiliary loss known\nas multi-class alignment of predictive mean confidence and predictive certainty\n(MACC). 
It is based on the observation that a model miscalibration is directly\nrelated to its predictive certainty, so a higher gap between the mean\nconfidence and certainty amounts to a poor calibration both for in-distribution\nand out-of-distribution predictions. Armed with this insight, our proposed loss\nexplicitly encourages a confident (or underconfident) model to also provide a\nlow (or high) spread in the presoftmax distribution. Extensive experiments on\nten challenging datasets, covering in-domain, out-domain, non-visual\nrecognition and medical image classification scenarios, show that our method\nachieves state-of-the-art calibration performance for both in-domain and\nout-domain predictions. Our code and models will be publicly released.\n","authors":["Vinith Kugathasan","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2309.02636v1.pdf","comment":"Accepted at GCPR 2023"},{"id":"http://arxiv.org/abs/2309.03406v1","updated":"2023-09-06T23:49:11Z","published":"2023-09-06T23:49:11Z","title":"Distribution-Aware Prompt Tuning for Vision-Language Models","summary":" Pre-trained vision-language models (VLMs) have shown impressive performance\non various downstream tasks by utilizing knowledge learned from large data. In\ngeneral, the performance of VLMs on target tasks can be further improved by\nprompt tuning, which adds context to the input image or text. By leveraging\ndata from target tasks, various prompt-tuning methods have been studied in the\nliterature. A key to prompt tuning is the feature space alignment between two\nmodalities via learnable vectors with model parameters fixed. We observed that\nthe alignment becomes more effective when embeddings of each modality are\n`well-arranged' in the latent space. Inspired by this observation, we proposed\ndistribution-aware prompt tuning (DAPT) for vision-language models, which is\nsimple yet effective. Specifically, the prompts are learned by maximizing\ninter-dispersion, the distance between classes, as well as minimizing the\nintra-dispersion measured by the distance between embeddings from the same\nclass. Our extensive experiments on 11 benchmark datasets demonstrate that our\nmethod significantly improves generalizability. The code is available at\nhttps://github.com/mlvlab/DAPT.\n","authors":["Eulrang Cho","Jooyeon Kim","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2309.03406v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2309.03401v1","updated":"2023-09-06T23:35:55Z","published":"2023-09-06T23:35:55Z","title":"Reasonable Anomaly Detection in Long Sequences","summary":" Video anomaly detection is a challenging task due to the lack in approaches\nfor representing samples. The visual representations of most existing\napproaches are limited by short-term sequences of observations which cannot\nprovide enough clues for achieving reasonable detections. In this paper, we\npropose to completely represent the motion patterns of objects by learning from\nlong-term sequences. Firstly, a Stacked State Machine (SSM) model is proposed\nto represent the temporal dependencies which are consistent across long-range\nobservations. Then SSM model functions in predicting future states based on\npast ones, the divergence between the predictions with inherent normal patterns\nand observed ones determines anomalies which violate normal motion patterns.\nExtensive experiments are carried out to evaluate the proposed approach on the\ndataset and existing ones. Improvements over state-of-the-art methods can be\nobserved. 
Our code is available at\nhttps://github.com/AllenYLJiang/Anomaly-Detection-in-Sequences.\n","authors":["Yalong Jiang","Changkang Li"],"pdf_url":"https://arxiv.org/pdf/2309.03401v1.pdf","comment":"8 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.03390v1","updated":"2023-09-06T22:50:50Z","published":"2023-09-06T22:50:50Z","title":"A novel method for iris recognition using BP neural network and parallel\n computing by the aid of GPUs (Graphics Processing Units)","summary":" In this paper, we seek a new method in designing an iris recognition system.\nIn this method, first the Haar wavelet features are extracted from iris images.\nThe advantage of using these features is the high-speed extraction, as well as\nbeing unique to each iris. Then the back propagation neural network (BPNN) is\nused as a classifier. In this system, the BPNN parallel algorithms and their\nimplementation on GPUs have been used by the aid of CUDA in order to speed up\nthe learning process. Finally, the system performance and the speeding outcomes\nin a way that this algorithm is done in series are presented.\n","authors":["Farahnaz Hosseini","Hossein Ebrahimpour","Samaneh Askari"],"pdf_url":"https://arxiv.org/pdf/2309.03390v1.pdf","comment":"8 pages,"},{"id":"http://arxiv.org/abs/2309.03383v1","updated":"2023-09-06T22:04:07Z","published":"2023-09-06T22:04:07Z","title":"Kidney abnormality segmentation in thorax-abdomen CT scans","summary":" In this study, we introduce a deep learning approach for segmenting kidney\nparenchyma and kidney abnormalities to support clinicians in identifying and\nquantifying renal abnormalities such as cysts, lesions, masses, metastases, and\nprimary tumors. Our end-to-end segmentation method was trained on 215\ncontrast-enhanced thoracic-abdominal CT scans, with half of these scans\ncontaining one or more abnormalities.\n We began by implementing our own version of the original 3D U-Net network and\nincorporated four additional components: an end-to-end multi-resolution\napproach, a set of task-specific data augmentations, a modified loss function\nusing top-$k$, and spatial dropout. Furthermore, we devised a tailored\npost-processing strategy. Ablation studies demonstrated that each of the four\nmodifications enhanced kidney abnormality segmentation performance, while three\nout of four improved kidney parenchyma segmentation. Subsequently, we trained\nthe nnUNet framework on our dataset. By ensembling the optimized 3D U-Net and\nthe nnUNet with our specialized post-processing, we achieved marginally\nsuperior results.\n Our best-performing model attained Dice scores of 0.965 and 0.947 for\nsegmenting kidney parenchyma in two test sets (20 scans without abnormalities\nand 30 with abnormalities), outperforming an independent human observer who\nscored 0.944 and 0.925, respectively. In segmenting kidney abnormalities within\nthe 30 test scans containing them, the top-performing method achieved a Dice\nscore of 0.585, while an independent second human observer reached a score of\n0.664, suggesting potential for further improvement in computerized methods.\n All training data is available to the research community under a CC-BY 4.0\nlicense on https://doi.org/10.5281/zenodo.8014289\n","authors":["Gabriel Efrain Humpire Mamani","Nikolas Lessmann","Ernst Th. 
Scholten","Mathias Prokop","Colin Jacobs","Bram van Ginneken"],"pdf_url":"https://arxiv.org/pdf/2309.03383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03381v1","updated":"2023-09-06T21:58:58Z","published":"2023-09-06T21:58:58Z","title":"Active shooter detection and robust tracking utilizing supplemental\n synthetic data","summary":" The increasing concern surrounding gun violence in the United States has led\nto a focus on developing systems to improve public safety. One approach to\ndeveloping such a system is to detect and track shooters, which would help\nprevent or mitigate the impact of violent incidents. In this paper, we proposed\ndetecting shooters as a whole, rather than just guns, which would allow for\nimproved tracking robustness, as obscuring the gun would no longer cause the\nsystem to lose sight of the threat. However, publicly available data on\nshooters is much more limited and challenging to create than a gun dataset\nalone. Therefore, we explore the use of domain randomization and transfer\nlearning to improve the effectiveness of training with synthetic data obtained\nfrom Unreal Engine environments. This enables the model to be trained on a\nwider range of data, increasing its ability to generalize to different\nsituations. Using these techniques with YOLOv8 and Deep OC-SORT, we implemented\nan initial version of a shooter tracking system capable of running on edge\nhardware, including both a Raspberry Pi and a Jetson Nano.\n","authors":["Joshua R. Waite","Jiale Feng","Riley Tavassoli","Laura Harris","Sin Yong Tan","Subhadeep Chakraborty","Soumik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2309.03381v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2302.03744v3","updated":"2023-09-06T21:38:20Z","published":"2023-02-07T20:48:35Z","title":"3D Neural Embedding Likelihood: Probabilistic Inverse Graphics for\n Robust 6D Pose Estimation","summary":" The ability to perceive and understand 3D scenes is crucial for many\napplications in computer vision and robotics. Inverse graphics is an appealing\napproach to 3D scene understanding that aims to infer the 3D scene structure\nfrom 2D images. In this paper, we introduce probabilistic modeling to the\ninverse graphics framework to quantify uncertainty and achieve robustness in 6D\npose estimation tasks. Specifically, we propose 3D Neural Embedding Likelihood\n(3DNEL) as a unified probabilistic model over RGB-D images, and develop\nefficient inference procedures on 3D scene descriptions. 3DNEL effectively\ncombines learned neural embeddings from RGB with depth information to improve\nrobustness in sim-to-real 6D object pose estimation from RGB-D images.\nPerformance on the YCB-Video dataset is on par with state-of-the-art yet is\nmuch more robust in challenging regimes. In contrast to discriminative\napproaches, 3DNEL's probabilistic generative formulation jointly models\nmultiple objects in a scene, quantifies uncertainty in a principled way, and\nhandles object pose tracking under heavy occlusion. Finally, 3DNEL provides a\nprincipled framework for incorporating prior knowledge about the scene and\nobjects, which allows natural extension to additional tasks like camera pose\ntracking from video.\n","authors":["Guangyao Zhou","Nishad Gothoskar","Lirui Wang","Joshua B. Tenenbaum","Dan Gutfreund","Miguel Lázaro-Gredilla","Dileep George","Vikash K. 
Mansinghka"],"pdf_url":"https://arxiv.org/pdf/2302.03744v3.pdf","comment":"ICCV 2023 camera ready"},{"id":"http://arxiv.org/abs/2303.09681v4","updated":"2023-09-06T21:34:59Z","published":"2023-03-16T22:56:12Z","title":"Event-based Human Pose Tracking by Spiking Spatiotemporal Transformer","summary":" Event camera, as an emerging biologically-inspired vision sensor for\ncapturing motion dynamics, presents new potential for 3D human pose tracking,\nor video-based 3D human pose estimation. However, existing works in pose\ntracking either require the presence of additional gray-scale images to\nestablish a solid starting pose, or ignore the temporal dependencies all\ntogether by collapsing segments of event streams to form static event frames.\nMeanwhile, although the effectiveness of Artificial Neural Networks (ANNs,\na.k.a. dense deep learning) has been showcased in many event-based tasks, the\nuse of ANNs tends to neglect the fact that compared to the dense frame-based\nimage sequences, the occurrence of events from an event camera is\nspatiotemporally much sparser. Motivated by the above mentioned issues, we\npresent in this paper a dedicated end-to-end sparse deep learning approach for\nevent-based pose tracking: 1) to our knowledge this is the first time that 3D\nhuman pose tracking is obtained from events only, thus eliminating the need of\naccessing to any frame-based images as part of input; 2) our approach is based\nentirely upon the framework of Spiking Neural Networks (SNNs), which consists\nof Spike-Element-Wise (SEW) ResNet and a novel Spiking Spatiotemporal\nTransformer; 3) a large-scale synthetic dataset is constructed that features a\nbroad and diverse set of annotated 3D human motions, as well as longer hours of\nevent stream data, named SynEventHPD. Empirical experiments demonstrate that,\nwith superior performance over the state-of-the-art (SOTA) ANNs counterparts,\nour approach also achieves a significant computation reduction of 80% in FLOPS.\nFurthermore, our proposed method also outperforms SOTA SNNs in the regression\ntask of human pose tracking. Our implementation is available at\nhttps://github.com/JimmyZou/HumanPoseTracking_SNN and dataset will be released\nupon paper acceptance.\n","authors":["Shihao Zou","Yuxuan Mu","Xinxin Zuo","Sen Wang","Li Cheng"],"pdf_url":"https://arxiv.org/pdf/2303.09681v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03367v1","updated":"2023-09-06T21:20:10Z","published":"2023-09-06T21:20:10Z","title":"Self-Supervised Masked Digital Elevation Models Encoding for\n Low-Resource Downstream Tasks","summary":" The lack of quality labeled data is one of the main bottlenecks for training\nDeep Learning models. As the task increases in complexity, there is a higher\npenalty for overfitting and unstable learning. The typical paradigm employed\ntoday is Self-Supervised learning, where the model attempts to learn from a\nlarge corpus of unstructured and unlabeled data and then transfer that\nknowledge to the required task. Some notable examples of self-supervision in\nother modalities are BERT for Large Language Models, Wav2Vec for Speech\nRecognition, and the Masked AutoEncoder for Vision, which all utilize\nTransformers to solve a masked prediction task. GeoAI is uniquely poised to\ntake advantage of the self-supervised methodology due to the decades of data\ncollected, little of which is precisely and dependably annotated. 
Our goal is\nto extract building and road segmentations from Digital Elevation Models (DEM)\nthat provide a detailed topography of the earth's surface. The proposed\narchitecture is the Masked Autoencoder pre-trained on ImageNet (with the\nlimitation that there is a large domain discrepancy between ImageNet and DEM)\nwith an UperNet Head for decoding segmentations. We tested this model with 450\nand 50 training images only, utilizing roughly 5% and 0.5% of the original data\nrespectively. On the building segmentation task, this model obtains an 82.1%\nIntersection over Union (IoU) with 450 images and 69.1% IoU with only 50\nimages. On the more challenging road detection task the model obtains an 82.7%\nIoU with 450 images and 73.2% IoU with only 50 images. Any hand-labeled dataset\nmade today about the earth's surface will be immediately obsolete due to the\nconstantly changing nature of the landscape. This motivates the clear necessity\nfor data-efficient learners that can be used for a wide variety of downstream\ntasks.\n","authors":["Priyam Mazumdar","Aiman Soliman","Volodymyr Kindratenko","Luigi Marini","Kenton McHenry"],"pdf_url":"https://arxiv.org/pdf/2309.03367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03360v1","updated":"2023-09-06T21:04:53Z","published":"2023-09-06T21:04:53Z","title":"ViewMix: Augmentation for Robust Representation in Self-Supervised\n Learning","summary":" Joint Embedding Architecture-based self-supervised learning methods have\nattributed the composition of data augmentations as a crucial factor for their\nstrong representation learning capabilities. While regional dropout strategies\nhave proven to guide models to focus on lesser indicative parts of the objects\nin supervised methods, they haven't been adopted by self-supervised methods for\ngenerating positive pairs. This is because the regional dropout methods are not\nsuitable for the input sampling process of the self-supervised methodology.\nWhereas dropping informative pixels from the positive pairs can result in\ninefficient training, replacing patches of a specific object with a different\none can steer the model away from maximizing the agreement between different\npositive pairs. Moreover, joint embedding representation learning methods have\nnot made robustness their primary training outcome. To this end, we propose the\nViewMix augmentation policy, specially designed for self-supervised learning:\nupon generating different views of the same image, patches are cut and pasted\nfrom one view to another. By leveraging the different views created by this\naugmentation strategy, multiple joint embedding-based self-supervised\nmethodologies obtained better localization capability and consistently\noutperformed their corresponding baseline methods. It is also demonstrated that\nincorporating the ViewMix augmentation policy promotes robustness of the\nrepresentations in the state-of-the-art methods. Furthermore, our\nexperimentation and analysis of compute times suggest that ViewMix augmentation\ndoesn't introduce any additional overhead compared to other counterparts.\n","authors":["Arjon Das","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2309.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01188v4","updated":"2023-09-06T20:57:32Z","published":"2023-06-01T22:57:32Z","title":"Event-based Stereo Visual Odometry with Native Temporal Resolution via\n Continuous-time Gaussian Process Regression","summary":" Event-based cameras asynchronously capture individual visual changes in a\nscene. 
This makes them more robust than traditional frame-based cameras to\nhighly dynamic motions and poor illumination. It also means that every\nmeasurement in a scene can occur at a unique time.\n Handling these different measurement times is a major challenge of using\nevent-based cameras. It is often addressed in visual odometry (VO) pipelines by\napproximating temporally close measurements as occurring at one common time.\nThis grouping simplifies the estimation problem but, absent additional sensors,\nsacrifices the inherent temporal resolution of event-based cameras.\n This paper instead presents a complete stereo VO pipeline that estimates\ndirectly with individual event-measurement times without requiring any grouping\nor approximation in the estimation state. It uses continuous-time trajectory\nestimation to maintain the temporal fidelity and asynchronous nature of\nevent-based cameras through Gaussian process regression with a physically\nmotivated prior. Its performance is evaluated on the MVSEC dataset, where it\nachieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences,\noutperforming the existing publicly available event-based stereo VO pipeline by\ntwo and four times, respectively.\n","authors":["Jianeng Wang","Jonathan D. Gammell"],"pdf_url":"https://arxiv.org/pdf/2306.01188v4.pdf","comment":"To appear in IEEE Robotics and Automation Letters (RA-L). 8 pages, 4\n figures. DOI: 10.1109/LRA.2023.3311374"},{"id":"http://arxiv.org/abs/2309.03353v1","updated":"2023-09-06T20:36:17Z","published":"2023-09-06T20:36:17Z","title":"Source Camera Identification and Detection in Digital Videos through\n Blind Forensics","summary":" Source camera identification in digital videos is the problem of associating\nan unknown digital video with its source device, within a closed set of\npossible devices. The existing techniques in source detection of digital videos\ntry to find a fingerprint of the actual source in the video in form of PRNU\n(Photo Response Non--Uniformity), and match it against the SPN (Sensor Pattern\nNoise) of each possible device. The highest correlation indicates the correct\nsource. We investigate the problem of identifying a video source through a\nfeature based approach using machine learning. In this paper, we present a\nblind forensic technique of video source authentication and identification,\nbased on feature extraction, feature selection and subsequent source\nclassification. The main aim is to determine whether a claimed source for a\nvideo is actually its original source. If not, we identify its original source.\nOur experimental results prove the efficiency of the proposed method compared\nto traditional fingerprint based technique.\n","authors":["Venkata Udaya Sameer","Shilpa Mukhopadhyay","Ruchira Naskar","Ishaan Dali"],"pdf_url":"https://arxiv.org/pdf/2309.03353v1.pdf","comment":"Submitted to IEEE for inclusion in Xplore- Digital Library. Paper\n presented at the International Conference on Recent Trends in Computational\n Engineering & Technologies (ICRTCET 18)with Paper Id: ICRTCET-227"},{"id":"http://arxiv.org/abs/2309.03351v1","updated":"2023-09-06T20:24:13Z","published":"2023-09-06T20:24:13Z","title":"Using Neural Networks for Fast SAR Roughness Estimation of High\n Resolution Images","summary":" The analysis of Synthetic Aperture Radar (SAR) imagery is an important step\nin remote sensing applications, and it is a challenging problem due to its\ninherent speckle noise. 
One typical solution is to model the data using the\n$G_I^0$ distribution and extract its roughness information, which in turn can\nbe used in posterior imaging tasks, such as segmentation, classification and\ninterpretation. This leads to the need for quick and reliable estimation of the\nroughness parameter from SAR data, especially with high resolution images.\nUnfortunately, traditional parameter estimation procedures are slow and prone\nto estimation failures. In this work, we propose a neural network-based\nestimation framework that first learns how to predict underlying parameters of\n$G_I^0$ samples and then can be used to estimate the roughness of unseen data.\nWe show that this approach leads to an estimator that is quicker, yields lower\nestimation error and is less prone to failures than the traditional estimation\nprocedures for this problem, even when we use a simple network. More\nimportantly, we show that this same methodology can be generalized to handle\nimage inputs and, even if trained on purely synthetic data for a few seconds,\nis able to perform real time pixel-wise roughness estimation for high\nresolution real SAR imagery.\n","authors":["Li Fan","Jeova Farias Sales Rocha Neto"],"pdf_url":"https://arxiv.org/pdf/2309.03351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13166v2","updated":"2023-09-06T20:13:34Z","published":"2023-06-22T19:06:27Z","title":"A Sparse Graph Formulation for Efficient Spectral Image Segmentation","summary":" Spectral Clustering is one of the most traditional methods to solve\nsegmentation problems. Based on Normalized Cuts, it aims at partitioning an\nimage using an objective function defined by a graph. Despite their\nmathematical attractiveness, spectral approaches are traditionally neglected by\nthe scientific community due to their practical issues and underperformance. In\nthis paper, we adopt a sparse graph formulation based on the inclusion of extra\nnodes in a simple grid graph. While the grid encodes the pixel spatial\ndisposition, the extra nodes account for the pixel color data. Applying the\noriginal Normalized Cuts algorithm to this graph leads to a simple and scalable\nmethod for spectral image segmentation, with an interpretable solution. Our\nexperiments also demonstrate that our proposed methodology outperforms both\ntraditional and modern unsupervised algorithms for segmentation in both real\nand synthetic data.\n","authors":["Rahul Palnitkar","Jeova Farias Sales Rocha Neto"],"pdf_url":"https://arxiv.org/pdf/2306.13166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03335v1","updated":"2023-09-06T19:30:22Z","published":"2023-09-06T19:30:22Z","title":"SADIR: Shape-Aware Diffusion Models for 3D Image Reconstruction","summary":" 3D image reconstruction from a limited number of 2D images has been a\nlong-standing challenge in computer vision and image analysis. While deep\nlearning-based approaches have achieved impressive performance in this area,\nexisting deep networks often fail to effectively utilize the shape structures\nof objects presented in images. As a result, the topology of reconstructed\nobjects may not be well preserved, leading to the presence of artifacts such as\ndiscontinuities, holes, or mismatched connections between different parts. In\nthis paper, we propose a shape-aware network based on diffusion models for 3D\nimage reconstruction, named SADIR, to address these issues. 
In contrast to\nprevious methods that primarily rely on spatial correlations of image\nintensities for 3D reconstruction, our model leverages shape priors learned\nfrom the training data to guide the reconstruction process. To achieve this, we\ndevelop a joint learning network that simultaneously learns a mean shape under\ndeformation models. Each reconstructed image is then considered as a deformed\nvariant of the mean shape. We validate our model, SADIR, on both brain and\ncardiac magnetic resonance images (MRIs). Experimental results show that our\nmethod outperforms the baselines with lower reconstruction error and better\npreservation of the shape structure of objects within the images.\n","authors":["Nivetha Jayakumar","Tonmoy Hossain","Miaomiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03335v1.pdf","comment":"ShapeMI MICCAI 2023: Workshop on Shape in Medical Imaging"},{"id":"http://arxiv.org/abs/2309.03331v1","updated":"2023-09-06T19:19:41Z","published":"2023-09-06T19:19:41Z","title":"Expert Uncertainty and Severity Aware Chest X-Ray Classification by\n Multi-Relationship Graph Learning","summary":" Patients undergoing chest X-rays (CXR) often endure multiple lung diseases.\nWhen evaluating a patient's condition, due to the complex pathologies, subtle\ntexture changes of different lung lesions in images, and patient condition\ndifferences, radiologists may remain uncertain even when they have received\nlong-term clinical training and professional guidance, which introduces\nconsiderable noise into disease labels extracted from CXR reports. In this paper, we re-extract\ndisease labels from CXR reports to make them more realistic by considering\ndisease severity and uncertainty in classification. Our contributions are as\nfollows: 1. We re-extracted the disease labels with severity and uncertainty by\na rule-based approach with keywords discussed with clinical experts. 2. To\nfurther improve the explainability of chest X-ray diagnosis, we designed a\nmulti-relationship graph learning method with an expert uncertainty-aware loss\nfunction. 3. Our multi-relationship graph learning method can also interpret\nthe disease classification results. Our experimental results show that models\nconsidering disease severity and uncertainty outperform previous\nstate-of-the-art methods.\n","authors":["Mengliang Zhang","Xinyue Hu","Lin Gu","Liangchen Liu","Kazuma Kobayashi","Tatsuya Harada","Ronald M. Summers","Yingying Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.03331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03329v1","updated":"2023-09-06T19:19:12Z","published":"2023-09-06T19:19:12Z","title":"MEGANet: Multi-Scale Edge-Guided Attention Network for Weak Boundary\n Polyp Segmentation","summary":" Efficient polyp segmentation in healthcare plays a critical role in enabling\nearly diagnosis of colorectal cancer. However, the segmentation of polyps\npresents numerous challenges, including the intricate distribution of\nbackgrounds, variations in polyp sizes and shapes, and indistinct boundaries.\nDefining the boundary between the foreground (i.e. polyp itself) and the\nbackground (surrounding tissue) is difficult. To mitigate these challenges, we\npropose Multi-Scale Edge-Guided Attention Network (MEGANet) tailored\nspecifically for polyp segmentation within colonoscopy images. This network\ndraws inspiration from the fusion of a classical edge detection technique with\nan attention mechanism. 
By combining these techniques, MEGANet effectively\npreserves high-frequency information, notably edges and boundaries, which tend\nto erode as neural networks deepen. MEGANet is designed as an end-to-end\nframework, encompassing three key modules: an encoder, which is responsible for\ncapturing and abstracting the features from the input image, a decoder, which\nfocuses on salient features, and the Edge-Guided Attention module (EGA) that\nemploys the Laplacian Operator to accentuate polyp boundaries. Extensive\nexperiments, both qualitative and quantitative, on five benchmark datasets,\ndemonstrate that our EGANet outperforms other existing SOTA methods under six\nevaluation metrics. Our code is available at\n\\url{https://github.com/DinhHieuHoang/MEGANet}\n","authors":["Nhat-Tan Bui","Dinh-Hieu Hoang","Quang-Thuc Nguyen","Minh-Triet Tran","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2309.03329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03320v1","updated":"2023-09-06T19:01:58Z","published":"2023-09-06T19:01:58Z","title":"CoNeS: Conditional neural fields with shift modulation for\n multi-sequence MRI translation","summary":" Multi-sequence magnetic resonance imaging (MRI) has found wide applications\nin both modern clinical studies and deep learning research. However, in\nclinical practice, it frequently occurs that one or more of the MRI sequences\nare missing due to different image acquisition protocols or contrast agent\ncontraindications of patients, limiting the utilization of deep learning models\ntrained on multi-sequence data. One promising approach is to leverage\ngenerative models to synthesize the missing sequences, which can serve as a\nsurrogate acquisition. State-of-the-art methods tackling this problem are based\non convolutional neural networks (CNN) which usually suffer from spectral\nbiases, resulting in poor reconstruction of high-frequency fine details. In\nthis paper, we propose Conditional Neural fields with Shift modulation (CoNeS),\na model that takes voxel coordinates as input and learns a representation of\nthe target images for multi-sequence MRI translation. The proposed model uses a\nmulti-layer perceptron (MLP) instead of a CNN as the decoder for pixel-to-pixel\nmapping. Hence, each target image is represented as a neural field that is\nconditioned on the source image via shift modulation with a learned latent\ncode. Experiments on BraTS 2018 and an in-house clinical dataset of vestibular\nschwannoma patients showed that the proposed method outperformed\nstate-of-the-art methods for multi-sequence MRI translation both visually and\nquantitatively. Moreover, we conducted spectral analysis, showing that CoNeS\nwas able to overcome the spectral bias issue common in conventional CNN models.\nTo further evaluate the usage of synthesized images in clinical downstream\ntasks, we tested a segmentation network using the synthesized images at\ninference.\n","authors":["Yunjie Chen","Marius Staring","Olaf M. Neve","Stephan R. Romeijn","Erik F. Hensen","Berit M. Verbist","Jelmer M. Wolterink","Qian Tao"],"pdf_url":"https://arxiv.org/pdf/2309.03320v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.05059v3","updated":"2023-09-06T18:20:08Z","published":"2020-07-09T20:53:45Z","title":"Learning Representations that Support Extrapolation","summary":" Extrapolation -- the ability to make inferences that go beyond the scope of\none's experiences -- is a hallmark of human intelligence. 
By contrast, the\ngeneralization exhibited by contemporary neural network algorithms is largely\nlimited to interpolation between data points in their training corpora. In this\npaper, we consider the challenge of learning representations that support\nextrapolation. We introduce a novel visual analogy benchmark that allows the\ngraded evaluation of extrapolation as a function of distance from the convex\ndomain defined by the training data. We also introduce a simple technique,\ntemporal context normalization, that encourages representations that emphasize\nthe relations between objects. We find that this technique enables a\nsignificant improvement in the ability to extrapolate, considerably\noutperforming a number of competitive techniques.\n","authors":["Taylor W. Webb","Zachary Dulberg","Steven M. Frankland","Alexander A. Petrov","Randall C. O'Reilly","Jonathan D. Cohen"],"pdf_url":"https://arxiv.org/pdf/2007.05059v3.pdf","comment":"ICML 2020"},{"id":"http://arxiv.org/abs/2309.03295v1","updated":"2023-09-06T18:17:47Z","published":"2023-09-06T18:17:47Z","title":"Comparative Analysis of Deep-Fake Algorithms","summary":" Due to the widespread use of smartphones with high-quality digital cameras\nand easy access to a wide range of software apps for recording, editing, and\nsharing videos and images, as well as the deep learning AI platforms, a new\nphenomenon of 'faking' videos has emerged. Deepfake algorithms can create fake\nimages and videos that are virtually indistinguishable from authentic ones.\nTherefore, technologies that can detect and assess the integrity of digital\nvisual media are crucial. Deepfakes, also known as deep learning-based fake\nvideos, have become a major concern in recent years due to their ability to\nmanipulate and alter images and videos in a way that is virtually\nindistinguishable from the original. These deepfake videos can be used for\nmalicious purposes such as spreading misinformation, impersonating individuals,\nand creating fake news. Deepfake detection technologies use various approaches\nsuch as facial recognition, motion analysis, and audio-visual synchronization\nto identify and flag fake videos. However, the rapid advancement of deepfake\ntechnologies has made it increasingly difficult to detect these videos with\nhigh accuracy. In this paper, we aim to provide a comprehensive review of the\ncurrent state of deepfake creation and detection technologies. We examine the\nvarious deep learning-based approaches used for creating deepfakes, as well as\nthe techniques used for detecting them. Additionally, we analyze the\nlimitations and challenges of current deepfake detection methods and discuss\nfuture research directions in this field. 
Overall, the paper highlights the\nimportance of continued research and development in deepfake detection\ntechnologies in order to combat the negative impact of deepfakes on society and\nensure the integrity of digital visual media.\n","authors":["Nikhil Sontakke","Sejal Utekar","Shivansh Rastogi","Shriraj Sonawane"],"pdf_url":"https://arxiv.org/pdf/2309.03295v1.pdf","comment":"7 pages, 4 figures, 2 tables, Published with International Journal of\n Computer Science Trends and Technology (IJCST)"},{"id":"http://arxiv.org/abs/2309.03185v1","updated":"2023-09-06T17:44:34Z","published":"2023-09-06T17:44:34Z","title":"Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields","summary":" Neural Radiance Fields (NeRFs) have shown promise in applications like view\nsynthesis and depth estimation, but learning from multiview images faces\ninherent uncertainties. Current methods to quantify them are either heuristic\nor computationally demanding. We introduce BayesRays, a post-hoc framework to\nevaluate uncertainty in any pre-trained NeRF without modifying the training\nprocess. Our method establishes a volumetric uncertainty field using spatial\nperturbations and a Bayesian Laplace approximation. We derive our algorithm\nstatistically and show its superior performance in key metrics and\napplications. Additional results available at: https://bayesrays.github.io.\n","authors":["Lily Goli","Cody Reading","Silvia Sellán","Alec Jacobson","Andrea Tagliasacchi"],"pdf_url":"https://arxiv.org/pdf/2309.03185v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.03169v1","updated":"2023-09-06T17:09:43Z","published":"2023-09-06T17:09:43Z","title":"Impression-Informed Multi-Behavior Recommender System: A Hierarchical\n Graph Attention Approach","summary":" While recommender systems have significantly benefited from implicit\nfeedback, they have often missed the nuances of multi-behavior interactions\nbetween users and items. Historically, these systems either amalgamated all\nbehaviors, such as \\textit{impression} (formerly \\textit{view}),\n\\textit{add-to-cart}, and \\textit{buy}, under a singular 'interaction' label,\nor prioritized only the target behavior, often the \\textit{buy} action,\ndiscarding valuable auxiliary signals. Although recent advancements tried\naddressing this simplification, they primarily gravitated towards optimizing\nthe target behavior alone, battling with data scarcity. Additionally, they\ntended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these\ngaps, we introduce the \\textbf{H}ierarchical \\textbf{M}ulti-behavior\n\\textbf{G}raph Attention \\textbf{N}etwork (HMGN). This pioneering framework\nleverages attention mechanisms to discern information from both inter and\nintra-behaviors while employing a multi-task Hierarchical Bayesian Personalized\nRanking (HBPR) for optimization. Recognizing the need for scalability, our\napproach integrates a specialized multi-behavior sub-graph sampling technique.\nMoreover, the adaptability of HMGN allows for the seamless inclusion of\nknowledge metadata and time-series data. 
Empirical results attest to our\nmodel's prowess, registering a notable performance boost of up to 64\\% in\nNDCG@100 metrics over conventional graph neural network methods.\n","authors":["Dong Li","Divya Bhargavi","Vidya Sagar Ravipati"],"pdf_url":"https://arxiv.org/pdf/2309.03169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10807v2","updated":"2023-09-06T14:27:17Z","published":"2023-08-21T15:56:05Z","title":"DynED: Dynamic Ensemble Diversification in Data Stream Classification","summary":" Ensemble methods are commonly used in classification due to their remarkable\nperformance. Achieving high accuracy in a data stream environment is a\nchallenging task considering disruptive changes in the data distribution, also\nknown as concept drift. A greater diversity of ensemble components is known to\nenhance prediction accuracy in such settings. Despite the diversity of\ncomponents within an ensemble, not all contribute as expected to its overall\nperformance. This necessitates a method for selecting components that exhibit\nhigh performance and diversity. We present a novel ensemble construction and\nmaintenance approach based on MMR (Maximal Marginal Relevance) that dynamically\ncombines the diversity and prediction accuracy of components during the process\nof structuring an ensemble. The experimental results on both four real and 11\nsynthetic datasets demonstrate that the proposed approach (DynED) provides a\nhigher average mean accuracy compared to the five state-of-the-art baselines.\n","authors":["Soheil Abadifard","Sepehr Bakhshi","Sanaz Gheibuni","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.10807v2.pdf","comment":"Proceedings of the 32nd ACM International Conference on Information\n and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United\n Kingdom"},{"id":"http://arxiv.org/abs/2309.02978v1","updated":"2023-09-06T13:22:20Z","published":"2023-09-06T13:22:20Z","title":"Helper Recommendation with seniority control in Online Health Community","summary":" Online health communities (OHCs) are forums where patients with similar\nconditions communicate their experiences and provide moral support. Social\nsupport in OHCs plays a crucial role in easing and rehabilitating patients.\nHowever, many time-sensitive questions from patients often remain unanswered\ndue to the multitude of threads and the random nature of patient visits in\nOHCs. To address this issue, it is imperative to propose a recommender system\nthat assists solution seekers in finding appropriate problem helpers.\nNevertheless, developing a recommendation algorithm to enhance social support\nin OHCs remains an under-explored area. Traditional recommender systems cannot\nbe directly adapted due to the following obstacles. First, unlike user-item\nlinks in traditional recommender systems, it is hard to model the social\nsupport behind helper-seeker links in OHCs since they are formed based on\nvarious heterogeneous reasons. Second, it is difficult to distinguish the\nimpact of historical activities in characterizing patients. Third, it is\nsignificantly challenging to ensure that the recommended helpers possess\nsufficient expertise to assist the seekers. 
To tackle the aforementioned\nchallenges, we develop a Monotonically regularIzed diseNTangled Variational\nAutoencoders (MINT) model to strengthen social support in OHCs.\n","authors":["Junruo Gao","Chen Ling","Carl Yang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.02978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07041v2","updated":"2023-09-06T13:08:44Z","published":"2023-04-14T10:29:18Z","title":"A Diffusion model for POI recommendation","summary":" Next Point-of-Interest (POI) recommendation is a critical task in\nlocation-based services that aim to provide personalized suggestions for the\nuser's next destination. Previous works on POI recommendation have focused\non modeling the user's spatial preference. However, existing works that\nleverage spatial information are only based on the aggregation of users'\npreviously visited positions, which discourages the model from recommending POIs\nin novel areas. This trait of position-based methods will harm the model's\nperformance in many situations. Additionally, incorporating sequential\ninformation into the user's spatial preference remains a challenge. In this\npaper, we propose Diff-POI: a Diffusion-based model that samples the user's\nspatial preference for the next POI recommendation. Inspired by the wide\napplication of diffusion algorithms in sampling from distributions, Diff-POI\nencodes the user's visiting sequence and spatial character with two\ntailor-designed graph encoding modules, followed by a diffusion-based sampling\nstrategy to explore the user's spatial visiting trends. We leverage the\ndiffusion process and its reversed form to sample from the posterior\ndistribution and optimize the corresponding score function. We design a joint\ntraining and inference framework to optimize and evaluate the proposed\nDiff-POI. Extensive experiments on four real-world POI recommendation datasets\ndemonstrate the superiority of our Diff-POI over state-of-the-art baseline\nmethods. Further ablation and parameter studies on Diff-POI reveal the\nfunctionality and effectiveness of the proposed diffusion-based sampling\nstrategy for addressing the limitations of existing methods.\n","authors":["Yifang Qin","Hongjun Wu","Wei Ju","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02962v1","updated":"2023-09-06T12:57:32Z","published":"2023-09-06T12:57:32Z","title":"Prompt-based Effective Input Reformulation for Legal Case Retrieval","summary":" Legal case retrieval plays an important role for legal practitioners to\neffectively retrieve relevant cases given a query case. Most existing neural\nlegal case retrieval models directly encode the whole legal text of a case to\ngenerate a case representation, which is then utilised to conduct a nearest\nneighbour search for retrieval. 
Although these straightforward methods have\nachieved improvement over conventional statistical methods in retrieval\naccuracy, two significant challenges are identified in this paper: (1) Legal\nfeature alignment: the usage of the whole case text as the input will generally\nincorporate redundant and noisy information because, from the legal\nperspective, the determining factor of relevant cases is the alignment of key\nlegal features instead of whole text matching; (2) Legal context preservation:\nfurthermore, since the existing text encoding models usually have an input\nlength limit shorter than the case, the whole case text needs to be truncated\nor divided into paragraphs, which leads to the loss of the global context of\nlegal information. In this paper, a novel legal case retrieval framework,\nPromptCase, is proposed to tackle these challenges. Firstly, legal facts and\nlegal issues are identified and formally defined as the key features\nfacilitating legal case retrieval based on a thorough study of the definition\nof relevant cases from a legal perspective. Secondly, with the determining\nlegal features, a prompt-based encoding scheme is designed to conduct an\neffective encoding with language models. Extensive zero-shot experiments have\nbeen conducted on two benchmark datasets in legal case retrieval, which\ndemonstrate the superior retrieval effectiveness of the proposed PromptCase.\nThe code has been released on https://github.com/yanran-tang/PromptCase.\n","authors":["Yanran Tang","Ruihong Qiu","Xue Li"],"pdf_url":"https://arxiv.org/pdf/2309.02962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04497v2","updated":"2023-09-06T12:41:48Z","published":"2023-04-10T10:22:21Z","title":"A Unified Framework for Exploratory Learning-Aided Community Detection\n in Networks with Unknown Topology","summary":" In social networks, the discovery of community structures has received\nconsiderable attention as a fundamental problem in various network analysis\ntasks. However, due to privacy concerns or access restrictions, the network\nstructure is often unknown, thereby rendering established community detection\napproaches ineffective without costly network topology acquisition. To tackle\nthis challenge, we present META-CODE, a unified framework for detecting\noverlapping communities in networks with unknown topology via exploratory\nlearning aided by easy-to-collect node metadata. 
Specifically, META-CODE\nconsists of three iterative steps in addition to the initial network inference\nstep: 1) node-level community-affiliation embeddings based on graph neural\nnetworks (GNNs) trained by our new reconstruction loss, 2) network exploration\nvia community-affiliation-based node queries, and 3) network inference using an\nedge connectivity-based Siamese neural network model from the explored network.\nThrough extensive experiments on five real-world datasets including two large\nnetworks, we demonstrated: (a) the superiority of META-CODE over benchmark\ncommunity detection methods, achieving remarkable gains up to 151.27% compared\nto the best existing competitor, (b) the impact of each module in META-CODE,\n(c) the effectiveness of node queries in META-CODE based on empirical\nevaluations and theoretical findings, (d) the convergence of the inferred\nnetwork, and (e) the computational efficiency of META-CODE.\n","authors":["Yu Hou","Cong Tran","Ming Li","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2304.04497v2.pdf","comment":"16 pages, 9 figures, 6 tables; its conference version was presented\n at the ACM International Conference on Information and Knowledge Management\n (CIKM 2022)"},{"id":"http://arxiv.org/abs/2306.17125v2","updated":"2023-09-06T07:30:33Z","published":"2023-06-29T17:31:33Z","title":"Ducho: A Unified Framework for the Extraction of Multimodal Features in\n Recommendation","summary":" In multimodal-aware recommendation, the extraction of meaningful multimodal\nfeatures is at the basis of high-quality recommendations. Generally, each\nrecommendation framework implements its multimodal extraction procedures with\nspecific strategies and tools. This is limiting for two reasons: (i) different\nextraction strategies do not ease the interdependence among multimodal\nrecommendation frameworks; thus, they cannot be efficiently and fairly\ncompared; (ii) given the large plethora of pre-trained deep learning models\nmade available by different open source tools, model designers do not have\naccess to shared interfaces to extract features. Motivated by the outlined\naspects, we propose \\framework, a unified framework for the extraction of\nmultimodal features in recommendation. By integrating three widely-adopted deep\nlearning libraries as backends, namely, TensorFlow, PyTorch, and Transformers,\nwe provide a shared interface to extract and process features where each\nbackend's specific methods are abstracted to the end user. Noteworthy, the\nextraction pipeline is easily configurable with a YAML-based file where the\nuser can specify, for each modality, the list of models (and their specific\nbackends/parameters) to perform the extraction. Finally, to make \\framework\naccessible to the community, we build a public Docker image equipped with a\nready-to-use CUDA environment and propose three demos to test its\nfunctionalities for different scenarios and tasks. 
The GitHub repository and\nthe documentation are accessible at this link:\nhttps://github.com/sisinflab/Ducho.\n","authors":["Daniele Malitesta","Giuseppe Gassi","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2306.17125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11963v4","updated":"2023-09-06T04:35:36Z","published":"2023-06-21T01:22:43Z","title":"A Survey of Multimodal Information Fusion for Smart Healthcare: Mapping\n the Journey from Data to Wisdom","summary":" Multimodal medical data fusion has emerged as a transformative approach in\nsmart healthcare, enabling a comprehensive understanding of patient health and\npersonalized treatment plans. In this paper, a journey from data to information\nto knowledge to wisdom (DIKW) is explored through multimodal fusion for smart\nhealthcare. We present a comprehensive review of multimodal medical data fusion\nfocused on the integration of various data modalities. The review explores\ndifferent approaches such as feature selection, rule-based systems, machine\nlearning, deep learning, and natural language processing, for fusing and\nanalyzing multimodal data. This paper also highlights the challenges associated\nwith multimodal fusion in healthcare. By synthesizing the reviewed frameworks\nand theories, it proposes a generic framework for multimodal medical data\nfusion that aligns with the DIKW model. Moreover, it discusses future\ndirections related to the four pillars of healthcare: Predictive, Preventive,\nPersonalized, and Participatory approaches. The components of the comprehensive\nsurvey presented in this paper form the foundation for more successful\nimplementation of multimodal fusion in smart healthcare. Our findings can guide\nresearchers and practitioners in leveraging the power of multimodal fusion with\nthe state-of-the-art approaches to revolutionize healthcare and improve patient\noutcomes.\n","authors":["Thanveer Shaik","Xiaohui Tao","Lin Li","Haoran Xie","Juan D. Velásquez"],"pdf_url":"https://arxiv.org/pdf/2306.11963v4.pdf","comment":"This work has been submitted to the ELSEVIER for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible"},{"id":"http://arxiv.org/abs/2309.02064v2","updated":"2023-09-06T04:27:58Z","published":"2023-09-05T09:06:34Z","title":"MvFS: Multi-view Feature Selection for Recommender System","summary":" Feature selection, which is a technique to select key features in recommender\nsystems, has received increasing research attention. Recently, Adaptive Feature\nSelection (AdaFS) has shown remarkable performance by adaptively selecting\nfeatures for each data instance, considering that the importance of a given\nfeature field can vary significantly across data. However, this method still\nhas limitations in that its selection process could be easily biased to major\nfeatures that frequently occur. To address these problems, we propose\nMulti-view Feature Selection (MvFS), which selects informative features for\neach instance more effectively. Most importantly, MvFS employs a multi-view\nnetwork consisting of multiple sub-networks, each of which learns to measure\nthe feature importance of a part of data with different feature patterns. By\ndoing so, MvFS mitigates the bias problem towards dominant patterns and\npromotes a more balanced feature selection process. 
Moreover, MvFS adopts an\neffective importance score modeling strategy which is applied independently to\neach field without incurring dependency among features. Experimental results on\nreal-world datasets demonstrate the effectiveness of MvFS compared to\nstate-of-the-art baselines.\n","authors":["Youngjune Lee","Yeongjong Jeong","Keunchan Park","SeongKu Kang"],"pdf_url":"https://arxiv.org/pdf/2309.02064v2.pdf","comment":"CIKM 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.03199v1","updated":"2023-09-06T17:59:57Z","published":"2023-09-06T17:59:57Z","title":"Matcha-TTS: A fast TTS architecture with conditional flow matching","summary":" We introduce Matcha-TTS, a new encoder-decoder architecture for speedy TTS\nacoustic modelling, trained using optimal-transport conditional flow matching\n(OT-CFM). This yields an ODE-based decoder capable of high output quality in\nfewer synthesis steps than models trained using score matching. Careful design\nchoices additionally ensure each synthesis step is fast to run. The method is\nprobabilistic, non-autoregressive, and learns to speak from scratch without\nexternal alignments. Compared to strong pre-trained baseline models, the\nMatcha-TTS system has the smallest memory footprint, rivals the speed of the\nfastest models on long utterances, and attains the highest mean opinion score\nin a listening test. Please see https://shivammehta25.github.io/Matcha-TTS/ for\naudio examples, code, and pre-trained models.\n","authors":["Shivam Mehta","Ruibo Tu","Jonas Beskow","Éva Székely","Gustav Eje Henter"],"pdf_url":"https://arxiv.org/pdf/2309.03199v1.pdf","comment":"5 pages, 3 figures. Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.03190v1","updated":"2023-09-06T17:53:31Z","published":"2023-09-06T17:53:31Z","title":"Blink: Link Local Differential Privacy in Graph Neural Networks via\n Bayesian Estimation","summary":" Graph neural networks (GNNs) have gained an increasing amount of popularity\ndue to their superior capability in learning node embeddings for various graph\ninference tasks, but training them can raise privacy concerns. To address this,\nwe propose using link local differential privacy over decentralized nodes,\nenabling collaboration with an untrusted server to train GNNs without revealing\nthe existence of any link. Our approach spends the privacy budget separately on\nlinks and degrees of the graph for the server to better denoise the graph\ntopology using Bayesian estimation, alleviating the negative impact of LDP on\nthe accuracy of the trained GNNs. We bound the mean absolute error of the\ninferred link probabilities against the ground truth graph topology. We then\npropose two variants of our LDP mechanism complementing each other in different\nprivacy settings, one of which estimates fewer links under lower privacy\nbudgets to avoid false positive link estimates when the uncertainty is high,\nwhile the other utilizes more information and performs better given relatively\nhigher privacy budgets. Furthermore, we propose a hybrid variant that combines\nboth strategies and is able to perform better across different privacy budgets.\nExtensive experiments show that our approach outperforms existing methods in\nterms of accuracy under varying privacy budgets.\n","authors":["Xiaochen Zhu","Vincent Y. F. 
Tan","Xiaokui Xiao"],"pdf_url":"https://arxiv.org/pdf/2309.03190v1.pdf","comment":"17 pages, accepted by ACM CCS 2023 as a conference paper"},{"id":"http://arxiv.org/abs/2303.02131v2","updated":"2023-09-06T17:47:47Z","published":"2023-03-03T18:23:20Z","title":"Spacetime-Efficient Low-Depth Quantum State Preparation with\n Applications","summary":" We propose a novel deterministic method for preparing arbitrary quantum\nstates. When our protocol is compiled into CNOT and arbitrary single-qubit\ngates, it prepares an $N$-dimensional state in depth $O(\\log(N))$ and spacetime\nallocation (a metric that accounts for the fact that oftentimes some ancilla\nqubits need not be active for the entire circuit) $O(N)$, which are both\noptimal. When compiled into the $\\{\\mathrm{H,S,T,CNOT}\\}$ gate set, we show\nthat it requires asymptotically fewer quantum resources than previous methods.\nSpecifically, it prepares an arbitrary state up to error $\\epsilon$ in depth\n$O(\\log(N/\\epsilon))$ and spacetime allocation $O(N\\log(\\log(N)/\\epsilon))$,\nimproving over $O(\\log(N)\\log(N/\\epsilon))$ and $O(N\\log(N/\\epsilon))$,\nrespectively. We illustrate how the reduced spacetime allocation of our\nprotocol enables rapid preparation of many disjoint states with only\nconstant-factor ancilla overhead -- $O(N)$ ancilla qubits are reused\nefficiently to prepare a product state of $w$ $N$-dimensional states in depth\n$O(w + \\log(N))$ rather than $O(w\\log(N))$, achieving effectively constant\ndepth per state. We highlight several applications where this ability would be\nuseful, including quantum machine learning, Hamiltonian simulation, and solving\nlinear systems of equations. We provide quantum circuit descriptions of our\nprotocol, detailed pseudocode, and gate-level implementation examples using\nBraket.\n","authors":["Kaiwen Gui","Alexander M. Dalzell","Alessandro Achille","Martin Suchara","Frederic T. Chong"],"pdf_url":"https://arxiv.org/pdf/2303.02131v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03179v1","updated":"2023-09-06T17:39:05Z","published":"2023-09-06T17:39:05Z","title":"SLiMe: Segment Like Me","summary":" Significant strides have been made using large vision-language models, like\nStable Diffusion (SD), for a variety of downstream tasks, including image\nediting, image correspondence, and 3D shape generation. Inspired by these\nadvancements, we explore leveraging these extensive vision-language models for\nsegmenting images at any desired granularity using as few as one annotated\nsample by proposing SLiMe. SLiMe frames this problem as an optimization task.\nSpecifically, given a single training image and its segmentation mask, we first\nextract attention maps, including our novel \"weighted accumulated\nself-attention map\" from the SD prior. Then, using the extracted attention\nmaps, the text embeddings of Stable Diffusion are optimized such that, each of\nthem, learn about a single segmented region from the training image. These\nlearned embeddings then highlight the segmented region in the attention maps,\nwhich in turn can then be used to derive the segmentation map. This enables\nSLiMe to segment any real-world image during inference with the granularity of\nthe segmented region in the training image, using just one example. Moreover,\nleveraging additional training data when available, i.e. few-shot, improves the\nperformance of SLiMe. 
We carried out a knowledge-rich set of experiments\nexamining various design factors and showed that SLiMe outperforms other\nexisting one-shot and few-shot segmentation methods.\n","authors":["Aliasghar Khani","Saeid Asgari Taghanaki","Aditya Sanghi","Ali Mahdavi Amiri","Ghassan Hamarneh"],"pdf_url":"https://arxiv.org/pdf/2309.03179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00564v2","updated":"2023-09-06T17:35:10Z","published":"2023-09-01T16:20:04Z","title":"Interpretation of High-Dimensional Linear Regression: Effects of\n Nullspace and Regularization Demonstrated on Battery Data","summary":" High-dimensional linear regression is important in many scientific fields.\nThis article considers discrete measured data of underlying smooth latent\nprocesses, as is often obtained from chemical or biological systems.\nInterpretation in high dimensions is challenging because the nullspace and its\ninterplay with regularization shapes regression coefficients. The data's\nnullspace contains all coefficients that satisfy $\\mathbf{Xw}=\\mathbf{0}$, thus\nallowing very different coefficients to yield identical predictions. We\ndeveloped an optimization formulation to compare regression coefficients and\ncoefficients obtained by physical engineering knowledge to understand which\npart of the coefficient differences are close to the nullspace. This nullspace\nmethod is tested on a synthetic example and lithium-ion battery data. The case\nstudies show that regularization and z-scoring are design choices that, if\nchosen corresponding to prior physical knowledge, lead to interpretable\nregression results. Otherwise, the combination of the nullspace and\nregularization hinders interpretability and can make it impossible to obtain\nregression coefficients close to the true coefficients when there is a true\nunderlying linear model. Furthermore, we demonstrate that regression methods\nthat do not produce coefficients orthogonal to the nullspace, such as fused\nlasso, can improve interpretability. In conclusion, the insights gained from\nthe nullspace perspective help to make informed design choices for building\nregression models on high-dimensional data and reasoning about potential\nunderlying linear models, which are important for system optimization and\nimproving scientific understanding.\n","authors":["Joachim Schaeffer","Eric Lenz","William C. Chueh","Martin Z. Bazant","Rolf Findeisen","Richard D. Braatz"],"pdf_url":"https://arxiv.org/pdf/2309.00564v2.pdf","comment":"Manuscript: 14 pages, 7 figures; Supplementary Information: 4 pages,\n 2 figures; Code available: https://github.com/JoachimSchaeffer/HDRegAnalytics"},{"id":"http://arxiv.org/abs/2309.03177v1","updated":"2023-09-06T17:30:26Z","published":"2023-09-06T17:30:26Z","title":"3D Object Positioning Using Differentiable Multimodal Learning","summary":" This article describes a multi-modal method using simulated Lidar data via\nray tracing and image pixel loss with differentiable rendering to optimize an\nobject's position with respect to an observer or some referential objects in a\ncomputer graphics scene. Object position optimization is completed using\ngradient descent with the loss function being influenced by both modalities.\nTypical object placement optimization is done using image pixel loss with\ndifferentiable rendering only, this work shows the use of a second modality\n(Lidar) leads to faster convergence. 
This method of fusing sensor input\npresents a potential usefulness for autonomous vehicles, as these methods can\nbe used to establish the locations of multiple actors in a scene. This article\nalso presents a method for the simulation of multiple types of data to be used\nin the training of autonomous vehicles.\n","authors":["Sean Zanyk-McLean","Krishna Kumar","Paul Navratil"],"pdf_url":"https://arxiv.org/pdf/2309.03177v1.pdf","comment":"7 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.03079v1","updated":"2023-09-06T17:18:55Z","published":"2023-09-06T17:18:55Z","title":"GPT-InvestAR: Enhancing Stock Investment Strategies through Annual\n Report Analysis with Large Language Models","summary":" Annual Reports of publicly listed companies contain vital information about\ntheir financial health which can help assess the potential impact on Stock\nprice of the firm. These reports are comprehensive in nature, going up to, and\nsometimes exceeding, 100 pages. Analysing these reports is cumbersome even for\na single firm, let alone the whole universe of firms that exist. Over the\nyears, financial experts have become proficient in extracting valuable\ninformation from these documents relatively quickly. However, this requires\nyears of practice and experience. This paper aims to simplify the process of\nassessing Annual Reports of all the firms by leveraging the capabilities of\nLarge Language Models (LLMs). The insights generated by the LLM are compiled in\na Quant styled dataset and augmented by historical stock price data. A Machine\nLearning model is then trained with LLM outputs as features. The walkforward\ntest results show promising outperformance wrt S&P500 returns. This paper\nintends to provide a framework for future work in this direction. To facilitate\nthis, the code has been released as open source.\n","authors":["Udit Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.03079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03169v1","updated":"2023-09-06T17:09:43Z","published":"2023-09-06T17:09:43Z","title":"Impression-Informed Multi-Behavior Recommender System: A Hierarchical\n Graph Attention Approach","summary":" While recommender systems have significantly benefited from implicit\nfeedback, they have often missed the nuances of multi-behavior interactions\nbetween users and items. Historically, these systems either amalgamated all\nbehaviors, such as \\textit{impression} (formerly \\textit{view}),\n\\textit{add-to-cart}, and \\textit{buy}, under a singular 'interaction' label,\nor prioritized only the target behavior, often the \\textit{buy} action,\ndiscarding valuable auxiliary signals. Although recent advancements tried\naddressing this simplification, they primarily gravitated towards optimizing\nthe target behavior alone, battling with data scarcity. Additionally, they\ntended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these\ngaps, we introduce the \\textbf{H}ierarchical \\textbf{M}ulti-behavior\n\\textbf{G}raph Attention \\textbf{N}etwork (HMGN). This pioneering framework\nleverages attention mechanisms to discern information from both inter and\nintra-behaviors while employing a multi-task Hierarchical Bayesian Personalized\nRanking (HBPR) for optimization. Recognizing the need for scalability, our\napproach integrates a specialized multi-behavior sub-graph sampling technique.\nMoreover, the adaptability of HMGN allows for the seamless inclusion of\nknowledge metadata and time-series data. 
Empirical results attest to our\nmodel's prowess, registering a notable performance boost of up to 64\\% in\nNDCG@100 metrics over conventional graph neural network methods.\n","authors":["Dong Li","Divya Bhargavi","Vidya Sagar Ravipati"],"pdf_url":"https://arxiv.org/pdf/2309.03169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03167v1","updated":"2023-09-06T17:08:57Z","published":"2023-09-06T17:08:57Z","title":"Split-Boost Neural Networks","summary":" The calibration and training of a neural network is a complex and\ntime-consuming procedure that requires significant computational resources to\nachieve satisfactory results. Key obstacles are a large number of\nhyperparameters to select and the onset of overfitting in the face of a small\namount of data. In this framework, we propose an innovative training strategy\nfor feed-forward architectures - called split-boost - that improves performance\nand automatically includes a regularizing behaviour without modeling it\nexplicitly. Such a novel approach ultimately allows us to avoid explicitly\nmodeling the regularization term, decreasing the total number of\nhyperparameters and speeding up the tuning phase. The proposed strategy is\ntested on a real-world (anonymized) dataset within a benchmark medical\ninsurance design problem.\n","authors":["Raffaele Giuseppe Cestari","Gabriele Maroni","Loris Cannelli","Dario Piga","Simone Formentin"],"pdf_url":"https://arxiv.org/pdf/2309.03167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03157v1","updated":"2023-09-06T16:55:11Z","published":"2023-09-06T16:55:11Z","title":"Learning to Recharge: UAV Coverage Path Planning through Deep\n Reinforcement Learning","summary":" Coverage path planning (CPP) is a critical problem in robotics, where the\ngoal is to find an efficient path that covers every point in an area of\ninterest. This work addresses the power-constrained CPP problem with recharge\nfor battery-limited unmanned aerial vehicles (UAVs). In this problem, a notable\nchallenge emerges from integrating recharge journeys into the overall coverage\nstrategy, highlighting the intricate task of making strategic, long-term\ndecisions. We propose a novel proximal policy optimization (PPO)-based deep\nreinforcement learning (DRL) approach with map-based observations, utilizing\naction masking and discount factor scheduling to optimize coverage trajectories\nover the entire mission horizon. We further provide the agent with a position\nhistory to handle emergent state loops caused by the recharge capability. Our\napproach outperforms a baseline heuristic, generalizes to different target\nzones and maps, with limited generalization to unseen maps. We offer valuable\ninsights into DRL algorithm design for long-horizon problems and provide a\npublicly available software framework for the CPP problem.\n","authors":["Mirco Theile","Harald Bayerlein","Marco Caccamo","Alberto L. Sangiovanni-Vincentelli"],"pdf_url":"https://arxiv.org/pdf/2309.03157v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2307.02694v2","updated":"2023-09-06T16:53:24Z","published":"2023-07-05T23:53:55Z","title":"Loss Functions and Metrics in Deep Learning","summary":" One of the essential components of deep learning is the choice of the loss\nfunction and performance metrics used to train and evaluate models. This paper\nreviews the most prevalent loss functions and performance measurements in deep\nlearning. 
We examine the benefits and limits of each technique and illustrate\ntheir application to various deep-learning problems. Our review aims to give a\ncomprehensive picture of the different loss functions and performance\nindicators used in the most common deep learning tasks and help practitioners\nchoose the best method for their specific task.\n","authors":["Juan Terven","Diana M. Cordova-Esparza","Alfonso Ramirez-Pedraza","Edgar A. Chavez-Urbiola"],"pdf_url":"https://arxiv.org/pdf/2307.02694v2.pdf","comment":"53 pages, 5 figures, 7 tables, 86 equations"},{"id":"http://arxiv.org/abs/2211.13398v2","updated":"2023-09-06T16:47:31Z","published":"2022-11-24T03:27:00Z","title":"CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote\n Aggregation","summary":" Object pose estimation constitutes a critical area within the domain of 3D\nvision. While contemporary state-of-the-art methods that leverage real-world\npose annotations have demonstrated commendable performance, the procurement of\nsuch real-world training data incurs substantial costs. This paper focuses on a\nspecific setting wherein only 3D CAD models are utilized as a priori knowledge,\ndevoid of any background or clutter information. We introduce a novel method,\nCPPF++, designed for sim-to-real pose estimation. This method builds upon the\nfoundational point-pair voting scheme of CPPF, reconceptualizing it through a\nprobabilistic lens. To address the challenge of voting collision, we model\nvoting uncertainty by estimating the probabilistic distribution of each point\npair within the canonical space. This approach is further augmented by\niterative noise filtering, employed to eradicate votes associated with\nbackgrounds or clutters. Additionally, we enhance the context provided by each\nvoting unit by introducing $N$-point tuples. In conjunction with this\nmethodological contribution, we present a new category-level pose estimation\ndataset, DiversePose 300. This dataset is specifically crafted to facilitate a\nmore rigorous evaluation of current state-of-the-art methods, encompassing a\nbroader and more challenging array of real-world scenarios. Empirical results\nsubstantiate the efficacy of our proposed method, revealing a significant\nreduction in the disparity between simulation and real-world performance.\n","authors":["Yang You","Wenhao He","Jin Liu","Hongkai Xiong","Weiming Wang","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2211.13398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03148v1","updated":"2023-09-06T16:44:08Z","published":"2023-09-06T16:44:08Z","title":"Data-Driven Neural Polar Codes for Unknown Channels With and Without\n Memory","summary":" In this work, a novel data-driven methodology for designing polar codes for\nchannels with and without memory is proposed. The methodology is suitable for\nthe case where the channel is given as a \"black-box\" and the designer has\naccess to the channel for generating observations of its inputs and outputs,\nbut does not have access to the explicit channel model. The proposed method\nleverages the structure of the successive cancellation (SC) decoder to devise a\nneural SC (NSC) decoder. The NSC decoder uses neural networks (NNs) to replace\nthe core elements of the original SC decoder, the check-node, the bit-node and\nthe soft decision. Along with the NSC, we devise additional NN that embeds the\nchannel outputs into the input space of the SC decoder. 
The proposed method is\nsupported by theoretical guarantees that include the consistency of the NSC.\nAlso, the NSC has computational complexity that does not grow with the channel\nmemory size. This sets its main advantage over successive cancellation trellis\n(SCT) decoder for finite state channels (FSCs) that has complexity of\n$O(|\\mathcal{S}|^3 N\\log N)$, where $|\\mathcal{S}|$ denotes the number of\nchannel states. We demonstrate the performance of the proposed algorithms on\nmemoryless channels and on channels with memory. The empirical results are\ncompared with the optimal polar decoder, given by the SC and SCT decoders. We\nfurther show that our algorithms are applicable for the case where the SC and\nSCT decoders are not applicable.\n","authors":["Ziv Aharoni","Bashar Huleihel","Henry D. Pfister","Haim H. Permuter"],"pdf_url":"https://arxiv.org/pdf/2309.03148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03145v1","updated":"2023-09-06T16:41:41Z","published":"2023-09-06T16:41:41Z","title":"The Best Arm Evades: Near-optimal Multi-pass Streaming Lower Bounds for\n Pure Exploration in Multi-armed Bandits","summary":" We give a near-optimal sample-pass trade-off for pure exploration in\nmulti-armed bandits (MABs) via multi-pass streaming algorithms: any streaming\nalgorithm with sublinear memory that uses the optimal sample complexity of\n$O(\\frac{n}{\\Delta^2})$ requires\n$\\Omega(\\frac{\\log{(1/\\Delta)}}{\\log\\log{(1/\\Delta)}})$ passes. Here, $n$ is\nthe number of arms and $\\Delta$ is the reward gap between the best and the\nsecond-best arms. Our result matches the $O(\\log(\\frac{1}{\\Delta}))$-pass\nalgorithm of Jin et al. [ICML'21] (up to lower order terms) that only uses\n$O(1)$ memory and answers an open question posed by Assadi and Wang [STOC'20].\n","authors":["Sepehr Assadi","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03139v1","updated":"2023-09-06T16:24:26Z","published":"2023-09-06T16:24:26Z","title":"Using Multiple Vector Channels Improves E(n)-Equivariant Graph Neural\n Networks","summary":" We present a natural extension to E(n)-equivariant graph neural networks that\nuses multiple equivariant vectors per node. We formulate the extension and show\nthat it improves performance across different physical systems benchmark tasks,\nwith minimal differences in runtime or number of parameters. The proposed\nmultichannel EGNN outperforms the standard singlechannel EGNN on N-body charged\nparticle dynamics, molecular property predictions, and predicting the\ntrajectories of solar system bodies. Given the additional benefits and minimal\nadditional cost of multi-channel EGNN, we suggest that this extension may be of\npractical use to researchers working in machine learning for the physical\nsciences.\n","authors":["Daniel Levy","Sékou-Oumar Kaba","Carmelo Gonzales","Santiago Miret","Siamak Ravanbakhsh"],"pdf_url":"https://arxiv.org/pdf/2309.03139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03113v1","updated":"2023-09-06T15:52:55Z","published":"2023-09-06T15:52:55Z","title":"Detecting Manufacturing Defects in PCBs via Data-Centric Machine\n Learning on Solder Paste Inspection Features","summary":" Automated detection of defects in Printed Circuit Board (PCB) manufacturing\nusing Solder Paste Inspection (SPI) and Automated Optical Inspection (AOI)\nmachines can help improve operational efficiency and significantly reduce the\nneed for manual intervention. 
In this paper, using SPI-extracted features of 6\nmillion pins, we demonstrate a data-centric approach to train Machine Learning\n(ML) models to detect PCB defects at three stages of PCB manufacturing. The 6\nmillion PCB pins correspond to 2 million components that belong to 15,387 PCBs.\nUsing a base extreme gradient boosting (XGBoost) ML model, we iterate on the\ndata pre-processing step to improve detection performance. Combining pin-level\nSPI features using component and PCB IDs, we developed training instances also\nat the component and PCB level. This allows the ML model to capture any\ninter-pin, inter-component, or spatial effects that may not be apparent at the\npin level. Models are trained at the pin, component, and PCB levels, and the\ndetection results from the different models are combined to identify defective\ncomponents.\n","authors":["Jubilee Prasad-Rao","Roohollah Heidary","Jesse Williams"],"pdf_url":"https://arxiv.org/pdf/2309.03113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03859v2","updated":"2023-09-06T15:42:40Z","published":"2023-05-05T22:04:00Z","title":"Open problems in causal structure learning: A case study of COVID-19 in\n the UK","summary":" Causal machine learning (ML) algorithms recover graphical structures that\ntell us something about cause-and-effect relationships. The causal\nrepresentation provided by these algorithms enables transparency and\nexplainability, which is necessary for decision making in critical real-world\nproblems. Yet, causal ML has had limited impact in practice compared to\nassociational ML. This paper investigates the challenges of causal ML with\napplication to COVID-19 UK pandemic data. We collate data from various public\nsources and investigate what the various structure learning algorithms learn\nfrom these data. We explore the impact of different data formats on algorithms\nspanning different classes of learning, and assess the results produced by each\nalgorithm, and groups of algorithms, in terms of graphical structure, model\ndimensionality, sensitivity analysis, confounding variables, predictive and\ninterventional inference. We use these results to highlight open problems in\ncausal structure learning and directions for future research. To facilitate\nfuture work, we make all graphs, models, data sets, and source code publicly\navailable online.\n","authors":["Anthony Constantinou","Neville K. Kitson","Yang Liu","Kiattikun Chobtham","Arian Hashemzadeh","Praharsh A. Nanavati","Rendani Mbuvha","Bruno Petrungaro"],"pdf_url":"https://arxiv.org/pdf/2305.03859v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03103v1","updated":"2023-09-06T15:41:38Z","published":"2023-09-06T15:41:38Z","title":"ContrastWSD: Enhancing Metaphor Detection with Word Sense Disambiguation\n Following the Metaphor Identification Procedure","summary":" This paper presents ContrastWSD, a RoBERTa-based metaphor detection model\nthat integrates the Metaphor Identification Procedure (MIP) and Word Sense\nDisambiguation (WSD) to extract and contrast the contextual meaning with the\nbasic meaning of a word to determine whether it is used metaphorically in a\nsentence. By utilizing the word senses derived from a WSD model, our model\nenhances the metaphor detection process and outperforms other methods that rely\nsolely on contextual embeddings or integrate only the basic definitions and\nother external knowledge. 
We evaluate our approach on various benchmark\ndatasets and compare it with strong baselines, indicating the effectiveness in\nadvancing metaphor detection.\n","authors":["Mohamad Elzohbi","Richard Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.03103v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2211.06919v2","updated":"2023-09-06T15:32:58Z","published":"2022-11-13T14:54:42Z","title":"Towards Privacy-Aware Causal Structure Learning in Federated Setting","summary":" Causal structure learning has been extensively studied and widely used in\nmachine learning and various applications. To achieve an ideal performance,\nexisting causal structure learning algorithms often need to centralize a large\namount of data from multiple data sources. However, in the privacy-preserving\nsetting, it is impossible to centralize data from all sources and put them\ntogether as a single dataset. To preserve data privacy, federated learning as a\nnew learning paradigm has attracted much attention in machine learning in\nrecent years. In this paper, we study a privacy-aware causal structure learning\nproblem in the federated setting and propose a novel Federated PC (FedPC)\nalgorithm with two new strategies for preserving data privacy without\ncentralizing data. Specifically, we first propose a novel layer-wise\naggregation strategy for a seamless adaptation of the PC algorithm into the\nfederated learning paradigm for federated skeleton learning, then we design an\neffective strategy for learning consistent separation sets for federated edge\norientation. The extensive experiments validate that FedPC is effective for\ncausal structure learning in a federated learning setting.\n","authors":["Jianli Huang","Xianjie Guo","Kui Yu","Fuyuan Cao","Jiye Liang"],"pdf_url":"https://arxiv.org/pdf/2211.06919v2.pdf","comment":"This paper has been accepted by the journal IEEE Transactions on Big\n Data, and it contains 21 pages, 9 figures and 15 tables"},{"id":"http://arxiv.org/abs/2309.03081v1","updated":"2023-09-06T15:28:43Z","published":"2023-09-06T15:28:43Z","title":"ORL-AUDITOR: Dataset Auditing in Offline Deep Reinforcement Learning","summary":" Data is a critical asset in AI, as high-quality datasets can significantly\nimprove the performance of machine learning models. In safety-critical domains\nsuch as autonomous vehicles, offline deep reinforcement learning (offline DRL)\nis frequently used to train models on pre-collected datasets, as opposed to\ntraining these models by interacting with the real-world environment as the\nonline DRL. To support the development of these models, many institutions make\ndatasets publicly available with opensource licenses, but these datasets are at\nrisk of potential misuse or infringement. Injecting watermarks to the dataset\nmay protect the intellectual property of the data, but it cannot handle\ndatasets that have already been published and is infeasible to be altered\nafterward. Other existing solutions, such as dataset inference and membership\ninference, do not work well in the offline DRL scenario due to the diverse\nmodel behavior characteristics and offline setting constraints. In this paper,\nwe advocate a new paradigm by leveraging the fact that cumulative rewards can\nact as a unique identifier that distinguishes DRL models trained on a specific\ndataset. To this end, we propose ORL-AUDITOR, which is the first\ntrajectory-level dataset auditing mechanism for offline RL scenarios. 
Our\nexperiments on multiple offline DRL models and tasks reveal the efficacy of\nORL-AUDITOR, with auditing accuracy over 95% and false positive rates less than\n2.88%. We also provide valuable insights into the practical implementation of\nORL-AUDITOR by studying various parameter settings. Furthermore, we demonstrate\nthe auditing capability of ORL-AUDITOR on open-source datasets from Google and\nDeepMind, highlighting its effectiveness in auditing published datasets.\nORL-AUDITOR is open-sourced at https://github.com/link-zju/ORL-Auditor.\n","authors":["Linkang Du","Min Chen","Mingyang Sun","Shouling Ji","Peng Cheng","Jiming Chen","Zhikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03081v1.pdf","comment":"To appear in the Network and Distributed System Security Symposium\n (NDSS) 2024, San Diego, CA, USA"},{"id":"http://arxiv.org/abs/2309.03075v1","updated":"2023-09-06T15:22:33Z","published":"2023-09-06T15:22:33Z","title":"Parameterizing pressure-temperature profiles of exoplanet atmospheres\n with neural networks","summary":" Atmospheric retrievals (AR) of exoplanets typically rely on a combination of\na Bayesian inference technique and a forward simulator to estimate atmospheric\nproperties from an observed spectrum. A key component in simulating spectra is\nthe pressure-temperature (PT) profile, which describes the thermal structure of\nthe atmosphere. Current AR pipelines commonly use ad hoc fitting functions here\nthat limit the retrieved PT profiles to simple approximations, but still use a\nrelatively large number of parameters. In this work, we introduce a\nconceptually new, data-driven parameterization scheme for physically consistent\nPT profiles that does not require explicit assumptions about the functional\nform of the PT profiles and uses fewer parameters than existing methods. Our\napproach consists of a latent variable model (based on a neural network) that\nlearns a distribution over functions (PT profiles). Each profile is represented\nby a low-dimensional vector that can be used to condition a decoder network\nthat maps $P$ to $T$. When training and evaluating our method on two publicly\navailable datasets of self-consistent PT profiles, we find that our method\nachieves, on average, better fit quality than existing baseline methods,\ndespite using fewer parameters. In an AR based on existing literature, our\nmodel (using two parameters) produces a tighter, more accurate posterior for\nthe PT profile than the five-parameter polynomial baseline, while also speeding\nup the retrieval by more than a factor of three. By providing parametric access\nto physically consistent PT profiles, and by reducing the number of parameters\nrequired to describe a PT profile (thereby reducing computational cost or\nfreeing resources for additional parameters of interest), our method can help\nimprove AR and thus our understanding of exoplanet atmospheres and their\nhabitability.\n","authors":["Timothy D. Gebhard","Daniel Angerhausen","Björn S. Konrad","Eleonora Alei","Sascha P. 
Quanz","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2309.03075v1.pdf","comment":"Accepted for publication in Astronomy & Astrophysics"},{"id":"http://arxiv.org/abs/2309.03072v1","updated":"2023-09-06T15:19:04Z","published":"2023-09-06T15:19:04Z","title":"Character Queries: A Transformer-based Approach to On-Line Handwritten\n Character Segmentation","summary":" On-line handwritten character segmentation is often associated with\nhandwriting recognition and even though recognition models include mechanisms\nto locate relevant positions during the recognition process, it is typically\ninsufficient to produce a precise segmentation. Decoupling the segmentation\nfrom the recognition unlocks the potential to further utilize the result of the\nrecognition. We specifically focus on the scenario where the transcription is\nknown beforehand, in which case the character segmentation becomes an\nassignment problem between sampling points of the stylus trajectory and\ncharacters in the text. Inspired by the $k$-means clustering algorithm, we view\nit from the perspective of cluster assignment and present a Transformer-based\narchitecture where each cluster is formed based on a learned character query in\nthe Transformer decoder block. In order to assess the quality of our approach,\nwe create character segmentation ground truths for two popular on-line\nhandwriting datasets, IAM-OnDB and HANDS-VNOnDB, and evaluate multiple methods\non them, demonstrating that our approach achieves the overall best results.\n","authors":["Michael Jungo","Beat Wolf","Andrii Maksai","Claudiu Musat","Andreas Fischer"],"pdf_url":"https://arxiv.org/pdf/2309.03072v1.pdf","comment":"ICDAR 2023 Best Student Paper Award. Code available at\n https://github.com/jungomi/character-queries"},{"id":"http://arxiv.org/abs/2308.14781v3","updated":"2023-09-06T15:15:03Z","published":"2023-08-28T15:50:34Z","title":"Conflict-Aware Active Automata Learning (Extended Version)","summary":" Active automata learning algorithms cannot easily handle conflict in the\nobservation data (different outputs observed for the same inputs). This\ninherent inability to recover after a conflict impairs their effective\napplicability in scenarios where noise is present or the system under learning\nis mutating.\n We propose the Conflict-Aware Active Automata Learning (C3AL) framework to\nenable handling conflicting information during the learning process. The core\nidea is to consider the so-called observation tree as a first-class citizen in\nthe learning process. Though this idea is explored in recent work, we take it\nto its full effect by enabling its use with any existing learner and minimizing\nthe number of tests performed on the system under learning, specially in the\nface of conflicts. We evaluate C3AL in a large set of benchmarks, covering over\n30 different realistic targets, and over 18,000 different scenarios. 
The\nresults of the evaluation show that C3AL is a suitable alternative framework\nfor closed-box learning that can better handle noise and mutations.\n","authors":["Tiago Ferreira","Léo Henry","Raquel Fernandes da Silva","Alexandra Silva"],"pdf_url":"https://arxiv.org/pdf/2308.14781v3.pdf","comment":"37 pages, 11 figures, GandALF 2023"},{"id":"http://arxiv.org/abs/2309.01507v2","updated":"2023-09-06T15:06:46Z","published":"2023-09-04T10:27:17Z","title":"Memory Efficient Optimizers with 4-bit States","summary":" Optimizer states are a major source of memory consumption for training neural\nnetworks, limiting the maximum trainable model within given memory budget.\nCompressing the optimizer states from 32-bit floating points to lower bitwidth\nis promising to reduce the training memory footprint, while the current lowest\nachievable bitwidth is 8-bit. In this work, we push optimizer states bitwidth\ndown to 4-bit through a detailed empirical analysis of first and second\nmoments. Specifically, we find that moments have complicated outlier patterns,\nthat current block-wise quantization cannot accurately approximate. We use a\nsmaller block size and propose to utilize both row-wise and column-wise\ninformation for better quantization. We further identify a zero point problem\nof quantizing the second moment, and solve this problem with a linear quantizer\nthat excludes the zero point. Our 4-bit optimizer is evaluated on a wide\nvariety of benchmarks including natural language understanding, machine\ntranslation, image classification, and instruction tuning. On all the tasks our\noptimizers can achieve comparable accuracy with their full-precision\ncounterparts, while enjoying better memory efficiency.\n","authors":["Bingrui Li","Jianfei Chen","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.01507v2.pdf","comment":"35 pages"},{"id":"http://arxiv.org/abs/2212.05037v2","updated":"2023-09-06T15:03:54Z","published":"2022-12-01T15:43:20Z","title":"A Topological Deep Learning Framework for Neural Spike Decoding","summary":" The brain's spatial orientation system uses different neuron ensembles to aid\nin environment-based navigation. Two of the ways brains encode spatial\ninformation is through head direction cells and grid cells. Brains use head\ndirection cells to determine orientation whereas grid cells consist of layers\nof decked neurons that overlay to provide environment-based navigation. These\nneurons fire in ensembles where several neurons fire at once to activate a\nsingle head direction or grid. We want to capture this firing structure and use\nit to decode head direction grid cell data. Understanding, representing, and\ndecoding these neural structures requires models that encompass higher order\nconnectivity, more than the 1-dimensional connectivity that traditional\ngraph-based models provide. To that end, in this work, we develop a topological\ndeep learning framework for neural spike train decoding. Our framework combines\nunsupervised simplicial complex discovery with the power of deep learning via a\nnew architecture we develop herein called a simplicial convolutional recurrent\nneural network. Simplicial complexes, topological spaces that use not only\nvertices and edges but also higher-dimensional objects, naturally generalize\ngraphs and capture more than just pairwise relationships. Additionally, this\napproach does not require prior knowledge of the neural activity beyond spike\ncounts, which removes the need for similarity measurements. 
The effectiveness\nand versatility of the simplicial convolutional neural network is demonstrated\non head direction and trajectory prediction via head direction and grid cell\ndatasets.\n","authors":["Edward C. Mitchell","Brittany Story","David Boothe","Piotr J. Franaszczuk","Vasileios Maroulas"],"pdf_url":"https://arxiv.org/pdf/2212.05037v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03061v1","updated":"2023-09-06T15:00:36Z","published":"2023-09-06T15:00:36Z","title":"Learning Active Subspaces for Effective and Scalable Uncertainty\n Quantification in Deep Neural Networks","summary":" Bayesian inference for neural networks, or Bayesian deep learning, has the\npotential to provide well-calibrated predictions with quantified uncertainty\nand robustness. However, the main hurdle for Bayesian deep learning is its\ncomputational complexity due to the high dimensionality of the parameter space.\nIn this work, we propose a novel scheme that addresses this limitation by\nconstructing a low-dimensional subspace of the neural network\nparameters-referred to as an active subspace-by identifying the parameter\ndirections that have the most significant influence on the output of the neural\nnetwork. We demonstrate that the significantly reduced active subspace enables\neffective and scalable Bayesian inference via either Monte Carlo (MC) sampling\nmethods, otherwise computationally intractable, or variational inference.\nEmpirically, our approach provides reliable predictions with robust uncertainty\nestimates for various regression tasks.\n","authors":["Sanket Jantre","Nathan M. Urban","Xiaoning Qian","Byung-Jun Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.03061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03060v1","updated":"2023-09-06T14:59:38Z","published":"2023-09-06T14:59:38Z","title":"CoLA: Exploiting Compositional Structure for Automatic and Efficient\n Numerical Linear Algebra","summary":" Many areas of machine learning and science involve large linear algebra\nproblems, such as eigendecompositions, solving linear systems, computing matrix\nexponentials, and trace estimation. The matrices involved often have Kronecker,\nconvolutional, block diagonal, sum, or product structure. In this paper, we\npropose a simple but general framework for large-scale linear algebra problems\nin machine learning, named CoLA (Compositional Linear Algebra). By combining a\nlinear operator abstraction with compositional dispatch rules, CoLA\nautomatically constructs memory and runtime efficient numerical algorithms.\nMoreover, CoLA provides memory efficient automatic differentiation, low\nprecision computation, and GPU acceleration in both JAX and PyTorch, while also\naccommodating new objects, operations, and rules in downstream packages via\nmultiple dispatch. CoLA can accelerate many algebraic operations, while making\nit easy to prototype matrix structures and algorithms, providing an appealing\ndrop-in tool for virtually any computational effort that requires linear\nalgebra. 
We showcase its efficacy across a broad range of applications,\nincluding partial differential equations, Gaussian processes, equivariant model\nconstruction, and unsupervised learning.\n","authors":["Andres Potapczynski","Marc Finzi","Geoff Pleiss","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2309.03060v1.pdf","comment":"Code available at https://github.com/wilson-labs/cola"},{"id":"http://arxiv.org/abs/2302.11962v2","updated":"2023-09-06T14:38:35Z","published":"2023-02-23T12:18:28Z","title":"Unified Convergence Theory of Stochastic and Variance-Reduced Cubic\n Newton Methods","summary":" We study stochastic Cubic Newton methods for solving general possibly\nnon-convex minimization problems. We propose a new framework, which we call the\nhelper framework, that provides a unified view of the stochastic and\nvariance-reduced second-order algorithms equipped with global complexity\nguarantees. It can also be applied to learning with auxiliary information. Our\nhelper framework offers the algorithm designer high flexibility for\nconstructing and analyzing the stochastic Cubic Newton methods, allowing\narbitrary size batches, and the use of noisy and possibly biased estimates of\nthe gradients and Hessians, incorporating both the variance reduction and the\nlazy Hessian updates. We recover the best-known complexities for the stochastic\nand variance-reduced Cubic Newton, under weak assumptions on the noise. A\ndirect consequence of our theory is the new lazy stochastic second-order\nmethod, which significantly improves the arithmetic complexity for large\ndimension problems. We also establish complexity bounds for the classes of\ngradient-dominated objectives, that include convex and strongly convex\nproblems. For Auxiliary Learning, we show that using a helper (auxiliary\nfunction) can outperform training alone if a given similarity measure is small.\n","authors":["El Mahdi Chayti","Nikita Doikov","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2302.11962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03040v1","updated":"2023-09-06T14:34:03Z","published":"2023-09-06T14:34:03Z","title":"Automated CVE Analysis for Threat Prioritization and Impact Prediction","summary":" The Common Vulnerabilities and Exposures (CVE) are pivotal information for\nproactive cybersecurity measures, including service patching, security\nhardening, and more. However, CVEs typically offer low-level, product-oriented\ndescriptions of publicly disclosed cybersecurity vulnerabilities, often lacking\nthe essential attack semantic information required for comprehensive weakness\ncharacterization and threat impact estimation. This critical insight is\nessential for CVE prioritization and the identification of potential\ncountermeasures, particularly when dealing with a large number of CVEs. Current\nindustry practices involve manual evaluation of CVEs to assess their attack\nseverities using the Common Vulnerability Scoring System (CVSS) and mapping\nthem to Common Weakness Enumeration (CWE) for potential mitigation\nidentification. Unfortunately, this manual analysis presents a major bottleneck\nin the vulnerability analysis process, leading to slowdowns in proactive\ncybersecurity efforts and the potential for inaccuracies due to human errors.\nIn this research, we introduce our novel predictive model and tool (called\nCVEDrill) which revolutionizes CVE analysis and threat prioritization. 
CVEDrill\naccurately estimates the CVSS vector for precise threat mitigation and priority\nranking and seamlessly automates the classification of CVEs into the\nappropriate CWE hierarchy classes. By harnessing CVEDrill, organizations can\nnow implement cybersecurity countermeasure mitigation with unparalleled\naccuracy and timeliness, surpassing in this domain the capabilities of\nstate-of-the-art tools like ChatGPT.\n","authors":["Ehsan Aghaei","Ehab Al-Shaer","Waseem Shadid","Xi Niu"],"pdf_url":"https://arxiv.org/pdf/2309.03040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10807v2","updated":"2023-09-06T14:27:17Z","published":"2023-08-21T15:56:05Z","title":"DynED: Dynamic Ensemble Diversification in Data Stream Classification","summary":" Ensemble methods are commonly used in classification due to their remarkable\nperformance. Achieving high accuracy in a data stream environment is a\nchallenging task considering disruptive changes in the data distribution, also\nknown as concept drift. A greater diversity of ensemble components is known to\nenhance prediction accuracy in such settings. Despite the diversity of\ncomponents within an ensemble, not all contribute as expected to its overall\nperformance. This necessitates a method for selecting components that exhibit\nhigh performance and diversity. We present a novel ensemble construction and\nmaintenance approach based on MMR (Maximal Marginal Relevance) that dynamically\ncombines the diversity and prediction accuracy of components during the process\nof structuring an ensemble. The experimental results on four real and 11\nsynthetic datasets demonstrate that the proposed approach (DynED) provides a\nhigher average mean accuracy compared to the five state-of-the-art baselines.\n","authors":["Soheil Abadifard","Sepehr Bakhshi","Sanaz Gheibuni","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.10807v2.pdf","comment":"Proceedings of the 32nd ACM International Conference on Information\n and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United\n Kingdom"},{"id":"http://arxiv.org/abs/2309.03033v1","updated":"2023-09-06T14:22:24Z","published":"2023-09-06T14:22:24Z","title":"Deep Learning for Polycystic Kidney Disease: Utilizing Neural Networks\n for Accurate and Early Detection through Gene Expression Analysis","summary":" With Polycystic Kidney Disease (PKD) potentially leading to fatal\ncomplications in patients due to the formation of cysts in the kidneys, early\ndetection of PKD is crucial for effective management of the condition. However,\nthe various patient-specific factors that play a role in the diagnosis make it\nan intricate puzzle for clinicians to solve. Therefore, in this study, we aim\nto utilize a deep learning-based approach for early disease detection. The\ndevised neural network can achieve accurate and robust predictions for possible\nPKD in patients by analyzing patient gene expressions.\n","authors":["Kapil Panda","Anirudh Mazumder"],"pdf_url":"https://arxiv.org/pdf/2309.03033v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.10293v2","updated":"2023-09-06T14:21:45Z","published":"2023-05-17T15:27:35Z","title":"Infinite Class Mixup","summary":" Mixup is a widely adopted strategy for training deep networks, where\nadditional samples are augmented by interpolating inputs and labels of training\npairs. Mixup has been shown to improve classification performance, network\ncalibration, and out-of-distribution generalisation. 
While effective, a\ncornerstone of Mixup, namely that networks learn linear behaviour patterns\nbetween classes, is only indirectly enforced since the output interpolation is\nperformed at the probability level. This paper seeks to address this limitation\nby mixing the classifiers directly instead of mixing the labels for each mixed\npair. We propose to define the target of each augmented sample as a uniquely\nnew classifier, whose parameters are a linear interpolation of the classifier\nvectors of the input pair. The space of all possible classifiers is continuous\nand spans all interpolations between classifier pairs. To make optimisation\ntractable, we propose a dual-contrastive Infinite Class Mixup loss, where we\ncontrast the classifier of a mixed pair to both the classifiers and the\npredicted outputs of other mixed pairs in a batch. Infinite Class Mixup is\ngeneric in nature and applies to many variants of Mixup. Empirically, we show\nthat it outperforms standard Mixup and variants such as RegMixup and Remix on\nbalanced, long-tailed, and data-constrained benchmarks, highlighting its broad\napplicability.\n","authors":["Thomas Mensink","Pascal Mettes"],"pdf_url":"https://arxiv.org/pdf/2305.10293v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2209.12651v5","updated":"2023-09-06T14:21:12Z","published":"2022-09-26T12:53:23Z","title":"Learning Variational Models with Unrolling and Bilevel Optimization","summary":" In this paper we consider the problem of learning variational models in the\ncontext of supervised learning via risk minimization. Our goal is to provide a\ndeeper understanding of the two approaches of learning of variational models\nvia bilevel optimization and via algorithm unrolling. The former considers the\nvariational model as a lower level optimization problem below the risk\nminimization problem, while the latter replaces the lower level optimization\nproblem by an algorithm that solves said problem approximately. Both approaches\nare used in practice, but unrolling is much simpler from a computational point\nof view. To analyze and compare the two approaches, we consider a simple toy\nmodel, and compute all risks and the respective estimators explicitly. We show\nthat unrolling can be better than the bilevel optimization approach, but also\nthat the performance of unrolling can depend significantly on further\nparameters, sometimes in unexpected ways: While the stepsize of the unrolled\nalgorithm matters a lot (and learning the stepsize gives a significant\nimprovement), the number of unrolled iterations plays a minor role.\n","authors":["Christoph Brauer","Niklas Breustedt","Timo de Wolff","Dirk A. Lorenz"],"pdf_url":"https://arxiv.org/pdf/2209.12651v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03023v1","updated":"2023-09-06T14:08:46Z","published":"2023-09-06T14:08:46Z","title":"Universal Preprocessing Operators for Embedding Knowledge Graphs with\n Literals","summary":" Knowledge graph embeddings are dense numerical representations of entities in\na knowledge graph (KG). While the majority of approaches concentrate only on\nrelational information, i.e., relations between entities, fewer approaches\nexist which also take information about literal values (e.g., textual\ndescriptions or numerical information) into account. Those which exist are\ntypically tailored towards a particular modality of literal and a particular\nembedding method. 
In this paper, we propose a set of universal preprocessing\noperators which can be used to transform KGs with literals for numerical,\ntemporal, textual, and image information, so that the transformed KGs can be\nembedded with any method. The results on the kgbench dataset with three\ndifferent embedding methods show promising results.\n","authors":["Patryk Preisner","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2309.03023v1.pdf","comment":"Accepted for DL4KG Workshop at ISWC 2023"},{"id":"http://arxiv.org/abs/2309.03018v1","updated":"2023-09-06T14:02:33Z","published":"2023-09-06T14:02:33Z","title":"Amortised Inference in Bayesian Neural Networks","summary":" Meta-learning is a framework in which machine learning models train over a\nset of datasets in order to produce predictions on new datasets at test time.\nProbabilistic meta-learning has received an abundance of attention from the\nresearch community in recent years, but a problem shared by many existing\nprobabilistic meta-models is that they require a very large number of datasets\nin order to produce high-quality predictions with well-calibrated uncertainty\nestimates. In many applications, however, such quantities of data are simply\nnot available.\n In this dissertation we present a significantly more data-efficient approach\nto probabilistic meta-learning through per-datapoint amortisation of inference\nin Bayesian neural networks, introducing the Amortised Pseudo-Observation\nVariational Inference Bayesian Neural Network (APOVI-BNN). First, we show that\nthe approximate posteriors obtained under our amortised scheme are of similar\nor better quality to those obtained through traditional variational inference,\ndespite the fact that the amortised inference is performed in a single forward\npass. We then discuss how the APOVI-BNN may be viewed as a new member of the\nneural process family, motivating the use of neural process training objectives\nfor potentially better predictive performance on complex problems as a result.\nFinally, we assess the predictive performance of the APOVI-BNN against other\nprobabilistic meta-models in both a one-dimensional regression problem and in a\nsignificantly more complex image completion setting. In both cases, when the\namount of training data is limited, our model is the best in its class.\n","authors":["Tommy Rochussen"],"pdf_url":"https://arxiv.org/pdf/2309.03018v1.pdf","comment":"This thesis served as the author's final project report for the\n University of Cambridge part IIB Engineering Tripos. 37 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.03014v1","updated":"2023-09-06T13:59:04Z","published":"2023-09-06T13:59:04Z","title":"SymED: Adaptive and Online Symbolic Representation of Data on the Edge","summary":" The edge computing paradigm helps handle the Internet of Things (IoT)\ngenerated data in proximity to its source. Challenges occur in transferring,\nstoring, and processing this rapidly growing amount of data on\nresource-constrained edge devices. Symbolic Representation (SR) algorithms are\npromising solutions to reduce the data size by converting actual raw data into\nsymbols. Also, they allow data analytics (e.g., anomaly detection and trend\nprediction) directly on symbols, benefiting large classes of edge applications.\nHowever, existing SR algorithms are centralized in design and work offline with\nbatch data, which is infeasible for real-time cases. 
We propose SymED -\nSymbolic Edge Data representation method, i.e., an online, adaptive, and\ndistributed approach for symbolic representation of data on edge. SymED is\nbased on the Adaptive Brownian Bridge-based Aggregation (ABBA), where we assume\nlow-powered IoT devices do initial data compression (senders) and the more\nrobust edge devices do the symbolic conversion (receivers). We evaluate SymED\nby measuring compression performance, reconstruction accuracy through Dynamic\nTime Warping (DTW) distance, and computational latency. The results show that\nSymED is able to (i) reduce the raw data with an average compression rate of\n9.5%; (ii) keep a low reconstruction error of 13.25 in the DTW space; (iii)\nsimultaneously provide real-time adaptability for online streaming IoT data at\ntypical latencies of 42ms per symbol, reducing the overall network traffic.\n","authors":["Daniel Hofstätter","Shashikant Ilager","Ivan Lujic","Ivona Brandic"],"pdf_url":"https://arxiv.org/pdf/2309.03014v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.03004v1","updated":"2023-09-06T13:48:40Z","published":"2023-09-06T13:48:40Z","title":"Theoretical Explanation of Activation Sparsity through Flat Minima and\n Adversarial Robustness","summary":" A recent empirical observation of activation sparsity in MLP layers offers an\nopportunity to drastically reduce computation costs for free. Despite several\nworks attributing it to training dynamics, the theoretical explanation of\nactivation sparsity's emergence is restricted to shallow networks, small\ntraining steps, as well as modified training, even though the sparsity has been\nfound in deep models trained by vanilla protocols for large steps. To fill the\nthree gaps, we propose the notion of gradient sparsity as the source of\nactivation sparsity and a theoretical explanation based on it that explains\ngradient sparsity and then activation sparsity as necessary steps to\nadversarial robustness w.r.t. hidden features and parameters, which is\napproximately the flatness of minima for well-learned models. The theory\napplies to standardly trained LayerNorm-ed pure MLPs, and further to\nTransformers or other architectures if noises are added to weights during\ntraining. To eliminate other sources of flatness when arguing sparsities'\nnecessity, we discover the phenomenon of spectral concentration, i.e., the\nratio between the largest and the smallest non-zero singular values of weight\nmatrices is small. We utilize random matrix theory (RMT) as a powerful\ntheoretical tool to analyze stochastic gradient noises and discuss the\nemergence of spectral concentration. With these insights, we propose two\nplug-and-play modules for both training from scratch and sparsity finetuning,\nas well as one radical modification that only applies to from-scratch training.\nAnother under-testing module for both sparsity and flatness is also immediate\nfrom our theories. Validational experiments are conducted to verify our\nexplanation. 
Experiments for productivity demonstrate modifications'\nimprovement in sparsity, indicating further theoretical cost reduction in both\ntraining and inference.\n","authors":["Ze Peng","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2309.03004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09060v3","updated":"2023-09-06T13:42:28Z","published":"2022-09-19T14:50:48Z","title":"Deep Metric Learning with Chance Constraints","summary":" Deep metric learning (DML) aims to minimize empirical expected loss of the\npairwise intra-/inter- class proximity violations in the embedding space. We\nrelate DML to feasibility problem of finite chance constraints. We show that\nminimizer of proxy-based DML satisfies certain chance constraints, and that the\nworst case generalization performance of the proxy-based methods can be\ncharacterized by the radius of the smallest ball around a class proxy to cover\nthe entire domain of the corresponding class samples, suggesting multiple\nproxies per class helps performance. To provide a scalable algorithm as well as\nexploiting more proxies, we consider the chance constraints implied by the\nminimizers of proxy-based DML instances and reformulate DML as finding a\nfeasible point in intersection of such constraints, resulting in a problem to\nbe approximately solved by iterative projections. Simply put, we repeatedly\ntrain a regularized proxy-based loss and re-initialize the proxies with the\nembeddings of the deliberately selected new samples. We applied our method with\n4 well-accepted DML losses and show the effectiveness with extensive\nevaluations on 4 popular DML benchmarks. Code is available at:\nhttps://github.com/yetigurbuz/ccp-dml\n","authors":["Yeti Z. Gurbuz","Ogul Can","A. Aydin Alatan"],"pdf_url":"https://arxiv.org/pdf/2209.09060v3.pdf","comment":"Accepted as a conference paper at IEEE/CVF Winter Conference on\n Applications of Computer Vision (WACV) 2024"},{"id":"http://arxiv.org/abs/2307.05735v2","updated":"2023-09-06T13:23:39Z","published":"2023-07-11T19:03:17Z","title":"Improving Scientific Machine Learning via Attention and Multiple\n Shooting","summary":" Scientific Machine Learning (SciML) is a burgeoning field that\nsynergistically combines domain-aware and interpretable models with agnostic\nmachine learning techniques. In this work, we introduce GOKU-UI, an evolution\nof the SciML generative model GOKU-nets. GOKU-UI not only broadens the original\nmodel's spectrum to incorporate other classes of differential equations, such\nas Stochastic Differential Equations (SDEs), but also integrates attention\nmechanisms and a novel multiple shooting training strategy in the latent space.\nThese enhancements have led to a significant increase in its performance in\nboth reconstruction and forecast tasks, as demonstrated by our evaluation of\nsimulated and empirical data. Specifically, GOKU-UI outperformed all baseline\nmodels on synthetic datasets even with a training set 16-fold smaller,\nunderscoring its remarkable data efficiency. Furthermore, when applied to\nempirical human brain data, while incorporating stochastic Stuart-Landau\noscillators into its dynamical core, it not only surpassed all baseline methods\nin the reconstruction task, but also demonstrated better prediction of future\nbrain activity up to 15 seconds ahead. 
By training GOKU-UI on resting state\nfMRI data, we encoded whole-brain dynamics into a latent representation,\nlearning an effective low-dimensional dynamical system model that could offer\ninsights into brain functionality and open avenues for practical applications\nsuch as the classification of mental states or psychiatric conditions.\nUltimately, our research provides further impetus for the field of Scientific\nMachine Learning, showcasing the potential for advancements when established\nscientific insights are interwoven with modern machine learning.\n","authors":["Germán Abrevaya","Mahta Ramezanian-Panahi","Jean-Christophe Gagnon-Audet","Irina Rish","Pablo Polosecki","Silvina Ponce Dawson","Guillermo Cecchi","Guillaume Dumas"],"pdf_url":"https://arxiv.org/pdf/2307.05735v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02976v1","updated":"2023-09-06T13:20:31Z","published":"2023-09-06T13:20:31Z","title":"Natural and Robust Walking using Reinforcement Learning without\n Demonstrations in High-Dimensional Musculoskeletal Models","summary":" Humans excel at robust bipedal walking in complex natural environments. In\neach step, they adequately tune the interaction of biomechanical muscle\ndynamics and neuronal signals to be robust against uncertainties in ground\nconditions. However, it is still not fully understood how the nervous system\nresolves the musculoskeletal redundancy to solve the multi-objective control\nproblem considering stability, robustness, and energy efficiency. In computer\nsimulations, energy minimization has been shown to be a successful optimization\ntarget, reproducing natural walking with trajectory optimization or\nreflex-based control methods. However, these methods focus on particular\nmotions at a time and the resulting controllers are limited when compensating\nfor perturbations. In robotics, reinforcement learning~(RL) methods recently\nachieved highly stable (and efficient) locomotion on quadruped systems, but the\ngeneration of human-like walking with bipedal biomechanical models has required\nextensive use of expert data sets. This strong reliance on demonstrations often\nresults in brittle policies and limits the application to new behaviors,\nespecially considering the potential variety of movements for high-dimensional\nmusculoskeletal models in 3D. Achieving natural locomotion with RL without\nsacrificing its incredible robustness might pave the way for a novel approach\nto studying human walking in complex natural environments.\n","authors":["Pierre Schumacher","Thomas Geijtenbeek","Vittorio Caggiano","Vikash Kumar","Syn Schmitt","Georg Martius","Daniel F. B. Haeufle"],"pdf_url":"https://arxiv.org/pdf/2309.02976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02970v1","updated":"2023-09-06T13:09:01Z","published":"2023-09-06T13:09:01Z","title":"On the Impact of Feeding Cost Risk in Aquaculture Valuation and Decision\n Making","summary":" We study the effect of stochastic feeding costs on animal-based commodities\nwith particular focus on aquaculture. More specifically, we use soybean futures\nto infer on the stochastic behaviour of salmon feed, which we assume to follow\na Schwartz-2-factor model. We compare the decision of harvesting salmon using a\ndecision rule assuming either deterministic or stochastic feeding costs, i.e.\nincluding feeding cost risk. We identify cases, where accounting for stochastic\nfeeding costs leads to significant improvements as well as cases where\ndeterministic feeding costs are a good enough proxy. 
Nevertheless, in all of\nthese cases, the newly derived rules show superior performance, while the\nadditional computational costs are negligible. From a methodological point of\nview, we demonstrate how to use Deep-Neural-Networks to infer on the decision\nboundary that determines harvesting or continuation, improving on more\nclassical regression-based and curve-fitting methods. To achieve this we use a\ndeep classifier, which not only improves on previous results but also scales\nwell for higher dimensional problems, and in addition mitigates effects due to\nmodel uncertainty, which we identify in this article.\n","authors":["Christian Oliver Ewald","Kevin Kamm"],"pdf_url":"https://arxiv.org/pdf/2309.02970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01860v2","updated":"2023-09-06T13:06:45Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v2.pdf","comment":"This version has some errors. Our schedule is packed, so we don't\n have enough time to correct it. We will share another work when we have time\n to fix this"},{"id":"http://arxiv.org/abs/2309.02968v1","updated":"2023-09-06T13:05:42Z","published":"2023-09-06T13:05:42Z","title":"CR-VAE: Contrastive Regularization on Variational Autoencoders for\n Preventing Posterior Collapse","summary":" The Variational Autoencoder (VAE) is known to suffer from the phenomenon of\n\\textit{posterior collapse}, where the latent representations generated by the\nmodel become independent of the inputs. This leads to degenerated\nrepresentations of the input, which is attributed to the limitations of the\nVAE's objective function. In this work, we propose a novel solution to this\nissue, the Contrastive Regularization for Variational Autoencoders (CR-VAE).\nThe core of our approach is to augment the original VAE with a contrastive\nobjective that maximizes the mutual information between the representations of\nsimilar visual inputs. This strategy ensures that the information flow between\nthe input and its latent representation is maximized, effectively avoiding\nposterior collapse. We evaluate our method on a series of visual datasets and\ndemonstrate that CR-VAE outperforms state-of-the-art approaches in preventing\nposterior collapse.\n","authors":["Fotios Lygerakis","
Elmar Rueckert"],"pdf_url":"https://arxiv.org/pdf/2309.02968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03428v3","updated":"2023-09-06T12:57:40Z","published":"2023-03-06T19:00:27Z","title":"Towards provably efficient quantum algorithms for large-scale\n machine-learning models","summary":" Large machine learning models are revolutionary technologies of artificial\nintelligence whose bottlenecks include huge computational expenses, power, and\ntime used both in the pre-training and fine-tuning process. In this work, we\nshow that fault-tolerant quantum computing could possibly provide provably\nefficient resolutions for generic (stochastic) gradient descent algorithms,\nscaling as $\\mathcal{O}(T^2 \\times \\text{polylog}(n))$, where $n$ is the size\nof the models and $T$ is the number of iterations in the training, as long as\nthe models are both sufficiently dissipative and sparse, with small learning\nrates. Based on earlier efficient quantum algorithms for dissipative\ndifferential equations, we find and prove that similar algorithms work for\n(stochastic) gradient descent, the primary algorithm for machine learning. In\npractice, we benchmark instances of large machine learning models from 7\nmillion to 103 million parameters. We find that, in the context of sparse\ntraining, a quantum enhancement is possible at the early stage of learning\nafter model pruning, motivating a sparse parameter download and re-upload\nscheme. Our work shows solidly that fault-tolerant quantum algorithms could\npotentially contribute to most state-of-the-art, large-scale machine-learning\nproblems.\n","authors":["Junyu Liu","Minzhao Liu","Jin-Peng Liu","Ziyu Ye","Yuri Alexeev","Jens Eisert","Liang Jiang"],"pdf_url":"https://arxiv.org/pdf/2303.03428v3.pdf","comment":"6+39 pages, 3+10 figures, substantial detail added"},{"id":"http://arxiv.org/abs/2309.02954v1","updated":"2023-09-06T12:43:18Z","published":"2023-09-06T12:43:18Z","title":"M3D-NCA: Robust 3D Segmentation with Built-in Quality Control","summary":" Medical image segmentation relies heavily on large-scale deep learning\nmodels, such as UNet-based architectures. However, the real-world utility of\nsuch models is limited by their high computational requirements, which makes\nthem impractical for resource-constrained environments such as primary care\nfacilities and conflict zones. Furthermore, shifts in the imaging domain can\nrender these models ineffective and even compromise patient safety if such\nerrors go undetected. To address these challenges, we propose M3D-NCA, a novel\nmethodology that leverages Neural Cellular Automata (NCA) segmentation for 3D\nmedical images using n-level patchification. Moreover, we exploit the variance\nin M3D-NCA to develop a novel quality metric which can automatically detect\nerrors in the segmentation process of NCAs. M3D-NCA outperforms the two\nmagnitudes larger UNet models in hippocampus and prostate segmentation by 2%\nDice and can be run on a Raspberry Pi 4 Model B (2GB RAM). 
This highlights the\npotential of M3D-NCA as an effective and efficient alternative for medical\nimage segmentation in resource-constrained environments.\n","authors":["John Kalkhof","Anirban Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2309.02954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04497v2","updated":"2023-09-06T12:41:48Z","published":"2023-04-10T10:22:21Z","title":"A Unified Framework for Exploratory Learning-Aided Community Detection\n in Networks with Unknown Topology","summary":" In social networks, the discovery of community structures has received\nconsiderable attention as a fundamental problem in various network analysis\ntasks. However, due to privacy concerns or access restrictions, the network\nstructure is often unknown, thereby rendering established community detection\napproaches ineffective without costly network topology acquisition. To tackle\nthis challenge, we present META-CODE, a unified framework for detecting\noverlapping communities in networks with unknown topology via exploratory\nlearning aided by easy-to-collect node metadata. Specifically, META-CODE\nconsists of three iterative steps in addition to the initial network inference\nstep: 1) node-level community-affiliation embeddings based on graph neural\nnetworks (GNNs) trained by our new reconstruction loss, 2) network exploration\nvia community-affiliation-based node queries, and 3) network inference using an\nedge connectivity-based Siamese neural network model from the explored network.\nThrough extensive experiments on five real-world datasets including two large\nnetworks, we demonstrated: (a) the superiority of META-CODE over benchmark\ncommunity detection methods, achieving remarkable gains up to 151.27% compared\nto the best existing competitor, (b) the impact of each module in META-CODE,\n(c) the effectiveness of node queries in META-CODE based on empirical\nevaluations and theoretical findings, (d) the convergence of the inferred\nnetwork, and (e) the computational efficiency of META-CODE.\n","authors":["Yu Hou","Cong Tran","Ming Li","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2304.04497v2.pdf","comment":"16 pages, 9 figures, 6 tables; its conference version was presented\n at the ACM International Conference on Information and Knowledge Management\n (CIKM 2022)"},{"id":"http://arxiv.org/abs/2304.06044v2","updated":"2023-09-06T12:31:38Z","published":"2023-04-10T19:58:49Z","title":"Learning solution of nonlinear constitutive material models using\n physics-informed neural networks: COMM-PINN","summary":" We applied physics-informed neural networks to solve the constitutive\nrelations for nonlinear, path-dependent material behavior. As a result, the\ntrained network not only satisfies all thermodynamic constraints but also\ninstantly provides information about the current material state (i.e., free\nenergy, stress, and the evolution of internal variables) under any given\nloading scenario without requiring initial data. One advantage of this work is\nthat it bypasses the repetitive Newton iterations needed to solve nonlinear\nequations in complex material models. Additionally, strategies are provided to\nreduce the required order of derivative for obtaining the tangent operator. The\ntrained model can be directly used in any finite element package (or other\nnumerical methods) as a user-defined material model. 
However, challenges remain\nin the proper definition of collocation points and in integrating several\nnon-equality constraints that become active or non-active simultaneously. We\ntested this methodology on rate-independent processes such as the classical von\nMises plasticity model with a nonlinear hardening law, as well as local damage\nmodels for interface cracking behavior with a nonlinear softening law. In order\nto demonstrate the applicability of the methodology in handling complex path\ndependency in a three-dimensional (3D) scenario, we tested the approach using\nthe equations governing a damage model for a three-dimensional interface model.\nSuch models are frequently employed for intergranular fracture at grain\nboundaries. We have observed a perfect agreement between the results obtained\nthrough the proposed methodology and those obtained using the classical\napproach. Furthermore, the proposed approach requires significantly less effort\nin terms of implementation and computing time compared to the traditional\nmethods.\n","authors":["Shahed Rezaei","Ahmad Moeineddin","Ali Harandi"],"pdf_url":"https://arxiv.org/pdf/2304.06044v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12655v3","updated":"2023-09-06T12:20:43Z","published":"2022-01-29T20:39:58Z","title":"Error Scaling Laws for Kernel Classification under Source and Capacity\n Conditions","summary":" We consider the problem of kernel classification. While worst-case bounds on\nthe decay rate of the prediction error with the number of samples are known for\nsome classifiers, they often fail to accurately describe the learning curves of\nreal data sets. In this work, we consider the important class of data sets\nsatisfying the standard source and capacity conditions, comprising a number of\nreal data sets as we show numerically. Under the Gaussian design, we derive the\ndecay rates for the misclassification (prediction) error as a function of the\nsource and capacity coefficients. We do so for two standard kernel\nclassification settings, namely margin-maximizing Support Vector Machines (SVM)\nand ridge classification, and contrast the two methods. We find that our rates\ntightly describe the learning curves for this class of data sets, and are also\nobserved on real data. Our results can also be seen as an explicit prediction\nof the exponents of a scaling law for kernel classification that is accurate on\nsome real datasets.\n","authors":["Hugo Cui","Bruno Loureiro","Florent Krzakala","Lenka Zdeborová"],"pdf_url":"https://arxiv.org/pdf/2201.12655v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09164v2","updated":"2023-09-06T12:15:06Z","published":"2023-01-22T17:12:58Z","title":"Unifying Synergies between Self-supervised Learning and Dynamic\n Computation","summary":" Computationally expensive training strategies make self-supervised learning\n(SSL) impractical for resource-constrained industrial settings. Techniques like\nknowledge distillation (KD), dynamic computation (DC), and pruning are often\nused to obtain a lightweight model, which usually involves multiple epochs of\nfine-tuning (or distilling steps) of a large pre-trained model, making it more\ncomputationally challenging. In this work we present a novel perspective on the\ninterplay between SSL and DC paradigms. In particular, we show that it is\nfeasible to simultaneously learn a dense and gated sub-network from scratch in\nan SSL setting without any additional fine-tuning or pruning steps. 
The\nco-evolution during pre-training of both the dense and gated encoders offers a good\naccuracy-efficiency trade-off and therefore yields a generic and multi-purpose\narchitecture for application-specific industrial settings. Extensive\nexperiments on several image classification benchmarks including CIFAR-10/100,\nSTL-10 and ImageNet-100, demonstrate that the proposed training strategy\nprovides a dense and corresponding gated sub-network that achieves on-par\nperformance compared with the vanilla self-supervised setting, but at a\nsignificant reduction in computation in terms of FLOPs, under a range of target\nbudgets (td).\n","authors":["Tarun Krishna","Ayush K Rai","Alexandru Drimbarean","Eric Arazo","Paul Albert","Alan F Smeaton","Kevin McGuinness","Noel E O'Connor"],"pdf_url":"https://arxiv.org/pdf/2301.09164v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02935v1","updated":"2023-09-06T11:55:16Z","published":"2023-09-06T11:55:16Z","title":"Estimating irregular water demands with physics-informed machine\n learning to inform leakage detection","summary":" Leakages in drinking water distribution networks pose significant challenges\nto water utilities, leading to infrastructure failure, operational disruptions,\nenvironmental hazards, property damage, and economic losses. The timely\nidentification and accurate localisation of such leakages is paramount for\nutilities to mitigate these unwanted effects. However, implementation of\nalgorithms for leakage detection is limited in practice by requirements of\neither hydraulic models or large amounts of training data. Physics-informed\nmachine learning can utilise hydraulic information thereby circumventing both\nlimitations. In this work, we present a physics-informed machine learning\nalgorithm that analyses pressure data and therefrom estimates unknown irregular\nwater demands via a fully connected neural network, ultimately leveraging the\nBernoulli equation and effectively linearising the leakage detection problem.\nOur algorithm is tested on data from the L-Town benchmark network, and results\nindicate a good capability for estimating most irregular demands, with R2\nlarger than 0.8. Identification results for leakages under the presence of\nirregular demands could be improved by a factor of 5.3 for abrupt leaks and a\nfactor of 3.0 for incipient leaks when compared to the results disregarding\nirregular demands.\n","authors":["Ivo Daniel","Andrea Cominola"],"pdf_url":"https://arxiv.org/pdf/2309.02935v1.pdf","comment":"submitted to Water Research on July 17th, 2023"},{"id":"http://arxiv.org/abs/2305.13233v3","updated":"2023-09-06T11:51:36Z","published":"2023-05-22T17:05:34Z","title":"Estimating Gibbs free energies via isobaric-isothermal flows","summary":" We present a machine-learning model based on normalizing flows that is\ntrained to sample from the isobaric-isothermal ensemble. In our approach, we\napproximate the joint distribution of a fully-flexible triclinic simulation box\nand particle coordinates to achieve a desired internal pressure. This novel\nextension of flow-based sampling to the isobaric-isothermal ensemble yields\ndirect estimates of Gibbs free energies. 
We test our NPT-flow on monatomic\nwater in the cubic and hexagonal ice phases and find excellent agreement of\nGibbs free energies and other observables compared with established baselines.\n","authors":["Peter Wirnsberger","Borja Ibarz","George Papamakarios"],"pdf_url":"https://arxiv.org/pdf/2305.13233v3.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.01866v2","updated":"2023-09-06T11:50:03Z","published":"2023-09-05T00:14:12Z","title":"Efficient Query-Based Attack against ML-Based Android Malware Detection\n under Zero Knowledge Setting","summary":" The widespread adoption of the Android operating system has made malicious\nAndroid applications an appealing target for attackers. Machine learning-based\n(ML-based) Android malware detection (AMD) methods are crucial in addressing\nthis problem; however, their vulnerability to adversarial examples raises\nconcerns. Current attacks against ML-based AMD methods demonstrate remarkable\nperformance but rely on strong assumptions that may not be realistic in\nreal-world scenarios, e.g., the knowledge requirements about feature space,\nmodel parameters, and training dataset. To address this limitation, we\nintroduce AdvDroidZero, an efficient query-based attack framework against\nML-based AMD methods that operates under the zero knowledge setting. Our\nextensive evaluation shows that AdvDroidZero is effective against various\nmainstream ML-based AMD methods, in particular, state-of-the-art such methods\nand real-world antivirus solutions.\n","authors":["Ping He","Yifan Xia","Xuhong Zhang","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2309.01866v2.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November, 2023"},{"id":"http://arxiv.org/abs/2305.18394v4","updated":"2023-09-06T11:47:47Z","published":"2023-05-28T12:34:07Z","title":"On Optimal Regularization Parameters via Bilevel Learning","summary":" Variational regularization is commonly used to solve linear inverse problems,\nand involves augmenting a data fidelity by a regularizer. The regularizer is\nused to promote a priori information and is weighted by a regularization\nparameter. Selection of an appropriate regularization parameter is critical,\nwith various choices leading to very different reconstructions. Classical\nstrategies used to determine a suitable parameter value include the discrepancy\nprinciple and the L-curve criterion, and in recent years a supervised machine\nlearning approach called bilevel learning has been employed. Bilevel learning\nis a powerful framework to determine optimal parameters and involves solving a\nnested optimization problem. While previous strategies enjoy various\ntheoretical results, the well-posedness of bilevel learning in this setting is\nstill an open question. In particular, a necessary property is positivity of\nthe determined regularization parameter. In this work, we provide a new\ncondition that better characterizes positivity of optimal regularization\nparameters than the existing theory. Numerical results verify and explore this\nnew condition for both small and high-dimensional problems.\n","authors":["Matthias J. Ehrhardt","Silvia Gazzola","Sebastian J. Scott"],"pdf_url":"https://arxiv.org/pdf/2305.18394v4.pdf","comment":"32 pages, 11 figures. Restructured and streamlined proof. 
Provided\n more numerical results"},{"id":"http://arxiv.org/abs/2308.16453v3","updated":"2023-09-06T11:38:54Z","published":"2023-08-31T04:45:44Z","title":"Listen to Minority: Encrypted Traffic Classification for Class Imbalance\n with Contrastive Pre-Training","summary":" Mobile Internet has profoundly reshaped modern lifestyles in various aspects.\nEncrypted Traffic Classification (ETC) naturally plays a crucial role in\nmanaging mobile Internet, especially with the explosive growth of mobile apps\nusing encrypted communication. Despite some existing learning-based ETC methods\nshowing promising results, three-fold limitations still remain in real-world\nnetwork environments, 1) label bias caused by traffic class imbalance, 2)\ntraffic homogeneity caused by component sharing, and 3) training with reliance\non sufficient labeled traffic. None of the existing ETC methods can address all\nthese limitations. In this paper, we propose a novel Pre-trAining\nSemi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the\noriginal train dataset and perform contrastive pre-training without using\nindividual app labels directly to avoid label bias issues caused by class\nimbalance, while obtaining a robust feature representation to differentiate\noverlapping homogeneous traffic by pulling positive traffic pairs closer and\npushing negative pairs away. Meanwhile, PASS designs a semi-supervised\noptimization strategy based on pseudo-label iteration and dynamic loss\nweighting algorithms in order to effectively utilize massive unlabeled traffic\ndata and alleviate manual train dataset annotation workload. PASS outperforms\nstate-of-the-art ETC methods and generic sampling approaches on four public\ndatasets with significant class imbalance and traffic homogeneity, remarkably\nimproving the F1 score on Cross-Platform215 by 1.31% and on ISCX-17 by 9.12%.\nFurthermore, we validate the generality of the contrastive pre-training and\npseudo-label iteration components of PASS, which can adaptively benefit ETC\nmethods with diverse feature extractors.\n","authors":["Xiang Li","Juncheng Guo","Qige Song","Jiang Xie","Yafei Sang","Shuyuan Zhao","Yongzheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16453v3.pdf","comment":"Accepted by 2023 20th Annual IEEE International Conference on\n Sensing, Communication, and Networking, 9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2212.01071v2","updated":"2023-09-06T11:38:00Z","published":"2022-12-02T10:22:18Z","title":"Fake detection in imbalance dataset by Semi-supervised learning with GAN","summary":" As social media grows faster, harassment becomes more prevalent, which has made\nfake detection a fascinating field among researchers. The graph\nnature of the data, with its large number of nodes, raises several obstacles,\nincluding a considerable amount of unrelated features in the matrices, high\ndispersion, and imbalanced classes in the dataset. To deal with these issues,\nauto-encoders and a combination of semi-supervised learning and the GAN\nalgorithm, called SGAN, were used. This paper deploys a small\nnumber of labels and applies SGAN as a classifier. 
The result of this test\nshowed that the accuracy had reached 91\\% in detecting fake accounts using only\n100 labeled samples.\n","authors":["Jinus Bordbar","Saman Ardalan","Mohammadreza Mohammadrezaie","Mohammad Ebrahim Shiri"],"pdf_url":"https://arxiv.org/pdf/2212.01071v2.pdf","comment":"have a more complete script in this subject"},{"id":"http://arxiv.org/abs/2309.02917v1","updated":"2023-09-06T11:22:21Z","published":"2023-09-06T11:22:21Z","title":"GroupEnc: encoder with group loss for global structure preservation","summary":" Recent advances in dimensionality reduction have achieved more accurate\nlower-dimensional embeddings of high-dimensional data. In addition to\nvisualisation purposes, these embeddings can be used for downstream processing,\nincluding batch effect normalisation, clustering, community detection or\ntrajectory inference. We use the notion of structure preservation at both local\nand global levels to create a deep learning model, based on a variational\nautoencoder (VAE) and the stochastic quartet loss from the SQuadMDS algorithm.\nOur encoder model, called GroupEnc, uses a 'group loss' function to create\nembeddings with less global structure distortion than VAEs do, while keeping\nthe model parametric and the architecture flexible. We validate our approach\nusing publicly available biological single-cell transcriptomic datasets,\nemploying RNX curves for evaluation.\n","authors":["David Novak","Sofie Van Gassen","Yvan Saeys"],"pdf_url":"https://arxiv.org/pdf/2309.02917v1.pdf","comment":"Submitted to BNAIC/BeNeLearn 2023"},{"id":"http://arxiv.org/abs/2309.02915v1","updated":"2023-09-06T11:20:41Z","published":"2023-09-06T11:20:41Z","title":"Persona-aware Generative Model for Code-mixed Language","summary":" Code-mixing and script-mixing are prevalent across online social networks and\nmultilingual societies. However, a user's preference toward code-mixing depends\non the socioeconomic status, demographics of the user, and the local context,\nwhich existing generative models mostly ignore while generating code-mixed\ntexts. In this work, we make a pioneering attempt to develop a persona-aware\ngenerative model to generate texts resembling real-life code-mixed texts of\nindividuals. We propose a Persona-aware Generative Model for Code-mixed\nGeneration, PARADOX, a novel Transformer-based encoder-decoder model that\nencodes an utterance conditioned on a user's persona and generates code-mixed\ntexts without monolingual reference data. We propose an alignment module that\nre-calibrates the generated sequence to resemble real-life code-mixed texts.\nPARADOX generates code-mixed texts that are semantically more meaningful and\nlinguistically more valid. To evaluate the personification capabilities of\nPARADOX, we propose four new metrics -- CM BLEU, CM Rouge-1, CM Rouge-L and CM\nKS. On average, PARADOX achieves 1.6 points better CM BLEU, 47% better\nperplexity and 32% better semantic coherence than the non-persona-based\ncounterparts.\n","authors":["Ayan Sengupta","Md Shad Akhtar","Tanmoy Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2309.02915v1.pdf","comment":"4 tables, 4 figures"},{"id":"http://arxiv.org/abs/2302.03347v3","updated":"2023-09-06T11:20:28Z","published":"2023-02-07T09:41:21Z","title":"An Informative Path Planning Framework for Active Learning in UAV-based\n Semantic Mapping","summary":" Unmanned aerial vehicles (UAVs) are frequently used for aerial mapping and\ngeneral monitoring tasks. 
Recent progress in deep learning enabled automated\nsemantic segmentation of imagery to facilitate the interpretation of\nlarge-scale complex environments. Commonly used supervised deep learning for\nsegmentation relies on large amounts of pixel-wise labelled data, which is\ntedious and costly to annotate. The domain-specific visual appearance of aerial\nenvironments often prevents the usage of models pre-trained on publicly\navailable datasets. To address this, we propose a novel general planning\nframework for UAVs to autonomously acquire informative training images for\nmodel re-training. We leverage multiple acquisition functions and fuse them\ninto probabilistic terrain maps. Our framework combines the mapped acquisition\nfunction information into the UAV's planning objectives. In this way, the UAV\nadaptively acquires informative aerial images to be manually labelled for model\nre-training. Experimental results on real-world data and in a photorealistic\nsimulation show that our framework maximises model performance and drastically\nreduces labelling efforts. Our map-based planners outperform state-of-the-art\nlocal planning.\n","authors":["Julius Rückin","Federico Magistri","Cyrill Stachniss","Marija Popović"],"pdf_url":"https://arxiv.org/pdf/2302.03347v3.pdf","comment":"18 pages, 24 figures"},{"id":"http://arxiv.org/abs/2309.02913v1","updated":"2023-09-06T11:19:26Z","published":"2023-09-06T11:19:26Z","title":"Ensemble DNN for Age-of-Information Minimization in UAV-assisted\n Networks","summary":" This paper addresses the problem of Age-of-Information (AoI) in UAV-assisted\nnetworks. Our objective is to minimize the expected AoI across devices by\noptimizing UAVs' stopping locations and device selection probabilities. To\ntackle this problem, we first derive a closed-form expression of the expected\nAoI that involves the probabilities of selection of devices. Then, we formulate\nthe problem as a non-convex minimization subject to quality of service\nconstraints. Since the problem is challenging to solve, we propose an Ensemble\nDeep Neural Network (EDNN) based approach which takes advantage of the dual\nformulation of the studied problem. Specifically, the Deep Neural Networks\n(DNNs) in the ensemble are trained in an unsupervised manner using the\nLagrangian function of the studied problem. Our experiments show that the\nproposed EDNN method outperforms traditional DNNs in reducing the expected AoI,\nachieving a remarkable reduction of $29.5\\%$.\n","authors":["Mouhamed Naby Ndiaye","El Houcine Bergou","Hajar El Hammouti"],"pdf_url":"https://arxiv.org/pdf/2309.02913v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.02911v1","updated":"2023-09-06T11:13:34Z","published":"2023-09-06T11:13:34Z","title":"A Multimodal Learning Framework for Comprehensive 3D Mineral\n Prospectivity Modeling with Jointly Learned Structure-Fluid Relationships","summary":" This study presents a novel multimodal fusion model for three-dimensional\nmineral prospectivity mapping (3D MPM), effectively integrating structural and\nfluid information through a deep network architecture. Leveraging Convolutional\nNeural Networks (CNN) and Multilayer Perceptrons (MLP), the model employs\ncanonical correlation analysis (CCA) to align and fuse multimodal features.\nRigorous evaluation on the Jiaojia gold deposit dataset demonstrates the\nmodel's superior performance in distinguishing ore-bearing instances and\npredicting mineral prospectivity, outperforming other models in result\nanalyses. 
Ablation studies further reveal the benefits of joint feature\nutilization and CCA incorporation. This research not only advances mineral\nprospectivity modeling but also highlights the pivotal role of data integration\nand feature alignment for enhanced exploration decision-making.\n","authors":["Yang Zheng","Hao Deng","Ruisheng Wang","Jingjie Wu"],"pdf_url":"https://arxiv.org/pdf/2309.02911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02908v1","updated":"2023-09-06T11:02:53Z","published":"2023-09-06T11:02:53Z","title":"DECODE: Data-driven Energy Consumption Prediction leveraging Historical\n Data and Environmental Factors in Buildings","summary":" Energy prediction in buildings plays a crucial role in effective energy\nmanagement. Precise predictions are essential for achieving optimal energy\nconsumption and distribution within the grid. This paper introduces a Long\nShort-Term Memory (LSTM) model designed to forecast building energy consumption\nusing historical energy data, occupancy patterns, and weather conditions. The\nLSTM model provides accurate short, medium, and long-term energy predictions\nfor residential and commercial buildings compared to existing prediction\nmodels. We compare our LSTM model with established prediction methods,\nincluding linear regression, decision trees, and random forest. Encouragingly,\nthe proposed LSTM model emerges as the superior performer across all metrics.\nIt demonstrates exceptional prediction accuracy, boasting the highest R2 score\nof 0.97 and the most favorable mean absolute error (MAE) of 0.007. An\nadditional advantage of our developed model is its capacity to achieve\nefficient energy consumption forecasts even when trained on a limited dataset.\nWe address concerns about overfitting (variance) and underfitting (bias)\nthrough rigorous training and evaluation on real-world data. In summary, our\nresearch contributes to energy prediction by offering a robust LSTM model that\noutperforms alternative methods and operates with remarkable efficiency,\ngeneralizability, and reliability.\n","authors":["Aditya Mishra","Haroon R. Lone","Aayush Mishra"],"pdf_url":"https://arxiv.org/pdf/2309.02908v1.pdf","comment":"11 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.02898v1","updated":"2023-09-06T10:41:30Z","published":"2023-09-06T10:41:30Z","title":"A Unified Framework for Discovering Discrete Symmetries","summary":" We consider the problem of learning a function respecting a symmetry from\namong a class of symmetries. We develop a unified framework that enables\nsymmetry discovery across a broad range of subgroups including locally\nsymmetric, dihedral and cyclic subgroups. At the core of the framework is a\nnovel architecture composed of linear and tensor-valued functions that\nexpresses functions invariant to these subgroups in a principled manner. The\nstructure of the architecture enables us to leverage multi-armed bandit\nalgorithms and gradient descent to efficiently optimize over the linear and the\ntensor-valued functions, respectively, and to infer the symmetry that is\nultimately learnt. We also discuss the necessity of the tensor-valued functions\nin the architecture. Experiments on image-digit sum and polynomial regression\ntasks demonstrate the effectiveness of our approach.\n","authors":["Pavan Karjol","Rohan Kashyap","Aditya Gopalan","Prathosh A. 
P"],"pdf_url":"https://arxiv.org/pdf/2309.02898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06237v2","updated":"2023-09-06T10:13:09Z","published":"2023-04-13T03:20:45Z","title":"Deep learning based ECG segmentation for delineation of diverse\n arrhythmias","summary":" Accurate delineation of key waveforms in an ECG is a critical initial step in\nextracting relevant features to support the diagnosis and treatment of heart\nconditions. Although deep learning based methods using a segmentation model to\nlocate the P, QRS, and T waves have shown promising results, their ability to\nhandle signals exhibiting arrhythmia remains unclear. This study builds on\nexisting research by introducing a U-Net-like segmentation model for ECG\ndelineation, with a particular focus on diverse arrhythmias. For this purpose,\nwe curate an internal dataset containing waveform boundary annotations for\nvarious arrhythmia types to train and validate our model. Our key contributions\ninclude identifying segmentation model failures in different arrhythmia types,\ndeveloping a robust model using a diverse training set, achieving comparable\nperformance on benchmark datasets, and introducing a classification guided\nstrategy to reduce false P wave predictions for specific arrhythmias. This\nstudy advances deep learning based ECG delineation in the context of\narrhythmias and highlights its challenges.\n","authors":["Chankyu Joung","Mijin Kim","Taejin Paik","Seong-Ho Kong","Seung-Young Oh","Won Kyeong Jeon","Jae-hu Jeon","Joong-Sik Hong","Wan-Joong Kim","Woong Kook","Myung-Jin Cha","Otto van Koert"],"pdf_url":"https://arxiv.org/pdf/2304.06237v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02876v1","updated":"2023-09-06T10:02:58Z","published":"2023-09-06T10:02:58Z","title":"Non-Clashing Teaching Maps for Balls in Graphs","summary":" Recently, Kirkpatrick et al. [ALT 2019] and Fallat et al. [JMLR 2023]\nintroduced non-clashing teaching and showed it to be the most efficient machine\nteaching model satisfying the benchmark for collusion-avoidance set by Goldman\nand Mathias. A teaching map $T$ for a concept class $\\cal{C}$ assigns a\n(teaching) set $T(C)$ of examples to each concept $C \\in \\cal{C}$. A teaching\nmap is non-clashing if no pair of concepts are consistent with the union of\ntheir teaching sets. The size of a non-clashing teaching map (NCTM) $T$ is the\nmaximum size of a $T(C)$, $C \\in \\cal{C}$. The non-clashing teaching dimension\nNCTD$(\\cal{C})$ of $\\cal{C}$ is the minimum size of an NCTM for $\\cal{C}$.\nNCTM$^+$ and NCTD$^+(\\cal{C})$ are defined analogously, except the teacher may\nonly use positive examples.\n We study NCTMs and NCTM$^+$s for the concept class $\\mathcal{B}(G)$\nconsisting of all balls of a graph $G$. We show that the associated decision\nproblem {\\sc B-NCTD$^+$} for NCTD$^+$ is NP-complete in split, co-bipartite,\nand bipartite graphs. Surprisingly, we even prove that, unless the ETH fails,\n{\\sc B-NCTD$^+$} does not admit an algorithm running in time\n$2^{2^{o(vc)}}\\cdot n^{O(1)}$, nor a kernelization algorithm outputting a\nkernel with $2^{o(vc)}$ vertices, where vc is the vertex cover number of $G$.\nThese are extremely rare results: it is only the second (fourth, resp.) problem\nin NP to admit a double-exponential lower bound parameterized by vc (treewidth,\nresp.), and only one of very few problems to admit an ETH-based conditional\nlower bound on the number of vertices in a kernel. We complement these lower\nbounds with matching upper bounds. 
For trees, interval graphs, cycles, and\ntrees of cycles, we derive NCTM$^+$s or NCTMs for $\\mathcal{B}(G)$ of size\nproportional to its VC-dimension. For Gromov-hyperbolic graphs, we design an\napproximate NCTM$^+$ for $\\mathcal{B}(G)$ of size 2.\n","authors":["Jérémie Chalopin","Victor Chepoi","Fionn Mc Inerney","Sébastien Ratel"],"pdf_url":"https://arxiv.org/pdf/2309.02876v1.pdf","comment":"Shortened abstract due to character limit"},{"id":"http://arxiv.org/abs/2309.02873v1","updated":"2023-09-06T09:57:58Z","published":"2023-09-06T09:57:58Z","title":"Learning Hybrid Dynamics Models With Simulator-Informed Latent States","summary":" Dynamics model learning deals with the task of inferring unknown dynamics\nfrom measurement data and predicting the future behavior of the system. A\ntypical approach to address this problem is to train recurrent models. However,\npredictions with these models are often not physically meaningful. Further,\nthey suffer from deteriorated behavior over time due to accumulating errors.\nOften, simulators building on first principles are available being physically\nmeaningful by design. However, modeling simplifications typically cause\ninaccuracies in these models. Consequently, hybrid modeling is an emerging\ntrend that aims to combine the best of both worlds. In this paper, we propose a\nnew approach to hybrid modeling, where we inform the latent states of a learned\nmodel via a black-box simulator. This allows to control the predictions via the\nsimulator preventing them from accumulating errors. This is especially\nchallenging since, in contrast to previous approaches, access to the\nsimulator's latent states is not available. We tackle the task by leveraging\nobservers, a well-known concept from control theory, inferring unknown latent\nstates from observations and dynamics over time. In our learning-based setting,\nwe jointly learn the dynamics and an observer that infers the latent states via\nthe simulator. Thus, the simulator constantly corrects the latent states,\ncompensating for modeling mismatch caused by learning. To maintain flexibility,\nwe train an RNN-based residuum for the latent states that cannot be informed by\nthe simulator.\n","authors":["Katharina Ensinger","Sebastian Ziesche","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2309.02873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02870v1","updated":"2023-09-06T09:49:20Z","published":"2023-09-06T09:49:20Z","title":"Rethinking Momentum Knowledge Distillation in Online Continual Learning","summary":" Online Continual Learning (OCL) addresses the problem of training neural\nnetworks on a continuous data stream where multiple classification tasks emerge\nin sequence. In contrast to offline Continual Learning, data can be seen only\nonce in OCL. In this context, replay-based strategies have achieved impressive\nresults and most state-of-the-art approaches are heavily depending on them.\nWhile Knowledge Distillation (KD) has been extensively used in offline\nContinual Learning, it remains under-exploited in OCL, despite its potential.\nIn this paper, we theoretically analyze the challenges in applying KD to OCL.\nWe introduce a direct yet effective methodology for applying Momentum Knowledge\nDistillation (MKD) to many flagship OCL methods and demonstrate its\ncapabilities to enhance existing approaches. 
In addition to improving existing\nstate-of-the-arts accuracy by more than $10\\%$ points on ImageNet100, we shed\nlight on MKD internal mechanics and impacts during training in OCL. We argue\nthat similar to replay, MKD should be considered a central component of OCL.\n","authors":["Nicolas Michel","Maorong Wang","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2309.02870v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2309.02869v1","updated":"2023-09-06T09:47:36Z","published":"2023-09-06T09:47:36Z","title":"On Reducing Undesirable Behavior in Deep Reinforcement Learning Models","summary":" Deep reinforcement learning (DRL) has proven extremely useful in a large\nvariety of application domains. However, even successful DRL-based software can\nexhibit highly undesirable behavior. This is due to DRL training being based on\nmaximizing a reward function, which typically captures general trends but\ncannot precisely capture, or rule out, certain behaviors of the system. In this\npaper, we propose a novel framework aimed at drastically reducing the\nundesirable behavior of DRL-based software, while maintaining its excellent\nperformance. In addition, our framework can assist in providing engineers with\na comprehensible characterization of such undesirable behavior. Under the hood,\nour approach is based on extracting decision tree classifiers from erroneous\nstate-action pairs, and then integrating these trees into the DRL training\nloop, penalizing the system whenever it performs an error. We provide a\nproof-of-concept implementation of our approach, and use it to evaluate the\ntechnique on three significant case studies. We find that our approach can\nextend existing frameworks in a straightforward manner, and incurs only a\nslight overhead in training time. Further, it incurs only a very slight hit to\nperformance, or even in some cases - improves it, while significantly reducing\nthe frequency of undesirable behavior.\n","authors":["Ophir Carmel","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2309.02869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02868v1","updated":"2023-09-06T09:47:03Z","published":"2023-09-06T09:47:03Z","title":"Enhancing Event Sequence Modeling with Contrastive Relational Inference","summary":" Neural temporal point processes(TPPs) have shown promise for modeling\ncontinuous-time event sequences. However, capturing the interactions between\nevents is challenging yet critical for performing inference tasks like\nforecasting on event sequence data. Existing TPP models have focused on\nparameterizing the conditional distribution of future events but struggle to\nmodel event interactions. In this paper, we propose a novel approach that\nleverages Neural Relational Inference (NRI) to learn a relation graph that\ninfers interactions while simultaneously learning the dynamics patterns from\nobservational data. Our approach, the Contrastive Relational Inference-based\nHawkes Process (CRIHP), reasons about event interactions under a variational\ninference framework. It utilizes intensity-based learning to search for\nprototype paths to contrast relationship constraints. 
Extensive experiments on\nthree real-world datasets demonstrate the effectiveness of our model in\ncapturing event interactions for event sequence modeling tasks.\n","authors":["Yan Wang","Zhixuan Chu","Tao Zhou","Caigao Jiang","Hongyan Hao","Minjie Zhu","Xindong Cai","Qing Cui","Longfei Li","James Y Zhang","Siqiao Xue","Jun Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.02868v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.02858v1","updated":"2023-09-06T09:39:33Z","published":"2023-09-06T09:39:33Z","title":"Generalised Mutual Information: a Framework for Discriminative\n Clustering","summary":" In the last decade, recent successes in deep clustering majorly involved the\nMutual Information (MI) as an unsupervised objective for training neural\nnetworks with increasing regularisations. While the quality of the\nregularisations have been largely discussed for improvements, little attention\nhas been dedicated to the relevance of MI as a clustering objective. In this\npaper, we first highlight how the maximisation of MI does not lead to\nsatisfying clusters. We identified the Kullback-Leibler divergence as the main\nreason of this behaviour. Hence, we generalise the mutual information by\nchanging its core distance, introducing the Generalised Mutual Information\n(GEMINI): a set of metrics for unsupervised neural network training. Unlike MI,\nsome GEMINIs do not require regularisations when training as they are\ngeometry-aware thanks to distances or kernels in the data space. Finally, we\nhighlight that GEMINIs can automatically select a relevant number of clusters,\na property that has been little studied in deep discriminative clustering\ncontext where the number of clusters is a priori unknown.\n","authors":["Louis Ohl","Pierre-Alexandre Mattei","Charles Bouveyron","Warith Harchaoui","Mickaël Leclercq","Arnaud Droit","Frédéric Precioso"],"pdf_url":"https://arxiv.org/pdf/2309.02858v1.pdf","comment":"Submitted for review at the IEEE Transactions on Pattern Analysis and\n Machine Intelligence. This article is an extension of an original NeurIPS\n 2022 article [arXiv:2210.06300]"},{"id":"http://arxiv.org/abs/2309.01740v2","updated":"2023-09-06T09:34:53Z","published":"2023-09-04T17:58:01Z","title":"An Empirical Analysis for Zero-Shot Multi-Label Classification on\n COVID-19 CT Scans and Uncurated Reports","summary":" The pandemic resulted in vast repositories of unstructured data, including\nradiology reports, due to increased medical examinations. Previous research on\nautomated diagnosis of COVID-19 primarily focuses on X-ray images, despite\ntheir lower precision compared to computed tomography (CT) scans. In this work,\nwe leverage unstructured data from a hospital and harness the fine-grained\ndetails offered by CT scans to perform zero-shot multi-label classification\nbased on contrastive visual language learning. In collaboration with human\nexperts, we investigate the effectiveness of multiple zero-shot models that aid\nradiologists in detecting pulmonary embolisms and identifying intricate lung\ndetails like ground glass opacities and consolidations. Our empirical analysis\nprovides an overview of the possible solutions to target such fine-grained\ntasks, so far overlooked in the medical multimodal pretraining literature. 
Our\ninvestigation promises future advancements in the medical image analysis\ncommunity by addressing some challenges associated with unstructured data and\nfine-grained multi-label classification.\n","authors":["Ethan Dack","Lorenzo Brigato","Matthew McMurray","Matthias Fontanellaz","Thomas Frauenfelder","Hanno Hoppe","Aristomenis Exadaktylos","Thomas Geiser","Manuela Funke-Chambour","Andreas Christe","Lukas Ebner","Stavroula Mougiakakou"],"pdf_url":"https://arxiv.org/pdf/2309.01740v2.pdf","comment":"Proceedings of the IEEE/CVF International Conference on Computer\n Vision (ICCV) Workshops 2023"},{"id":"http://arxiv.org/abs/2308.15605v3","updated":"2023-09-06T09:32:50Z","published":"2023-08-29T19:54:37Z","title":"Benchmarks for Detecting Measurement Tampering","summary":" When training powerful AI systems to perform complex tasks, it may be\nchallenging to provide training signals which are robust to optimization. One\nconcern is \\textit{measurement tampering}, where the AI system manipulates\nmultiple measurements to create the illusion of good results instead of\nachieving the desired outcome. In this work, we build four new text-based\ndatasets to evaluate measurement tampering detection techniques on large\nlanguage models. Concretely, given sets of text inputs and measurements aimed\nat determining if some outcome occurred, as well as a base model able to\naccurately predict measurements, the goal is to determine if examples where all\nmeasurements indicate the outcome occurred actually had the outcome occur, or\nif this was caused by measurement tampering. We demonstrate techniques that\noutperform simple baselines on most datasets, but don't achieve maximum\nperformance. We believe there is significant room for improvement for both\ntechniques and datasets, and we are excited for future work tackling\nmeasurement tampering.\n","authors":["Fabien Roger","Ryan Greenblatt","Max Nadeau","Buck Shlegeris","Nate Thomas"],"pdf_url":"https://arxiv.org/pdf/2308.15605v3.pdf","comment":"Edit: extended and improved appendices, fixed references"},{"id":"http://arxiv.org/abs/2309.02854v1","updated":"2023-09-06T09:31:17Z","published":"2023-09-06T09:31:17Z","title":"A Critical Review of Common Log Data Sets Used for Evaluation of\n Sequence-based Anomaly Detection Techniques","summary":" Log data store event execution patterns that correspond to underlying\nworkflows of systems or applications. While most logs are informative, log data\nalso include artifacts that indicate failures or incidents. Accordingly, log\ndata are often used to evaluate anomaly detection techniques that aim to\nautomatically disclose unexpected or otherwise relevant system behavior\npatterns. Recently, detection approaches leveraging deep learning have\nincreasingly focused on anomalies that manifest as changes of sequential\npatterns within otherwise normal event traces. Several publicly available data\nsets, such as HDFS, BGL, Thunderbird, OpenStack, and Hadoop, have since become\nstandards for evaluating these anomaly detection techniques, however, the\nappropriateness of these data sets has not been closely investigated in the\npast. In this paper we therefore analyze six publicly available log data sets\nwith focus on the manifestations of anomalies and simple techniques for their\ndetection. 
Our findings suggest that most anomalies are not directly related to\nsequential manifestations and that advanced detection techniques are not\nrequired to achieve high detection rates on these data sets.\n","authors":["Max Landauer","Florian Skopik","Markus Wurzenberger"],"pdf_url":"https://arxiv.org/pdf/2309.02854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09835v2","updated":"2023-09-06T09:30:53Z","published":"2023-04-19T17:37:58Z","title":"An XAI framework for robust and transparent data-driven wind turbine\n power curve models","summary":" Wind turbine power curve models translate ambient conditions into turbine\npower output. They are essential for energy yield prediction and turbine\nperformance monitoring. In recent years, increasingly complex machine learning\nmethods have become state-of-the-art for this task. Nevertheless, they\nfrequently encounter criticism due to their apparent lack of transparency,\nwhich raises concerns regarding their performance in non-stationary\nenvironments, such as those faced by wind turbines. We, therefore, introduce an\nexplainable artificial intelligence (XAI) framework to investigate and validate\nstrategies learned by data-driven power curve models from operational wind\nturbine data. With the help of simple, physics-informed baseline models it\nenables an automated evaluation of machine learning models beyond standard\nerror metrics. Alongside this novel tool, we present its efficacy for a more\ninformed model selection. We show, for instance, that learned strategies can be\nmeaningful indicators for a model's generalization ability in addition to test\nset errors, especially when only little data is available. Moreover, the\napproach facilitates an understanding of how decisions along the machine\nlearning pipeline, such as data selection, pre-processing, or training\nparameters, affect learned strategies. In a practical example, we demonstrate\nthe framework's utilisation to obtain more physically meaningful models, a\nprerequisite not only for robustness but also for insights into turbine\noperation by domain experts. The latter, we demonstrate in the context of wind\nturbine performance monitoring. Alongside this paper, we publish a Python\nimplementation of the presented framework and hope this can guide researchers\nand practitioners alike toward training, selecting and utilizing more\ntransparent and robust data-driven wind turbine power curve models.\n","authors":["Simon Letzgus","Klaus-Robert Müller"],"pdf_url":"https://arxiv.org/pdf/2304.09835v2.pdf","comment":"42 pages, 16 figures, revised version"},{"id":"http://arxiv.org/abs/2302.12598v2","updated":"2023-09-06T09:06:33Z","published":"2023-02-24T12:21:30Z","title":"Dynamic Graph Convolutional Network with Attention Fusion for Traffic\n Flow Prediction","summary":" Accurate and real-time traffic state prediction is of great practical\nimportance for urban traffic control and web mapping services. With the support\nof massive data, deep learning methods have shown their powerful capability in\ncapturing the complex spatialtemporal patterns of traffic networks. However,\nexisting approaches use pre-defined graphs and a simple set of spatial-temporal\ncomponents, making it difficult to model multi-scale spatial-temporal\ndependencies. In this paper, we propose a novel dynamic graph convolution\nnetwork with attention fusion to tackle this gap. 
The method first enhances the\ninteraction of temporal feature dimensions, and then it combines a dynamic\ngraph learner with GRU to jointly model synchronous spatial-temporal\ncorrelations. We also incorporate spatial-temporal attention modules to\neffectively capture longrange, multifaceted domain spatial-temporal patterns.\nWe conduct extensive experiments in four real-world traffic datasets to\ndemonstrate that our method surpasses state-of-the-art performance compared to\n18 baseline methods.\n","authors":["Xunlian Luo","Chunjiang Zhu","Detian Zhang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2302.12598v2.pdf","comment":"8 pages, 5 figure, accepted by ECAI 2023"},{"id":"http://arxiv.org/abs/2303.06053v4","updated":"2023-09-06T09:02:26Z","published":"2023-03-10T16:41:24Z","title":"TSMixer: An All-MLP Architecture for Time Series Forecasting","summary":" Real-world time-series datasets are often multivariate with complex dynamics.\nTo capture this complexity, high capacity architectures like recurrent- or\nattention-based sequential deep learning models have become popular. However,\nrecent work demonstrates that simple univariate linear models can outperform\nsuch deep learning models on several commonly used academic benchmarks.\nExtending them, in this paper, we investigate the capabilities of linear models\nfor time-series forecasting and present Time-Series Mixer (TSMixer), a novel\narchitecture designed by stacking multi-layer perceptrons (MLPs). TSMixer is\nbased on mixing operations along both the time and feature dimensions to\nextract information efficiently. On popular academic benchmarks, the\nsimple-to-implement TSMixer is comparable to specialized state-of-the-art\nmodels that leverage the inductive biases of specific benchmarks. On the\nchallenging and large scale M5 benchmark, a real-world retail dataset, TSMixer\ndemonstrates superior performance compared to the state-of-the-art\nalternatives. Our results underline the importance of efficiently utilizing\ncross-variate and auxiliary information for improving the performance of time\nseries forecasting. We present various analyses to shed light into the\ncapabilities of TSMixer. The design paradigms utilized in TSMixer are expected\nto open new horizons for deep learning-based time series forecasting. The\nimplementation is available at\nhttps://github.com/google-research/google-research/tree/master/tsmixer\n","authors":["Si-An Chen","Chun-Liang Li","Nate Yoder","Sercan O. Arik","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2303.06053v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02842v1","updated":"2023-09-06T08:59:34Z","published":"2023-09-06T08:59:34Z","title":"Random postprocessing for combinatorial Bayesian optimization","summary":" Model-based sequential approaches to discrete \"black-box\" optimization,\nincluding Bayesian optimization techniques, often access the same points\nmultiple times for a given objective function in interest, resulting in many\nsteps to find the global optimum. Here, we numerically study the effect of a\npostprocessing method on Bayesian optimization that strictly prohibits\nduplicated samples in the dataset. We find the postprocessing method\nsignificantly reduces the number of sequential steps to find the global\noptimum, especially when the acquisition function is of maximum a posterior\nestimation. 
Our results provide a simple but general strategy to solve the slow\nconvergence of Bayesian optimization for high-dimensional problems.\n","authors":["Keisuke Morita","Yoshihiko Nishikawa","Masayuki Ohzeki"],"pdf_url":"https://arxiv.org/pdf/2309.02842v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.18798v3","updated":"2023-09-06T08:55:51Z","published":"2023-05-30T07:19:25Z","title":"AnoOnly: Semi-Supervised Anomaly Detection with the Only Loss on\n Anomalies","summary":" Semi-supervised anomaly detection (SSAD) methods have demonstrated their\neffectiveness in enhancing unsupervised anomaly detection (UAD) by leveraging\nfew-shot but instructive abnormal instances. However, the dominance of\nhomogeneous normal data over anomalies biases the SSAD models against\neffectively perceiving anomalies. To address this issue and achieve balanced\nsupervision between heavily imbalanced normal and abnormal data, we develop a\nnovel framework called AnoOnly (Anomaly Only). Unlike existing SSAD methods\nthat resort to strict loss supervision, AnoOnly suspends it and introduces a\nform of weak supervision for normal data. This weak supervision is instantiated\nthrough the utilization of batch normalization, which implicitly performs\ncluster learning on normal data. When integrated into existing SSAD methods,\nthe proposed AnoOnly demonstrates remarkable performance enhancements across\nvarious models and datasets, achieving new state-of-the-art performance.\nAdditionally, our AnoOnly is natively robust to label noise when suffering from\ndata contamination. Our code is publicly available at\nhttps://github.com/cool-xuan/AnoOnly.\n","authors":["Yixuan Zhou","Peiyu Yang","Yi Qu","Xing Xu","Zhe Sun","Andrzej Cichocki"],"pdf_url":"https://arxiv.org/pdf/2305.18798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02836v1","updated":"2023-09-06T08:48:03Z","published":"2023-09-06T08:48:03Z","title":"BigVSAN: Enhancing GAN-based Neural Vocoders with Slicing Adversarial\n Network","summary":" Generative adversarial network (GAN)-based vocoders have been intensively\nstudied because they can synthesize high-fidelity audio waveforms faster than\nreal-time. However, it has been reported that most GANs fail to obtain the\noptimal projection for discriminating between real and fake data in the feature\nspace. In the literature, it has been demonstrated that slicing adversarial\nnetwork (SAN), an improved GAN training framework that can find the optimal\nprojection, is effective in the image generation task. In this paper, we\ninvestigate the effectiveness of SAN in the vocoding task. For this purpose, we\npropose a scheme to modify least-squares GAN, which most GAN-based vocoders\nadopt, so that their loss functions satisfy the requirements of SAN. Through\nour experiments, we demonstrate that SAN can improve the performance of\nGAN-based vocoders, including BigVGAN, with small modifications. 
Our code is\navailable at https://github.com/sony/bigvsan.\n","authors":["Takashi Shibuya","Yuhta Takida","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2309.02836v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2302.02662v3","updated":"2023-09-06T08:37:45Z","published":"2023-02-06T10:01:08Z","title":"Grounding Large Language Models in Interactive Environments with Online\n Reinforcement Learning","summary":" Recent works successfully leveraged Large Language Models' (LLM) abilities to\ncapture abstract knowledge about world's physics to solve decision-making\nproblems. Yet, the alignment between LLMs' knowledge and the environment can be\nwrong and limit functional competence due to lack of grounding. In this paper,\nwe study an approach (named GLAM) to achieve this alignment through functional\ngrounding: we consider an agent using an LLM as a policy that is progressively\nupdated as the agent interacts with the environment, leveraging online\nReinforcement Learning to improve its performance to solve goals. Using an\ninteractive textual environment designed to study higher-level forms of\nfunctional grounding, and a set of spatial and navigation tasks, we study\nseveral scientific questions: 1) Can LLMs boost sample efficiency for online\nlearning of various RL tasks? 2) How can it boost different forms of\ngeneralization? 3) What is the impact of online learning? We study these\nquestions by functionally grounding several variants (size, architecture) of\nFLAN-T5.\n","authors":["Thomas Carta","Clément Romac","Thomas Wolf","Sylvain Lamprier","Olivier Sigaud","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2302.02662v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16781v3","updated":"2023-09-06T08:31:38Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification for Low-resource Medication\n Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. Medication\nrecommendation, as a sub-domain, aims to amalgamate longitudinal patient\nhistory with medical knowledge, assisting physicians in prescribing safer and\nmore accurate medication combinations. Existing methods overlook the inherent\nlong-tail distribution in medical data, lacking balanced representation between\nhead and tail data, which leads to sub-optimal model performance. To address\nthis challenge, we introduce StratMed, a model that incorporates an innovative\nrelevance stratification mechanism. It harmonizes discrepancies in data\nlong-tail distribution and strikes a balance between the safety and accuracy of\nmedication combinations. Specifically, we first construct a pre-training method\nusing deep learning networks to obtain entity representation. After that, we\ndesign a pyramid-like data stratification method to obtain more generalized\nentity relationships by reinforcing the features of unpopular entities. Based\non this relationship, we designed two graph structures to express medication\nprecision and safety at the same level to obtain visit representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. 
Experiments on the\nMIMIC-III dataset demonstrate that our method has outperformed current\nstate-of-the-art methods in four evaluation metrics (including safety and\naccuracy).\n","authors":["Xiang Li","Shunpan Liang","Tengfei Ma","Yulei Hou"],"pdf_url":"https://arxiv.org/pdf/2308.16781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02820v1","updated":"2023-09-06T08:08:12Z","published":"2023-09-06T08:08:12Z","title":"Roulette: A Semantic Privacy-Preserving Device-Edge Collaborative\n Inference Framework for Deep Learning Classification Tasks","summary":" Deep learning classifiers are crucial in the age of artificial intelligence.\nThe device-edge-based collaborative inference has been widely adopted as an\nefficient framework for promoting its applications in IoT and 5G/6G networks.\nHowever, it suffers from accuracy degradation under non-i.i.d. data\ndistribution and privacy disclosure. For accuracy degradation, direct use of\ntransfer learning and split learning is high cost and privacy issues remain.\nFor privacy disclosure, cryptography-based approaches lead to a huge overhead.\nOther lightweight methods assume that the ground truth is non-sensitive and can\nbe exposed. But for many applications, the ground truth is the user's crucial\nprivacy-sensitive information. In this paper, we propose a framework of\nRoulette, which is a task-oriented semantic privacy-preserving collaborative\ninference framework for deep learning classifiers. More than input data, we\ntreat the ground truth of the data as private information. We develop a novel\nparadigm of split learning where the back-end DNN is frozen and the front-end\nDNN is retrained to be both a feature extractor and an encryptor. Moreover, we\nprovide a differential privacy guarantee and analyze the hardness of ground\ntruth inference attacks. To validate the proposed Roulette, we conduct\nextensive performance evaluations using realistic datasets, which demonstrate\nthat Roulette can effectively defend against various attacks and meanwhile\nachieve good model accuracy. In a situation where the non-i.i.d. is very\nsevere, Roulette improves the inference accuracy by 21\\% averaged over\nbenchmarks, while making the accuracy of discrimination attacks almost\nequivalent to random guessing.\n","authors":["Jingyi Li","Guocheng Liao","Lin Chen","Xu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.02820v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02818v1","updated":"2023-09-06T08:06:15Z","published":"2023-09-06T08:06:15Z","title":"Combining Thermodynamics-based Model of the Centrifugal Compressors and\n Active Machine Learning for Enhanced Industrial Design Optimization","summary":" The design process of centrifugal compressors requires applying an\noptimization process which is computationally expensive due to complex\nanalytical equations underlying the compressor's dynamical equations. Although\nthe regression surrogate models could drastically reduce the computational cost\nof such a process, the major challenge is the scarcity of data for training the\nsurrogate model. Aiming to strategically exploit the labeled samples, we\npropose the Active-CompDesign framework in which we combine a\nthermodynamics-based compressor model (i.e., our internal software for\ncompressor design) and Gaussian Process-based surrogate model within a\ndeployable Active Learning (AL) setting. 
We first conduct experiments in an\noffline setting and further extend it to an online AL framework where a\nreal-time interaction with the thermodynamics-based compressor's model allows\nthe deployment in production. Active-CompDesign shows a significant performance\nimprovement in surrogate modeling by leveraging an uncertainty-based query\nfunction for samples within the AL framework, compared with the random\nselection of data points. Moreover, our framework in production has reduced the\ntotal computational time of the compressor's design optimization, making it around 46%\nfaster than relying on the internal thermodynamics-based simulator while achieving\nthe same performance.\n","authors":["Shadi Ghiasi","Guido Pazzi","Concettina Del Grosso","Giovanni De Magistris","Giacomo Veneri"],"pdf_url":"https://arxiv.org/pdf/2309.02818v1.pdf","comment":"Accepted after peer-review at the 1st workshop on Synergy of\n Scientific and Machine Learning Modeling, SynS & ML ICML, Honolulu, Hawaii,\n USA. July, 2023. Copyright 2023 by the author(s)"},{"id":"http://arxiv.org/abs/2308.09444v2","updated":"2023-09-06T07:53:46Z","published":"2023-08-18T10:17:59Z","title":"An Efficient 1 Iteration Learning Algorithm for Gaussian Mixture Model\n And Gaussian Mixture Embedding For Neural Network","summary":" We propose a Gaussian Mixture Model (GMM) learning algorithm based on our\nprevious work on the GMM expansion idea. The new algorithm brings more robustness\nand simplicity than the classic Expectation Maximization (EM) algorithm. It also\nimproves accuracy and requires only one iteration for learning. We theoretically\nprove that this new algorithm is guaranteed to converge regardless of the\nparameter initialisation. Comparing our GMM expansion method with classic\nprobability layers in neural networks shows a demonstrably better capability to\novercome data uncertainty and inverse problems. Finally, we test a GMM-based\ngenerator, which shows potential for building further applications that utilize\ndistribution random sampling for stochastic variation as well as\nvariation control.\n","authors":["Weiguo Lu","Xuan Wu","Deng Ding","Gangnan Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.09444v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07056v4","updated":"2023-09-06T07:53:42Z","published":"2023-06-12T12:05:54Z","title":"Kernel Random Projection Depth for Outlier Detection","summary":" This paper proposes an extension of Random Projection Depth (RPD) to cope\nwith multiple modalities and non-convexity on data clouds. In the framework of\nthe proposed method, the RPD is computed in a reproducing kernel Hilbert space.\nWith the help of kernel principal component analysis, we expect that the\nproposed method can cope with the above multiple modalities and non-convexity.\nThe experimental results demonstrate that the proposed method outperforms RPD\nand is comparable to other existing detection models on benchmark datasets\nregarding Area Under the Curves (AUCs) of Receiver Operating Characteristic\n(ROC).\n","authors":["Akira Tamamori"],"pdf_url":"https://arxiv.org/pdf/2306.07056v4.pdf","comment":"Accepted to APSIPA ASC 2023"},{"id":"http://arxiv.org/abs/2309.02805v1","updated":"2023-09-06T07:48:22Z","published":"2023-09-06T07:48:22Z","title":"Introducing Thermodynamics-Informed Symbolic Regression -- A Tool for\n Thermodynamic Equations of State Development","summary":" Thermodynamic equations of state (EOS) are essential for many industries as\nwell as in academia. 
Even leaving aside the expensive and extensive measurement\ncampaigns required for the data acquisition, the development of EOS is an\nintensely time-consuming process, which does often still heavily rely on expert\nknowledge and iterative fine-tuning. To improve upon and accelerate the EOS\ndevelopment process, we introduce thermodynamics-informed symbolic regression\n(TiSR), a symbolic regression (SR) tool aimed at thermodynamic EOS modeling.\nTiSR is already a capable SR tool, which was used in the research of\nhttps://doi.org/10.1007/s10765-023-03197-z. It aims to combine an SR base with\nthe extensions required to work with often strongly scattered experimental\ndata, different residual pre- and post-processing options, and additional\nfeatures required to consider thermodynamic EOS development. Although TiSR is\nnot ready for end users yet, this paper is intended to report on its current\nstate, showcase the progress, and discuss (distant and not so distant) future\ndirections. TiSR is available at https://github.com/scoop-group/TiSR and can be\ncited as https://doi.org/10.5281/zenodo.8317547.\n","authors":["Viktor Martinek","Ophelia Frotscher","Markus Richter","Roland Herzog"],"pdf_url":"https://arxiv.org/pdf/2309.02805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.03422v3","updated":"2023-09-06T07:29:31Z","published":"2021-11-05T11:50:07Z","title":"Transferable Time-Series Forecasting under Causal Conditional Shift","summary":" This paper focuses on the problem of semi-supervised domain adaptation for\ntime-series forecasting, which is underexplored in literatures, despite being\noften encountered in practice. Existing methods on time-series domain\nadaptation mainly follow the paradigm designed for the static data, which\ncannot handle domain-specific complex conditional dependencies raised by data\noffset, time lags, and variant data distributions. In order to address these\nchallenges, we analyze variational conditional dependencies in time-series data\nand find that the causal structures are usually stable among domains, and\nfurther raise the causal conditional shift assumption. Enlightened by this\nassumption, we consider the causal generation process for time-series data and\npropose an end-to-end model for the semi-supervised domain adaptation problem\non time-series forecasting. Our method can not only discover the Granger-Causal\nstructures among cross-domain data but also address the cross-domain\ntime-series forecasting problem with accurate and interpretable predicted\nresults. We further theoretically analyze the superiority of the proposed\nmethod, where the generalization error on the target domain is bounded by the\nempirical risks and by the discrepancy between the causal structures from\ndifferent domains. Experimental results on both synthetic and real data\ndemonstrate the effectiveness of our method for the semi-supervised domain\nadaptation method on time-series forecasting.\n","authors":["Zijian Li","Ruichu Cai","Tom Z. 
J Fu","Zhifeng Hao","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2111.03422v3.pdf","comment":"TPAMI2023 Accepted"},{"id":"http://arxiv.org/abs/2309.01778v2","updated":"2023-09-06T07:27:55Z","published":"2023-09-04T19:39:21Z","title":"CONFIDERAI: a novel CONFormal Interpretable-by-Design score function for\n Explainable and Reliable Artificial Intelligence","summary":" Everyday life is increasingly influenced by artificial intelligence, and\nthere is no question that machine learning algorithms must be designed to be\nreliable and trustworthy for everyone. Specifically, computer scientists\nconsider an artificial intelligence system safe and trustworthy if it fulfills\nfive pillars: explainability, robustness, transparency, fairness, and privacy.\nIn addition to these five, we propose a sixth fundamental aspect: conformity,\nthat is, the probabilistic assurance that the system will behave as the machine\nlearner expects. In this paper, we propose a methodology to link conformal\nprediction with explainable machine learning by defining CONFIDERAI, a new\nscore function for rule-based models that leverages both rules predictive\nability and points geometrical position within rules boundaries. We also\naddress the problem of defining regions in the feature space where conformal\nguarantees are satisfied by exploiting techniques to control the number of\nnon-conformal samples in conformal regions based on support vector data\ndescription (SVDD). The overall methodology is tested with promising results on\nbenchmark and real datasets, such as DNS tunneling detection or cardiovascular\ndisease prediction.\n","authors":["Alberto Carlevaro","Sara Narteni","Fabrizio Dabbene","Marco Muselli","Maurizio Mongelli"],"pdf_url":"https://arxiv.org/pdf/2309.01778v2.pdf","comment":"12 pages, 7 figures, 1 algorithm, international journal"},{"id":"http://arxiv.org/abs/2309.02787v1","updated":"2023-09-06T07:04:37Z","published":"2023-09-06T07:04:37Z","title":"Dynamic Encoding and Decoding of Information for Split Learning in\n Mobile-Edge Computing: Leveraging Information Bottleneck Theory","summary":" Split learning is a privacy-preserving distributed learning paradigm in which\nan ML model (e.g., a neural network) is split into two parts (i.e., an encoder\nand a decoder). The encoder shares so-called latent representation, rather than\nraw data, for model training. In mobile-edge computing, network functions (such\nas traffic forecasting) can be trained via split learning where an encoder\nresides in a user equipment (UE) and a decoder resides in the edge network.\nBased on the data processing inequality and the information bottleneck (IB)\ntheory, we present a new framework and training mechanism to enable a dynamic\nbalancing of the transmission resource consumption with the informativeness of\nthe shared latent representations, which directly impacts the predictive\nperformance. The proposed training mechanism offers an encoder-decoder neural\nnetwork architecture featuring multiple modes of complexity-relevance\ntradeoffs, enabling tunable performance. The adaptability can accommodate\nvarying real-time network conditions and application requirements, potentially\nreducing operational expenditure and enhancing network agility. As a proof of\nconcept, we apply the training mechanism to a millimeter-wave (mmWave)-enabled\nthroughput prediction problem. We also offer new insights and highlight some\nchallenges related to recurrent neural networks from the perspective of the IB\ntheory. 
Interestingly, we find a compression phenomenon across the temporal\ndomain of the sequential model, in addition to the compression phase that\noccurs with the number of training epochs.\n","authors":["Omar Alhussein","Moshi Wei","Arashmid Akhavain"],"pdf_url":"https://arxiv.org/pdf/2309.02787v1.pdf","comment":"Accepted to Proc. IEEE Globecom 2023"},{"id":"http://arxiv.org/abs/2206.10991v5","updated":"2023-09-06T06:58:11Z","published":"2022-06-22T11:45:36Z","title":"Understanding convolution on graphs via energies","summary":" Graph Neural Networks (GNNs) typically operate by message-passing, where the\nstate of a node is updated based on the information received from its\nneighbours. Most message-passing models act as graph convolutions, where\nfeatures are mixed by a shared, linear transformation before being propagated\nover the edges. On node-classification tasks, graph convolutions have been\nshown to suffer from two limitations: poor performance on heterophilic graphs,\nand over-smoothing. It is common belief that both phenomena occur because such\nmodels behave as low-pass filters, meaning that the Dirichlet energy of the\nfeatures decreases along the layers incurring a smoothing effect that\nultimately makes features no longer distinguishable. In this work, we\nrigorously prove that simple graph-convolutional models can actually enhance\nhigh frequencies and even lead to an asymptotic behaviour we refer to as\nover-sharpening, opposite to over-smoothing. We do so by showing that linear\ngraph convolutions with symmetric weights minimize a multi-particle energy that\ngeneralizes the Dirichlet energy; in this setting, the weight matrices induce\nedge-wise attraction (repulsion) through their positive (negative) eigenvalues,\nthereby controlling whether the features are being smoothed or sharpened. We\nalso extend the analysis to non-linear GNNs, and demonstrate that some existing\ntime-continuous GNNs are instead always dominated by the low frequencies.\nFinally, we validate our theoretical findings through ablations and real-world\nexperiments.\n","authors":["Francesco Di Giovanni","James Rowbottom","Benjamin P. Chamberlain","Thomas Markovich","Michael M. Bronstein"],"pdf_url":"https://arxiv.org/pdf/2206.10991v5.pdf","comment":"Accepted at TMLR; First two authors equal contribution; 35 pages"},{"id":"http://arxiv.org/abs/2309.02785v1","updated":"2023-09-06T06:53:45Z","published":"2023-09-06T06:53:45Z","title":"CVE-driven Attack Technique Prediction with Semantic Information\n Extraction and a Domain-specific Language Model","summary":" This paper addresses a critical challenge in cybersecurity: the gap between\nvulnerability information represented by Common Vulnerabilities and Exposures\n(CVEs) and the resulting cyberattack actions. CVEs provide insights into\nvulnerabilities, but often lack details on potential threat actions (tactics,\ntechniques, and procedures, or TTPs) within the ATT&CK framework. This gap\nhinders accurate CVE categorization and proactive countermeasure initiation.\nThe paper introduces the TTPpredictor tool, which uses innovative techniques to\nanalyze CVE descriptions and infer plausible TTP attacks resulting from CVE\nexploitation. TTPpredictor overcomes challenges posed by limited labeled data\nand semantic disparities between CVE and TTP descriptions. It initially\nextracts threat actions from unstructured cyber threat reports using Semantic\nRole Labeling (SRL) techniques. 
These actions, along with their contextual\nattributes, are correlated with MITRE's attack functionality classes. This\nautomated correlation facilitates the creation of labeled data, essential for\ncategorizing novel threat actions into threat functionality classes and TTPs.\nThe paper presents an empirical assessment, demonstrating TTPpredictor's\neffectiveness with accuracy rates of approximately 98% and F1-scores ranging\nfrom 95% to 98% in precise CVE classification to ATT&CK techniques.\nTTPpredictor outperforms state-of-the-art language model tools like ChatGPT.\nOverall, this paper offers a robust solution for linking CVEs to potential\nattack techniques, enhancing cybersecurity practitioners' ability to\nproactively identify and mitigate threats.\n","authors":["Ehsan Aghaei","Ehab Al-Shaer"],"pdf_url":"https://arxiv.org/pdf/2309.02785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02784v1","updated":"2023-09-06T06:51:15Z","published":"2023-09-06T06:51:15Z","title":"Norm Tweaking: High-performance Low-bit Quantization of Large Language\n Models","summary":" As the size of large language models (LLMs) continues to grow, model\ncompression without sacrificing accuracy has become a crucial challenge for\ndeployment. While some quantization methods, such as GPTQ, have made progress\nin achieving acceptable 4-bit weight-only quantization, attempts at lower bit\nquantization often result in severe performance degradation. In this paper, we\nintroduce a technique called norm tweaking, which can be used as a plugin in\ncurrent PTQ methods to achieve high precision while being cost-efficient. Our\napproach is inspired by the observation that rectifying the quantized\nactivation distribution to match its float counterpart can readily restore\naccuracy for LLMs. To achieve this, we carefully design a tweaking strategy\nthat includes calibration data generation and channel-wise distance constraint\nto update the weights of normalization layers for better generalization. We\nconduct extensive experiments on various datasets using several open-sourced\nLLMs. Our method demonstrates significant improvements in both weight-only\nquantization and joint quantization of weights and activations, surpassing\nexisting PTQ methods. On GLM-130B and OPT-66B, our method even achieves the\nsame level of accuracy at 2-bit quantization as their float ones. Our simple\nand effective approach makes it more practical for real-world applications.\n","authors":["Liang Li","Qingyuan Li","Bo Zhang","Xiangxiang Chu"],"pdf_url":"https://arxiv.org/pdf/2309.02784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02783v1","updated":"2023-09-06T06:49:31Z","published":"2023-09-06T06:49:31Z","title":"Improving diagnosis and prognosis of lung cancer using vision\n transformers: A scoping review","summary":" Vision transformer-based methods are advancing the field of medical\nartificial intelligence and cancer imaging, including lung cancer applications.\nRecently, many researchers have developed vision transformer-based AI methods\nfor lung cancer diagnosis and prognosis. This scoping review aims to identify\nthe recent developments on vision transformer-based AI methods for lung cancer\nimaging applications. It provides key insights into how vision transformers\ncomplemented the performance of AI and deep learning methods for lung cancer.\nFurthermore, the review also identifies the datasets that contributed to\nadvancing the field. 
Of the 314 retrieved studies, this review included 34\nstudies published from 2020 to 2022. The most commonly addressed task in these\nstudies was the classification of lung cancer types, such as lung squamous cell\ncarcinoma versus lung adenocarcinoma, and identifying benign versus malignant\npulmonary nodules. Other applications included survival prediction of lung\ncancer patients and segmentation of lungs. The studies lacked clear strategies\nfor clinical transformation. SWIN transformer was a popular choice of the\nresearchers; however, many other architectures were also reported where vision\ntransformer was combined with convolutional neural networks or UNet model. It\ncan be concluded that vision transformer-based models are increasingly in\npopularity for developing AI methods for lung cancer applications. However,\ntheir computational complexity and clinical relevance are important factors to\nbe considered for future research work. This review provides valuable insights\nfor researchers in the field of AI and healthcare to advance the\nstate-of-the-art in lung cancer diagnosis and prognosis. We provide an\ninteractive dashboard on lung-cancer.onrender.com/.\n","authors":["Hazrat Ali","Farida Mohsen","Zubair Shah"],"pdf_url":"https://arxiv.org/pdf/2309.02783v1.pdf","comment":"submitted to BMC Medical Imaging journal"},{"id":"http://arxiv.org/abs/2212.02055v3","updated":"2023-09-06T06:46:13Z","published":"2022-12-05T06:31:31Z","title":"Graph Convolutional Neural Networks with Diverse Negative Samples via\n Decomposed Determinant Point Processes","summary":" Graph convolutional networks (GCNs) have achieved great success in graph\nrepresentation learning by extracting high-level features from nodes and their\ntopology. Since GCNs generally follow a message-passing mechanism, each node\naggregates information from its first-order neighbour to update its\nrepresentation. As a result, the representations of nodes with edges between\nthem should be positively correlated and thus can be considered positive\nsamples. However, there are more non-neighbour nodes in the whole graph, which\nprovide diverse and useful information for the representation update. Two\nnon-adjacent nodes usually have different representations, which can be seen as\nnegative samples. Besides the node representations, the structural information\nof the graph is also crucial for learning. In this paper, we used\nquality-diversity decomposition in determinant point processes (DPP) to obtain\ndiverse negative samples. When defining a distribution on diverse subsets of\nall non-neighbouring nodes, we incorporate both graph structure information and\nnode representations. Since the DPP sampling process requires matrix eigenvalue\ndecomposition, we propose a new shortest-path-base method to improve\ncomputational efficiency. Finally, we incorporate the obtained negative samples\ninto the graph convolution operation. The ideas are evaluated empirically in\nexperiments on node classification tasks. These experiments show that the newly\nproposed methods not only improve the overall performance of standard\nrepresentation learning but also significantly alleviate over-smoothing\nproblems.\n","authors":["Wei Duan","Junyu Xuan","Maoying Qiao","Jie Lu"],"pdf_url":"https://arxiv.org/pdf/2212.02055v3.pdf","comment":"Accepted by IEEE TNNLS on 30-Aug-2023. 
arXiv admin note: text overlap\n with arXiv:2210.00728"},{"id":"http://arxiv.org/abs/2309.02771v1","updated":"2023-09-06T06:26:21Z","published":"2023-09-06T06:26:21Z","title":"On the Effects of Heterogeneous Errors on Multi-fidelity Bayesian\n Optimization","summary":" Bayesian optimization (BO) is a sequential optimization strategy that is\nincreasingly employed in a wide range of areas including materials design. In\nreal world applications, acquiring high-fidelity (HF) data through physical\nexperiments or HF simulations is the major cost component of BO. To alleviate\nthis bottleneck, multi-fidelity (MF) methods are used to forgo the sole\nreliance on the expensive HF data and reduce the sampling costs by querying\ninexpensive low-fidelity (LF) sources whose data are correlated with HF\nsamples. However, existing multi-fidelity BO (MFBO) methods operate under the\nfollowing two assumptions that rarely hold in practical applications: (1) LF\nsources provide data that are well correlated with the HF data on a global\nscale, and (2) a single random process can model the noise in the fused data.\nThese assumptions dramatically reduce the performance of MFBO when LF sources\nare only locally correlated with the HF source or when the noise variance\nvaries across the data sources. In this paper, we dispense with these incorrect\nassumptions by proposing an MF emulation method that (1) learns a noise model\nfor each data source, and (2) enables MFBO to leverage highly biased LF sources\nwhich are only locally correlated with the HF source. We illustrate the\nperformance of our method through analytical examples and engineering problems\non materials design.\n","authors":["Zahra Zanjani Foumani","Amin Yousefpour","Mehdi Shishehbor","Ramin Bostanabad"],"pdf_url":"https://arxiv.org/pdf/2309.02771v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02769v1","updated":"2023-09-06T06:22:18Z","published":"2023-09-06T06:22:18Z","title":"Unifying over-smoothing and over-squashing in graph neural networks: A\n physics informed approach and beyond","summary":" Graph Neural Networks (GNNs) have emerged as one of the leading approaches\nfor machine learning on graph-structured data. Despite their great success,\ncritical computational challenges such as over-smoothing, over-squashing, and\nlimited expressive power continue to impact the performance of GNNs. In this\nstudy, inspired from the time-reversal principle commonly utilized in classical\nand quantum physics, we reverse the time direction of the graph heat equation.\nThe resulted reversing process yields a class of high pass filtering functions\nthat enhance the sharpness of graph node features. Leveraging this concept, we\nintroduce the Multi-Scaled Heat Kernel based GNN (MHKG) by amalgamating diverse\nfiltering functions' effects on node features. To explore more flexible\nfiltering conditions, we further generalize MHKG into a model termed G-MHKG and\nthoroughly show the roles of each element in controlling over-smoothing,\nover-squashing and expressive power. Notably, we illustrate that all\naforementioned issues can be characterized and analyzed via the properties of\nthe filtering functions, and uncover a trade-off between over-smoothing and\nover-squashing: enhancing node feature sharpness will make model suffer more\nfrom over-squashing, and vice versa. Furthermore, we manipulate the time again\nto show how G-MHKG can handle both two issues under mild conditions. 
Our\nconclusive experiments highlight the effectiveness of the proposed models, which\nsurpass several GNN baseline models in performance across graph datasets\ncharacterized by both homophily and heterophily.\n","authors":["Zhiqi Shao","Dai Shi","Andi Han","Yi Guo","Qibin Zhao","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2309.02769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02762v1","updated":"2023-09-06T06:20:12Z","published":"2023-09-06T06:20:12Z","title":"Towards Unsupervised Graph Completion Learning on Graphs with Features\n and Structure Missing","summary":" In recent years, graph neural networks (GNNs) have achieved significant\nprogress in a variety of graph analytical tasks. Nevertheless, GNNs'\nsuperior performance suffers serious damage when the collected node\nfeatures or structure relationships are partially missing owing to numerous\nunpredictable factors. The recently emerged graph completion learning (GCL) has\nreceived increasing attention; it aims to reconstruct the missing node\nfeatures or structure relationships under the guidance of a specific\nsupervised task. Although these GCL methods have achieved great success,\nthey still suffer from the following problems: reliance on labels and bias in\nthe reconstructed node features and structure relationships. Besides, the\ngeneralization ability of existing GCL still faces a huge challenge when\nboth the collected node features and structure relationships are partially missing\nat the same time. To solve the above issues, we propose a more general GCL\nframework with the aid of self-supervised learning for improving the task\nperformance of existing GNN variants on graphs with features and structure\nmissing, termed unsupervised GCL (UGCL). Specifically, to avoid the mismatch\nbetween missing node features and structure during the message-passing process\nof GNNs, we separate feature reconstruction from structure reconstruction and\ndesign a personalized model for each in turn. Then, a dual contrastive loss on the\nstructure level and feature level is introduced to maximize the mutual\ninformation of node representations from the feature reconstruction and structure\nreconstruction paths to provide more supervision signals. Finally, the\nreconstructed node features and structure can be applied to the downstream node\nclassification task. Extensive experiments on eight datasets, three GNN\nvariants and five missing rates demonstrate the effectiveness of our proposed\nmethod.\n","authors":["Sichao Fu","Qinmu Peng","Yang He","Baokun Du","Xinge You"],"pdf_url":"https://arxiv.org/pdf/2309.02762v1.pdf","comment":"Accepted by 23rd IEEE International Conference on Data Mining (ICDM\n 2023)"},{"id":"http://arxiv.org/abs/2309.02752v1","updated":"2023-09-06T06:17:35Z","published":"2023-09-06T06:17:35Z","title":"SWAP: Exploiting Second-Ranked Logits for Adversarial Attacks on Time\n Series","summary":" Time series classification (TSC) has emerged as a critical task in various\ndomains, and deep neural models have shown superior performance in TSC tasks.\nHowever, these models are vulnerable to adversarial attacks, where subtle\nperturbations can significantly impact the prediction results. Existing\nadversarial methods often suffer from over-parameterization or random logit\nperturbation, hindering their effectiveness. Additionally, increasing the\nattack success rate (ASR) typically involves generating more noise, making the\nattack more easily detectable. 
To address these limitations, we propose SWAP, a\nnovel attacking method for TSC models. SWAP focuses on enhancing the confidence\nof the second-ranked logits while minimizing the manipulation of other logits.\nThis is achieved by minimizing the Kullback-Leibler divergence between the\ntarget logit distribution and the predictive logit distribution. Experimental\nresults demonstrate that SWAP achieves state-of-the-art performance, with an\nASR exceeding 50% and an 18% increase compared to existing methods.\n","authors":["Chang George Dong","Liangwei Nathan Zheng","Weitong Chen","Wei Emma Zhang","Lin Yue"],"pdf_url":"https://arxiv.org/pdf/2309.02752v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.00855v2","updated":"2023-09-06T06:13:58Z","published":"2023-09-02T08:01:32Z","title":"DoRA: Domain-Based Self-Supervised Learning Framework for Low-Resource\n Real Estate Appraisal","summary":" The marketplace system connecting demands and supplies has been explored to\ndevelop unbiased decision-making in valuing properties. Real estate appraisal\nserves as one of the high-cost property valuation tasks for financial\ninstitutions since it requires domain experts to appraise the estimation based\non the corresponding knowledge and the judgment of the market. Existing\nautomated valuation models reducing the subjectivity of domain experts require\na large number of transactions for effective evaluation, which is predominantly\nlimited to not only the labeling efforts of transactions but also the\ngeneralizability of new developing and rural areas. To learn representations\nfrom unlabeled real estate sets, existing self-supervised learning (SSL) for\ntabular data neglects various important features, and fails to incorporate\ndomain knowledge. In this paper, we propose DoRA, a Domain-based\nself-supervised learning framework for low-resource Real estate Appraisal. DoRA\nis pre-trained with an intra-sample geographic prediction as the pretext task\nbased on the metadata of the real estate for equipping the real estate\nrepresentations with prior domain knowledge. Furthermore, inter-sample\ncontrastive learning is employed to generalize the representations to be robust\nfor limited transactions of downstream tasks. Our benchmark results on three\nproperty types of real-world transactions show that DoRA significantly\noutperforms the SSL baselines for tabular data, the graph-based methods, and\nthe supervised approaches in the few-shot scenarios by at least 7.6% for MAPE,\n11.59% for MAE, and 3.34% for HR10%. We expect DoRA to be useful to other\nfinancial practitioners with similar marketplace applications who need general\nmodels for properties that are newly built and have limited records. The source\ncode is available at https://github.com/wwweiwei/DoRA.\n","authors":["Wei-Wei Du","Wei-Yao Wang","Wen-Chih Peng"],"pdf_url":"https://arxiv.org/pdf/2309.00855v2.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2302.03213v2","updated":"2023-09-06T05:53:44Z","published":"2023-02-07T02:51:10Z","title":"LUT-NN: Empower Efficient Neural Network Inference with Centroid\n Learning and Table Lookup","summary":" On-device Deep Neural Network (DNN) inference consumes significant computing\nresources and development efforts. 
To alleviate that, we propose LUT-NN, the\nfirst system to empower inference by table lookup, to reduce inference cost.\nLUT-NN learns the typical features for each operator, named centroids, and\nprecomputes the results for these centroids to store in lookup tables. During\ninference, the results of the centroids closest to the inputs can be read\ndirectly from the table as the approximated outputs, without computation.\nLUT-NN integrates two major novel techniques: (1) differentiable centroid\nlearning through backpropagation, which adapts three levels of approximation to\nminimize the accuracy impact of centroids; (2) table lookup inference\nexecution, which comprehensively considers different levels of parallelism,\nmemory access reduction, and dedicated hardware units for optimal performance.\nLUT-NN is evaluated on multiple real tasks, covering image and speech\nrecognition, and natural language processing. Compared to related work, LUT-NN\nimproves accuracy by 66% to 92%, achieving a similar level to the original\nmodels. LUT-NN reduces the cost across all dimensions, including FLOPs ($\\leq$\n16x), model size ($\\leq$ 7x), latency ($\\leq$ 6.8x), memory ($\\leq$ 6.5x), and\npower ($\\leq$ 41.7%).\n","authors":["Xiaohu Tang","Yang Wang","Ting Cao","Li Lyna Zhang","Qi Chen","Deng Cai","Yunxin Liu","Mao Yang"],"pdf_url":"https://arxiv.org/pdf/2302.03213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02724v1","updated":"2023-09-06T05:18:43Z","published":"2023-09-06T05:18:43Z","title":"Offensive Hebrew Corpus and Detection using BERT","summary":" Offensive language detection has been well studied in many languages, but it\nis lagging behind in low-resource languages, such as Hebrew. In this paper, we\npresent a new offensive language corpus in Hebrew. A total of 15,881 tweets\nwere retrieved from Twitter. Each was labeled with one or more of five classes\n(abusive, hate, violence, pornographic, or not offensive) by Arabic-Hebrew\nbilingual speakers. The annotation process was challenging as each annotator is\nexpected to be familiar with Israeli culture, politics, and practices to\nunderstand the context of each tweet. We fine-tuned two Hebrew BERT models,\nHeBERT and AlephBERT, using our proposed dataset and another published dataset.\nWe observed that our data boosts HeBERT performance by 2% when combined with\nD_OLaH. Fine-tuning AlephBERT on our data and testing on D_OLaH yields 69%\naccuracy, while fine-tuning on D_OLaH and testing on our data yields 57%\naccuracy, which may be an indication of the generalizability our data offers.\nOur dataset and fine-tuned models are available on GitHub and Huggingface.\n","authors":["Nagham Hamad","Mustafa Jarrar","Mohammad Khalilia","Nadim Nashif"],"pdf_url":"https://arxiv.org/pdf/2309.02724v1.pdf","comment":"8 pages, 1 figure, The 20th ACS/IEEE International Conference on\n Computer Systems and Applications (AICCSA)"},{"id":"http://arxiv.org/abs/2303.07647v4","updated":"2023-09-06T05:06:22Z","published":"2023-03-14T06:15:17Z","title":"Recent Advances and Applications of Machine Learning in Experimental\n Solid Mechanics: A Review","summary":" For many decades, experimental solid mechanics has played a crucial role in\ncharacterizing and understanding the mechanical properties of natural and novel\nmaterials. Recent advances in machine learning (ML) provide new opportunities\nfor the field, including experimental design, data analysis, uncertainty\nquantification, and inverse problems. 
As the number of papers published in\nrecent years in this emerging field is exploding, it is timely to conduct a\ncomprehensive and up-to-date review of recent ML applications in experimental\nsolid mechanics. Here, we first provide an overview of common ML algorithms and\nterminologies that are pertinent to this review, with emphasis placed on\nphysics-informed and physics-based ML methods. Then, we provide thorough\ncoverage of recent ML applications in traditional and emerging areas of\nexperimental mechanics, including fracture mechanics, biomechanics, nano- and\nmicro-mechanics, architected materials, and 2D material. Finally, we highlight\nsome current challenges of applying ML to multi-modality and multi-fidelity\nexperimental datasets and propose several future research directions. This\nreview aims to provide valuable insights into the use of ML methods as well as\na variety of examples for researchers in solid mechanics to integrate into\ntheir experiments.\n","authors":["Hanxun Jin","Enrui Zhang","Horacio D. Espinosa"],"pdf_url":"https://arxiv.org/pdf/2303.07647v4.pdf","comment":"93 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.02712v1","updated":"2023-09-06T04:50:39Z","published":"2023-09-06T04:50:39Z","title":"Unveiling the frontiers of deep learning: innovations shaping diverse\n domains","summary":" Deep learning (DL) enables the development of computer models that are\ncapable of learning, visualizing, optimizing, refining, and predicting data. In\nrecent years, DL has been applied in a range of fields, including audio-visual\ndata processing, agriculture, transportation prediction, natural language,\nbiomedicine, disaster management, bioinformatics, drug design, genomics, face\nrecognition, and ecology. To explore the current state of deep learning, it is\nnecessary to investigate the latest developments and applications of deep\nlearning in these disciplines. However, the literature is lacking in exploring\nthe applications of deep learning in all potential sectors. This paper thus\nextensively investigates the potential applications of deep learning across all\nmajor fields of study as well as the associated benefits and challenges. As\nevidenced in the literature, DL exhibits accuracy in prediction and analysis,\nmakes it a powerful computational tool, and has the ability to articulate\nitself and optimize, making it effective in processing data with no prior\ntraining. Given its independence from training data, deep learning necessitates\nmassive amounts of data for effective analysis and processing, much like data\nvolume. To handle the challenge of compiling huge amounts of medical,\nscientific, healthcare, and environmental data for use in deep learning, gated\narchitectures like LSTMs and GRUs can be utilized. For multimodal learning,\nshared neurons in the neural network for all activities and specialized neurons\nfor particular tasks are necessary.\n","authors":["Shams Forruque Ahmed","Md. Sakib Bin Alam","Maliha Kabir","Shaila Afrin","Sabiha Jannat Rafa","Aanushka Mehjabin","Amir H. Gandomi"],"pdf_url":"https://arxiv.org/pdf/2309.02712v1.pdf","comment":"64 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.02711v1","updated":"2023-09-06T04:47:46Z","published":"2023-09-06T04:47:46Z","title":"Addressing Imperfect Symmetry: a Novel Symmetry-Learning Actor-Critic\n Extension","summary":" Symmetry, a fundamental concept to understand our environment, often\noversimplifies reality from a mathematical perspective. 
Humans are a prime\nexample, deviating from perfect symmetry in terms of appearance and cognitive\nbiases (e.g. having a dominant hand). Nevertheless, our brain can easily\novercome these imperfections and efficiently adapt to symmetrical tasks. The\ndriving motivation behind this work lies in capturing this ability through\nreinforcement learning. To this end, we introduce Adaptive Symmetry Learning\n(ASL) $\\unicode{x2013}$ a model-minimization actor-critic extension that\naddresses incomplete or inexact symmetry descriptions by adapting itself during\nthe learning process. ASL consists of a symmetry fitting component and a\nmodular loss function that enforces a common symmetric relation across all\nstates while adapting to the learned policy. The performance of ASL is compared\nto existing symmetry-enhanced methods in a case study involving a four-legged\nant model for multidirectional locomotion tasks. The results demonstrate that\nASL is capable of recovering from large perturbations and generalizing\nknowledge to hidden symmetric states. It achieves comparable or better\nperformance than alternative methods in most scenarios, making it a valuable\napproach for leveraging model symmetry while compensating for inherent\nperturbations.\n","authors":["Miguel Abreu","Luis Paulo Reis","Nuno Lau"],"pdf_url":"https://arxiv.org/pdf/2309.02711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02710v1","updated":"2023-09-06T04:46:01Z","published":"2023-09-06T04:46:01Z","title":"Improved Outlier Robust Seeding for k-means","summary":" The $k$-means is a popular clustering objective, although it is inherently\nnon-robust and sensitive to outliers. Its popular seeding or initialization\ncalled $k$-means++ uses $D^{2}$ sampling and comes with a provable $O(\\log k)$\napproximation guarantee \\cite{AV2007}. However, in the presence of adversarial\nnoise or outliers, $D^{2}$ sampling is more likely to pick centers from distant\noutliers instead of inlier clusters, and therefore its approximation guarantees\n\\textit{w.r.t.} $k$-means solution on inliers, does not hold.\n Assuming that the outliers constitute a constant fraction of the given data,\nwe propose a simple variant in the $D^2$ sampling distribution, which makes it\nrobust to the outliers. Our algorithm runs in $O(ndk)$ time, outputs $O(k)$\nclusters, discards marginally more points than the optimal number of outliers,\nand comes with a provable $O(1)$ approximation guarantee.\n Our algorithm can also be modified to output exactly $k$ clusters instead of\n$O(k)$ clusters, while keeping its running time linear in $n$ and $d$. This is\nan improvement over previous results for robust $k$-means based on LP\nrelaxation and rounding \\cite{Charikar}, \\cite{KrishnaswamyLS18} and\n\\textit{robust $k$-means++} \\cite{DeshpandeKP20}. Our empirical results show\nthe advantage of our algorithm over $k$-means++~\\cite{AV2007}, uniform random\nseeding, greedy sampling for $k$ means~\\cite{tkmeanspp}, and robust\n$k$-means++~\\cite{DeshpandeKP20}, on standard real-world and synthetic data\nsets used in previous work. 
Our proposal is easily amenable to scalable,\nfaster, parallel implementations of $k$-means++ \\cite{Bahmani,BachemL017} and\nis of independent interest for coreset constructions in the presence of\noutliers \\cite{feldman2007ptas,langberg2010universal,feldman2011unified}.\n","authors":["Amit Deshpande","Rameshwar Pratap"],"pdf_url":"https://arxiv.org/pdf/2309.02710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02705v1","updated":"2023-09-06T04:37:20Z","published":"2023-09-06T04:37:20Z","title":"Certifying LLM Safety against Adversarial Prompting","summary":" Large language models (LLMs) released for public use incorporate guardrails\nto ensure their output is safe, often referred to as \"model alignment.\" An\naligned language model should decline a user's request to produce harmful\ncontent. However, such safety measures are vulnerable to adversarial prompts,\nwhich contain maliciously designed token sequences to circumvent the model's\nsafety guards and cause it to produce harmful content. In this work, we\nintroduce erase-and-check, the first framework to defend against adversarial\nprompts with verifiable safety guarantees. We erase tokens individually and\ninspect the resulting subsequences using a safety filter. Our procedure labels\nthe input prompt as harmful if any subsequences or the input prompt are\ndetected as harmful by the filter. This guarantees that any adversarial\nmodification of a harmful prompt up to a certain size is also labeled harmful.\nWe defend against three attack modes: i) adversarial suffix, which appends an\nadversarial sequence at the end of the prompt; ii) adversarial insertion, where\nthe adversarial sequence is inserted anywhere in the middle of the prompt; and\niii) adversarial infusion, where adversarial tokens are inserted at arbitrary\npositions in the prompt, not necessarily as a contiguous block. Empirical\nresults demonstrate that our technique obtains strong certified safety\nguarantees on harmful prompts while maintaining good performance on safe\nprompts. For example, against adversarial suffixes of length 20, it certifiably\ndetects 93% of the harmful prompts and labels 94% of the safe prompts as safe\nusing the open source language model Llama 2 as the safety filter.\n","authors":["Aounon Kumar","Chirag Agarwal","Suraj Srinivas","Soheil Feizi","Hima Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2309.02705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01392v2","updated":"2023-09-06T04:28:39Z","published":"2023-09-04T06:44:46Z","title":"Differentiable Bayesian Structure Learning with Acyclicity Assurance","summary":" Score-based approaches in the structure learning task are thriving because of\ntheir scalability. Continuous relaxation has been the key reason for this\nadvancement. Despite achieving promising outcomes, most of these methods are\nstill struggling to ensure that the graphs generated from the latent space are\nacyclic by minimizing a defined score. There has also been another trend of\npermutation-based approaches, which concern the search for the topological\nordering of the variables in the directed acyclic graph in order to limit the\nsearch space of the graph. In this study, we propose an alternative approach\nfor strictly constraining the acyclicty of the graphs with an integration of\nthe knowledge from the topological orderings. 
Our approach can reduce inference\ncomplexity while ensuring that the generated graphs are acyclic.\nOur empirical experiments with simulated and real-world data show that our\napproach can outperform related Bayesian score-based approaches.\n","authors":["Quang-Duy Tran","Phuoc Nguyen","Bao Duong","Thin Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.01392v2.pdf","comment":"Accepted as a regular paper (9.37%) at the 23rd IEEE International\n Conference on Data Mining (ICDM 2023)"},{"id":"http://arxiv.org/abs/2309.02064v2","updated":"2023-09-06T04:27:58Z","published":"2023-09-05T09:06:34Z","title":"MvFS: Multi-view Feature Selection for Recommender System","summary":" Feature selection, which is a technique to select key features in recommender\nsystems, has received increasing research attention. Recently, Adaptive Feature\nSelection (AdaFS) has shown remarkable performance by adaptively selecting\nfeatures for each data instance, considering that the importance of a given\nfeature field can vary significantly across data. However, this method still\nhas limitations in that its selection process could be easily biased to major\nfeatures that frequently occur. To address these problems, we propose\nMulti-view Feature Selection (MvFS), which selects informative features for\neach instance more effectively. Most importantly, MvFS employs a multi-view\nnetwork consisting of multiple sub-networks, each of which learns to measure\nthe feature importance of a part of data with different feature patterns. By\ndoing so, MvFS mitigates the bias problem towards dominant patterns and\npromotes a more balanced feature selection process. Moreover, MvFS adopts an\neffective importance score modeling strategy which is applied independently to\neach field without incurring dependency among features. Experimental results on\nreal-world datasets demonstrate the effectiveness of MvFS compared to\nstate-of-the-art baselines.\n","authors":["Youngjune Lee","Yeongjong Jeong","Keunchan Park","SeongKu Kang"],"pdf_url":"https://arxiv.org/pdf/2309.02064v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2304.04027v4","updated":"2023-09-06T04:13:42Z","published":"2023-04-08T14:40:35Z","title":"Estimating 3D Dental Structures using Simulated Panoramic Radiographs\n and Neural Ray Tracing","summary":" Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality\nfor dental examination. However, PX only provides a flattened 2D image, lacking\na 3D view of the oral structure. In this paper, we propose a framework to\nestimate 3D oral structures from real-world PX. Our framework tackles full 3D\nreconstruction for varying subjects (patients) where each reconstruction is\nbased only on a single panoramic image. We create an intermediate\nrepresentation called simulated PX (SimPX) from 3D Cone-beam computed\ntomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and\nrotational principles of PX imaging. SimPX aims not only to truthfully\nsimulate PX, but also to facilitate the reverting process back to 3D data. We\npropose a novel neural model based on ray tracing which exploits both global\nand local input features to convert SimPX to 3D output. At inference, a real PX\nimage is translated to a SimPX-style image with semantic regularization, and\nthe translated image is processed by a generation module to produce high-quality\noutputs. Experiments show that our method outperforms prior state-of-the-art in\nreconstruction tasks both quantitatively and qualitatively. 
Unlike prior\nmethods, Our method does not require any prior information such as the shape of\ndental arches, nor the matched PX-CBCT dataset for training, which is difficult\nto obtain in clinical practice.\n","authors":["Sihwa Park","Seongjun Kim","Doeyoung Kwon","Yohan Jang","In-Seok Song","Seungjun Baek"],"pdf_url":"https://arxiv.org/pdf/2304.04027v4.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2211.04218v2","updated":"2023-09-06T04:05:54Z","published":"2022-11-08T13:04:56Z","title":"Clustered Federated Learning based on Nonconvex Pairwise Fusion","summary":" This study investigates clustered federated learning (FL), one of the\nformulations of FL with non-i.i.d. data, where the devices are partitioned into\nclusters and each cluster optimally fits its data with a localized model. We\npropose a clustered FL framework that incorporates a nonconvex penalty to\npairwise differences of parameters. This framework can automatically identify\ncluster structures without a priori knowledge of the number of clusters and the\nset of devices in each cluster. To implement the proposed framework, we\nintroduce a novel clustered FL method called Fusion Penalized Federated\nClustering (FPFC). Building upon the standard alternating direction method of\nmultipliers (ADMM), FPFC is implemented in parallel, updates only a subset of\ndevices at each communication round, and allows for variable workload per\ndevice. These strategies significantly reduce the communication cost while\nensuring privacy, making it practical for FL. We also propose a new warmup\nstrategy for hyperparameter tuning in FL settings and explore the asynchronous\nvariant of FPFC (asyncFPFC). Theoretical analysis provides convergence\nguarantees for FPFC with general nonconvex losses and establishes the\nstatistical convergence rate under a linear model with squared loss. Extensive\nexperiments demonstrate the advantages of FPFC over existing methods, including\nrobustness and generalization capability.\n","authors":["Xue Yu","Ziyi Liu","Wu Wang","Yifan Sun"],"pdf_url":"https://arxiv.org/pdf/2211.04218v2.pdf","comment":"46 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02084v2","updated":"2023-09-06T03:53:38Z","published":"2023-09-05T09:42:15Z","title":"An Efficient Approach to Unsupervised Out-of-Distribution Detection with\n Variational Autoencoders","summary":" This paper is concerned with deep generative models (DGMs) for unsupervised\nout-of-distribution (OOD) detection. In particular, we focus on vanilla\nVariational Autoencoders (VAE) that use a standard normal prior distribution\nfor the latent variables. These models have a smaller model size, enabling\nfaster training and inference, making them well-suited for resource-limited\napplications compared to more complex DGMs. We propose a novel OOD score called\nError Reduction (ER) specifically designed for vanilla VAE. ER incorporate the\nidea of reconstructing image inputs from their lossy counterparts and takes\ninto account the Kolmogorov complexity of the images. Experimental results on\ndiverse datasets demonstrate the superiority of our approach over baseline\nmethods. 
Our code is available at: https://github.com/ZJLAB-AMMI/VAE4OOD.\n","authors":["Zezhen Zeng","Bin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02084v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2309.02685v1","updated":"2023-09-06T03:42:20Z","published":"2023-09-06T03:42:20Z","title":"Diffusion-EDFs: Bi-equivariant Denoising Generative Modeling on SE(3)\n for Visual Robotic Manipulation","summary":" Recent studies have verified that equivariant methods can significantly\nimprove the data efficiency, generalizability, and robustness in robot\nlearning. Meanwhile, denoising diffusion-based generative modeling has recently\ngained significant attention as a promising approach for robotic manipulation\nlearning from demonstrations with stochastic behaviors. In this paper, we\npresent Diffusion-EDFs, a novel approach that incorporates spatial\nroto-translation equivariance, i.e., SE(3)-equivariance to diffusion generative\nmodeling. By integrating SE(3)-equivariance into our model architectures, we\ndemonstrate that our proposed method exhibits remarkable data efficiency,\nrequiring only 5 to 10 task demonstrations for effective end-to-end training.\nFurthermore, our approach showcases superior generalizability compared to\nprevious diffusion-based manipulation methods.\n","authors":["Hyunwoo Ryu","Jiwoo Kim","Junwoo Chang","Hyun Seok Ahn","Joohwan Seo","Taehan Kim","Jongeun Choi","Roberto Horowitz"],"pdf_url":"https://arxiv.org/pdf/2309.02685v1.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.01069v2","updated":"2023-09-06T03:29:18Z","published":"2023-09-03T03:54:43Z","title":"Separable Hamiltonian Neural Networks","summary":" The modelling of dynamical systems from discrete observations is a challenge\nfaced by modern scientific and engineering data systems. Hamiltonian systems\nare one such fundamental and ubiquitous class of dynamical systems. Hamiltonian\nneural networks are state-of-the-art models that unsupervised-ly regress the\nHamiltonian of a dynamical system from discrete observations of its vector\nfield under the learning bias of Hamilton's equations. Yet Hamiltonian dynamics\nare often complicated, especially in higher dimensions where the state space of\nthe Hamiltonian system is large relative to the number of samples. A recently\ndiscovered remedy to alleviate the complexity between state variables in the\nstate space is to leverage the additive separability of the Hamiltonian system\nand embed that additive separability into the Hamiltonian neural network.\nFollowing the nomenclature of physics-informed machine learning, we propose\nthree separable Hamiltonian neural networks. These models embed additive\nseparability within Hamiltonian neural networks. The first model uses additive\nseparability to quadratically scale the amount of data for training Hamiltonian\nneural networks. The second model embeds additive separability within the loss\nfunction of the Hamiltonian neural network. The third model embeds additive\nseparability through the architecture of the Hamiltonian neural network using\nconjoined multilayer perceptions. 
We empirically compare the three models\nagainst state-of-the-art Hamiltonian neural networks, and demonstrate that the\nseparable Hamiltonian neural networks, which alleviate complexity between the\nstate variables, are more effective at regressing the Hamiltonian and its\nvector field.\n","authors":["Zi-Yu Khoo","Jonathan Sze Choong Low","Stéphane Bressan"],"pdf_url":"https://arxiv.org/pdf/2309.01069v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2309.02671v1","updated":"2023-09-06T02:40:33Z","published":"2023-09-06T02:40:33Z","title":"RLSynC: Offline-Online Reinforcement Learning for Synthon Completion","summary":" Retrosynthesis is the process of determining the set of reactant molecules\nthat can react to form a desired product. Semi-template-based retrosynthesis\nmethods, which imitate the reverse logic of synthesis reactions, first predict\nthe reaction centers in the products, and then complete the resulting synthons\nback into reactants. These methods enable necessary interpretability and high\npractical utility to inform synthesis planning. We develop a new offline-online\nreinforcement learning method RLSynC for synthon completion in\nsemi-template-based methods. RLSynC assigns one agent to each synthon, all of\nwhich complete the synthons by conducting actions step by step in a\nsynchronized fashion. RLSynC learns the policy from both offline training\nepisodes and online interactions which allow RLSynC to explore new reaction\nspaces. RLSynC uses a forward synthesis model to evaluate the likelihood of the\npredicted reactants in synthesizing a product, and thus guides the action\nsearch. We compare RLSynC with the state-of-the-art retrosynthesis methods. Our\nexperimental results demonstrate that RLSynC can outperform these methods with\nimprovement as high as 14.9% on synthon completion, and 14.0% on\nretrosynthesis, highlighting its potential in synthesis planning.\n","authors":["Frazier N. Baker","Ziqi Chen","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2309.02671v1.pdf","comment":"11 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2301.12811v3","updated":"2023-09-06T02:36:58Z","published":"2023-01-30T12:03:44Z","title":"SAN: Inducing Metrizability of GAN with Discriminative Normalized Linear\n Layer","summary":" Generative adversarial networks (GANs) learn a target probability\ndistribution by optimizing a generator and a discriminator with minimax\nobjectives. This paper addresses the question of whether such optimization\nactually provides the generator with gradients that make its distribution close\nto the target distribution. We derive metrizable conditions, sufficient\nconditions for the discriminator to serve as the distance between the\ndistributions by connecting the GAN formulation with the concept of sliced\noptimal transport. Furthermore, by leveraging these theoretical results, we\npropose a novel GAN training scheme, called slicing adversarial network (SAN).\nWith only simple modifications, a broad class of existing GANs can be converted\nto SANs. Experiments on synthetic and image datasets support our theoretical\nresults and the SAN's effectiveness as compared to usual GANs. 
Furthermore, we\nalso apply SAN to StyleGAN-XL, which leads to state-of-the-art FID score\namongst GANs for class conditional generation on ImageNet 256$\\times$256.\n","authors":["Yuhta Takida","Masaaki Imaizumi","Takashi Shibuya","Chieh-Hsin Lai","Toshimitsu Uesaka","Naoki Murata","Yuki Mitsufuji"],"pdf_url":"https://arxiv.org/pdf/2301.12811v3.pdf","comment":"24 pages with 13 figures"},{"id":"http://arxiv.org/abs/2309.02669v1","updated":"2023-09-06T02:35:46Z","published":"2023-09-06T02:35:46Z","title":"Marketing Budget Allocation with Offline Constrained Deep Reinforcement\n Learning","summary":" We study the budget allocation problem in online marketing campaigns that\nutilize previously collected offline data. We first discuss the long-term\neffect of optimizing marketing budget allocation decisions in the offline\nsetting. To overcome the challenge, we propose a novel game-theoretic offline\nvalue-based reinforcement learning method using mixed policies. The proposed\nmethod reduces the need to store infinitely many policies in previous methods\nto only constantly many policies, which achieves nearly optimal policy\nefficiency, making it practical and favorable for industrial usage. We further\nshow that this method is guaranteed to converge to the optimal policy, which\ncannot be achieved by previous value-based reinforcement learning methods for\nmarketing budget allocation. Our experiments on a large-scale marketing\ncampaign with tens-of-millions users and more than one billion budget verify\nthe theoretical results and show that the proposed method outperforms various\nbaseline methods. The proposed method has been successfully deployed to serve\nall the traffic of this marketing campaign.\n","authors":["Tianchi Cai","Jiyan Jiang","Wenpeng Zhang","Shiji Zhou","Xierui Song","Li Yu","Lihong Gu","Xiaodong Zeng","Jinjie Gu","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02669v1.pdf","comment":"WSDM 23, Best Paper Candidate"},{"id":"http://arxiv.org/abs/2201.03968v2","updated":"2023-09-06T02:30:02Z","published":"2022-01-10T00:27:43Z","title":"Optimal and Differentially Private Data Acquisition: Central and Local\n Mechanisms","summary":" We consider a platform's problem of collecting data from privacy sensitive\nusers to estimate an underlying parameter of interest. We formulate this\nquestion as a Bayesian-optimal mechanism design problem, in which an individual\ncan share her (verifiable) data in exchange for a monetary reward or services,\nbut at the same time has a (private) heterogeneous privacy cost which we\nquantify using differential privacy. We consider two popular differential\nprivacy settings for providing privacy guarantees for the users: central and\nlocal. In both settings, we establish minimax lower bounds for the estimation\nerror and derive (near) optimal estimators for given heterogeneous privacy loss\nlevels for users. Building on this characterization, we pose the mechanism\ndesign problem as the optimal selection of an estimator and payments that will\nelicit truthful reporting of users' privacy sensitivities. Under a regularity\ncondition on the distribution of privacy sensitivities we develop efficient\nalgorithmic mechanisms to solve this problem in both privacy settings. 
Our\nmechanism in the central setting can be implemented in time $\\mathcal{O}(n \\log\nn)$ where $n$ is the number of users and our mechanism in the local setting\nadmits a Polynomial Time Approximation Scheme (PTAS).\n","authors":["Alireza Fallah","Ali Makhdoumi","Azarakhsh Malekian","Asuman Ozdaglar"],"pdf_url":"https://arxiv.org/pdf/2201.03968v2.pdf","comment":"To appear in the Operations Research journal. The abstract appeared\n in the Proceedings of the 23rd ACM Conference on Economics and Computation\n (EC 2022)"},{"id":"http://arxiv.org/abs/2208.05318v2","updated":"2023-09-06T02:29:01Z","published":"2022-08-10T12:55:56Z","title":"Generative Action Description Prompts for Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has recently received considerable\nattention. Current approaches to skeleton-based action recognition are\ntypically formulated as one-hot classification tasks and do not fully exploit\nthe semantic relations between actions. For example, \"make victory sign\" and\n\"thumb up\" are two actions of hand gestures, whose major difference lies in the\nmovement of hands. This information is agnostic from the categorical one-hot\nencoding of action classes but could be unveiled from the action description.\nTherefore, utilizing action description in training could potentially benefit\nrepresentation learning. In this work, we propose a Generative\nAction-description Prompts (GAP) approach for skeleton-based action\nrecognition. More specifically, we employ a pre-trained large-scale language\nmodel as the knowledge engine to automatically generate text descriptions for\nbody parts movements of actions, and propose a multi-modal training scheme by\nutilizing the text encoder to generate feature vectors for different body parts\nand supervise the skeleton encoder for action representation learning.\nExperiments show that our proposed GAP method achieves noticeable improvements\nover various baseline models without extra computation cost at inference. GAP\nachieves new state-of-the-arts on popular skeleton-based action recognition\nbenchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is\navailable at https://github.com/MartinXM/GAP.\n","authors":["Wangmeng Xiang","Chao Li","Yuxuan Zhou","Biao Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.05318v2.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2309.01167v2","updated":"2023-09-06T02:15:41Z","published":"2023-09-03T13:14:46Z","title":"Symbolically integrating tensor networks over various random tensors --\n the second version of Python RTNI","summary":" We are upgrading the Python-version of RTNI, which symbolically integrates\ntensor networks over the Haar-distributed unitary matrices. Now, PyRTNI2 can\ntreat the Haar-distributed orthogonal matrices and the real and complex normal\nGaussian tensors as well. Moreover, it can export tensor networks in the format\nof TensorNetwork so that one can make further calculations with concrete\ntensors, even for low dimensions, where the Weingarten functions differ from\nthe ones for high dimensions. The tutorial notebooks are found at GitHub:\nhttps://github.com/MotohisaFukuda/PyRTNI2. In this paper, we explain maths\nbehind the program and show what kind of tensor network calculations can be\nmade with it. 
For the former, we interpret the element-wise moment calculus of\nthe above random matrices and tensors in terms of tensor network diagrams, and\nargue that the view is natural, relating delta functions in the calculus to\nedges in tensor network diagrams.\n","authors":["Motohisa Fukuda"],"pdf_url":"https://arxiv.org/pdf/2309.01167v2.pdf","comment":"PyRTNI2 is at https://github.com/MotohisaFukuda/PyRTNI2"},{"id":"http://arxiv.org/abs/2210.12860v3","updated":"2023-09-06T02:02:19Z","published":"2022-10-23T21:24:37Z","title":"Explicit Second-Order Min-Max Optimization Methods with Optimal\n Convergence Guarantee","summary":" We propose and analyze exact and inexact regularized Newton-type methods for\nfinding a global saddle point of \\emph{convex-concave} unconstrained min-max\noptimization problems. Compared to first-order methods, our understanding of\nsecond-order methods for min-max optimization is relatively limited, as\nobtaining global rates of convergence with second-order information is much\nmore involved. In this paper, we examine how second-order information can be\nused to speed up extra-gradient methods, even under inexactness. Specifically,\nwe show that the proposed algorithms generate iterates that remain within a\nbounded set and the averaged iterates converge to an $\\epsilon$-saddle point\nwithin $O(\\epsilon^{-2/3})$ iterations in terms of a restricted gap function.\nOur algorithms match the theoretically established lower bound in this context\nand our analysis provides a simple and intuitive convergence analysis for\nsecond-order methods without any boundedness requirements. Finally, we present\na series of numerical experiments on synthetic and real data that demonstrate\nthe efficiency of the proposed algorithms.\n","authors":["Tianyi Lin","Panayotis Mertikopoulos","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2210.12860v3.pdf","comment":"Improve the paper significantly; 35 pages, 9 figures"},{"id":"http://arxiv.org/abs/2303.00973v2","updated":"2023-09-06T01:48:56Z","published":"2023-03-02T05:10:57Z","title":"Image Labels Are All You Need for Coarse Seagrass Segmentation","summary":" Seagrass meadows serve as critical carbon sinks, but estimating the amount of\ncarbon they store requires knowledge of the seagrass species present.\nUnderwater and surface vehicles equipped with machine learning algorithms can\nhelp to accurately estimate the composition and extent of seagrass meadows at\nscale. However, previous approaches for seagrass detection and classification\nhave required supervision from patch-level labels. In this paper, we reframe\nseagrass classification as a weakly supervised coarse segmentation problem\nwhere image-level labels are used during training (25 times fewer labels\ncompared to patch-level labeling) and patch-level outputs are obtained at\ninference time. To this end, we introduce SeaFeats, an architecture that uses\nunsupervised contrastive pre-training and feature similarity, and SeaCLIP, a\nmodel that showcases the effectiveness of large language models as a\nsupervisory signal in domain-specific applications. We demonstrate that an\nensemble of SeaFeats and SeaCLIP leads to highly robust performance. Our method\noutperforms previous approaches that require patch-level labels on the\nmulti-species 'DeepSeagrass' dataset by 6.8% (absolute) for the class-weighted\nF1 score, and by 12.1% (absolute) for the seagrass presence/absence F1 score on\nthe 'Global Wetlands' dataset. 
We also present two case studies for real-world\ndeployment: outlier detection on the Global Wetlands dataset, and application\nof our method on imagery collected by the FloatyBoat autonomous surface\nvehicle.\n","authors":["Scarlett Raine","Ross Marchant","Brano Kusy","Frederic Maire","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2303.00973v2.pdf","comment":"10 pages, 4 figures, additional 3 pages of supplementary material"},{"id":"http://arxiv.org/abs/2309.02651v1","updated":"2023-09-06T01:25:30Z","published":"2023-09-06T01:25:30Z","title":"Contrastive Learning as Kernel Approximation","summary":" In standard supervised machine learning, it is necessary to provide a label\nfor every input in the data. While raw data in many application domains is\neasily obtainable on the Internet, manual labelling of this data is\nprohibitively expensive. To circumvent this issue, contrastive learning methods\nproduce low-dimensional vector representations (also called features) of\nhigh-dimensional inputs on large unlabelled datasets. This is done by training\nwith a contrastive loss function, which enforces that similar inputs have high\ninner product and dissimilar inputs have low inner product in the feature\nspace. Rather than annotating each input individually, it suffices to define a\nmeans of sampling pairs of similar and dissimilar inputs. Contrastive features\ncan then be fed as inputs to supervised learning systems on much smaller\nlabelled datasets to obtain high accuracy on end tasks of interest.\n The goal of this thesis is to provide an overview of the current theoretical\nunderstanding of contrastive learning, specifically as it pertains to the\nminimizers of contrastive loss functions and their relationship to prior\nmethods for learning features from unlabelled data. We highlight popular\ncontrastive loss functions whose minimizers implicitly approximate a positive\nsemidefinite (PSD) kernel. The latter is a well-studied object in functional\nanalysis and learning theory that formalizes a notion of similarity between\nelements of a space. PSD kernels provide an implicit definition of features\nthrough the theory of reproducing kernel Hilbert spaces.\n","authors":["Konstantinos Christopher Tsiolis"],"pdf_url":"https://arxiv.org/pdf/2309.02651v1.pdf","comment":"Master's (M.Sc.) Thesis"},{"id":"http://arxiv.org/abs/2006.02482v4","updated":"2023-09-06T01:04:40Z","published":"2020-06-03T19:02:34Z","title":"Explaining the Behavior of Black-Box Prediction Algorithms with Causal\n Learning","summary":" Causal approaches to post-hoc explainability for black-box prediction models\n(e.g., deep neural networks trained on image pixel data) have become\nincreasingly popular. However, existing approaches have two important\nshortcomings: (i) the \"explanatory units\" are micro-level inputs into the\nrelevant prediction model, e.g., image pixels, rather than interpretable\nmacro-level features that are more useful for understanding how to possibly\nchange the algorithm's behavior, and (ii) existing approaches assume there\nexists no unmeasured confounding between features and target model predictions,\nwhich fails to hold when the explanatory units are macro-level variables. Our\nfocus is on the important setting where the analyst has no access to the inner\nworkings of the target prediction algorithm, rather only the ability to query\nthe output of the model in response to a particular input. 
To provide causal\nexplanations in such a setting, we propose to learn causal graphical\nrepresentations that allow for arbitrary unmeasured confounding among features.\nWe demonstrate the resulting graph can differentiate between interpretable\nfeatures that causally influence model predictions versus those that are merely\nassociated with model predictions due to confounding. Our approach is motivated\nby a counterfactual theory of causal explanation wherein good explanations\npoint to factors that are \"difference-makers\" in an interventionist sense.\n","authors":["Numair Sani","Daniel Malinsky","Ilya Shpitser"],"pdf_url":"https://arxiv.org/pdf/2006.02482v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02641v1","updated":"2023-09-06T01:03:14Z","published":"2023-09-06T01:03:14Z","title":"TFBEST: Dual-Aspect Transformer with Learnable Positional Encoding for\n Failure Prediction","summary":" Hard Disk Drive (HDD) failures in datacenters are costly - from catastrophic\ndata loss to a question of goodwill, stakeholders want to avoid it like the\nplague. An important tool in proactively monitoring against HDD failure is\ntimely estimation of the Remaining Useful Life (RUL). To this end, the\nSelf-Monitoring, Analysis and Reporting Technology employed within HDDs\n(S.M.A.R.T.) provide critical logs for long-term maintenance of the security\nand dependability of these essential data storage devices. Data-driven\npredictive models in the past have used these S.M.A.R.T. logs and CNN/RNN based\narchitectures heavily. However, they have suffered significantly in providing a\nconfidence interval around the predicted RUL values as well as in processing\nvery long sequences of logs. In addition, some of these approaches, such as\nthose based on LSTMs, are inherently slow to train and have tedious feature\nengineering overheads. To overcome these challenges, in this work we propose a\nnovel transformer architecture - a Temporal-fusion Bi-encoder Self-attention\nTransformer (TFBEST) for predicting failures in hard-drives. It is an\nencoder-decoder based deep learning technique that enhances the context gained\nfrom understanding health statistics sequences and predicts a sequence of the\nnumber of days remaining before a disk potentially fails. In this paper, we\nalso provide a novel confidence margin statistic that can help manufacturers\nreplace a hard-drive within a time frame. Experiments on Seagate HDD data show\nthat our method significantly outperforms the state-of-the-art RUL prediction\nmethods during testing over the exhaustive 10-year data from Backblaze\n(2013-present). Although validated on HDD failure prediction, the TFBEST\narchitecture is well-suited for other prognostics applications and may be\nadapted for allied regression problems.\n","authors":["Rohan Mohapatra","Saptarshi Sengupta"],"pdf_url":"https://arxiv.org/pdf/2309.02641v1.pdf","comment":"9 pages, 6 figures, 2 tables"},{"id":"http://arxiv.org/abs/2309.02640v1","updated":"2023-09-06T00:59:27Z","published":"2023-09-06T00:59:27Z","title":"Epi-Curriculum: Episodic Curriculum Learning for Low-Resource Domain\n Adaptation in Neural Machine Translation","summary":" Neural Machine Translation (NMT) models have become successful, but their\nperformance remains poor when translating on new domains with a limited number\nof data. In this paper, we present a novel approach Epi-Curriculum to address\nlow-resource domain adaptation (DA), which contains a new episodic training\nframework along with denoised curriculum learning. 
Our episodic training\nframework enhances the model's robustness to domain shift by episodically\nexposing the encoder/decoder to an inexperienced decoder/encoder. The denoised\ncurriculum learning filters the noised data and further improves the model's\nadaptability by gradually guiding the learning process from easy to more\ndifficult tasks. Experiments on English-German and English-Romanian translation\nshow that: (i) Epi-Curriculum improves both model's robustness and adaptability\nin seen and unseen domains; (ii) Our episodic training framework enhances the\nencoder and decoder's robustness to domain shift.\n","authors":["Keyu Chen","Di Zhuang","Mingchen Li","J. Morris Chang"],"pdf_url":"https://arxiv.org/pdf/2309.02640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02636v1","updated":"2023-09-06T00:56:24Z","published":"2023-09-06T00:56:24Z","title":"Multiclass Alignment of Confidence and Certainty for Network Calibration","summary":" Deep neural networks (DNNs) have made great strides in pushing the\nstate-of-the-art in several challenging domains. Recent studies reveal that\nthey are prone to making overconfident predictions. This greatly reduces the\noverall trust in model predictions, especially in safety-critical applications.\nEarly work in improving model calibration employs post-processing techniques\nwhich rely on limited parameters and require a hold-out set. Some recent\ntrain-time calibration methods, which involve all model parameters, can\noutperform the postprocessing methods. To this end, we propose a new train-time\ncalibration method, which features a simple, plug-and-play auxiliary loss known\nas multi-class alignment of predictive mean confidence and predictive certainty\n(MACC). It is based on the observation that a model miscalibration is directly\nrelated to its predictive certainty, so a higher gap between the mean\nconfidence and certainty amounts to a poor calibration both for in-distribution\nand out-of-distribution predictions. Armed with this insight, our proposed loss\nexplicitly encourages a confident (or underconfident) model to also provide a\nlow (or high) spread in the presoftmax distribution. Extensive experiments on\nten challenging datasets, covering in-domain, out-domain, non-visual\nrecognition and medical image classification scenarios, show that our method\nachieves state-of-the-art calibration performance for both in-domain and\nout-domain predictions. Our code and models will be publicly released.\n","authors":["Vinith Kugathasan","Muhammad Haris Khan"],"pdf_url":"https://arxiv.org/pdf/2309.02636v1.pdf","comment":"Accepted at GCPR 2023"},{"id":"http://arxiv.org/abs/2206.06817v2","updated":"2023-09-06T00:50:03Z","published":"2022-06-14T13:11:22Z","title":"A novel physics-informed machine learning strategy to accelerate\n unsteady heat and mass transfer simulations","summary":" Despite the rapid advancements in the performance of central processing units\n(CPUs), the simulation of unsteady heat and mass transfer is computationally\nvery costly, particularly in large domains. While a big wave of machine\nlearning (ML) has propagated in accelerating computational fluid dynamics (CFD)\nstudies, recent research has revealed that it is unrealistic to completely\nsuppress the error increase as the gap between the training and prediction\ntimes increases in single training approach. 
In this study, we propose a\nresidual-based physics-informed transfer learning (RePIT) strategy to\naccelerate unsteady heat and mass transfer simulations using ML-CFD cross\ncomputation. Our hypothesis is that long-term CFD simulations become feasible\nif continuous ML-CFD cross computation is periodically carried out to not only\nreduce increased residuals but also update network parameters with the latest\nCFD time series data (transfer learning approach). The cross point of ML-CFD is\ndetermined using a method similar to residual monitoring methods of first\nprinciple solvers (physics-informed manner). The feasibility of the proposed\nstrategy was evaluated based on natural convection simulation and compared to\nthe single training approach. In the single training approach, a residual scale\nchange occurred around 100 timesteps leading to predicted time series\nexhibiting non-physical pattern as well as a large difference from the ground\ntruth. Conversely, it was confirmed that the RePIT strategy maintained the\ncontinuity residual within the set range and showed good agreement with the\nground truth for all variables and locations. The simulation was accelerated by\n1.9 times, including the parameter-updating time. In conclusion, this universal\nstrategy has the potential to significantly reduce the computational cost of\nCFD simulations while maintaining high accuracy.\n","authors":["Joongoo Jeon","Juhyeong Lee","Ricardo Vinuesa","Sung Joong Kim"],"pdf_url":"https://arxiv.org/pdf/2206.06817v2.pdf","comment":"30 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.02632v1","updated":"2023-09-06T00:44:29Z","published":"2023-09-06T00:44:29Z","title":"Deep Reinforcement Learning from Hierarchical Weak Preference Feedback","summary":" Reward design is a fundamental, yet challenging aspect of practical\nreinforcement learning (RL). For simple tasks, researchers typically handcraft\nthe reward function, e.g., using a linear combination of several reward\nfactors. However, such reward engineering is subject to approximation bias,\nincurs large tuning cost, and often cannot provide the granularity required for\ncomplex tasks. To avoid these difficulties, researchers have turned to\nreinforcement learning from human feedback (RLHF), which learns a reward\nfunction from human preferences between pairs of trajectory sequences. By\nleveraging preference-based reward modeling, RLHF learns complex rewards that\nare well aligned with human preferences, allowing RL to tackle increasingly\ndifficult problems. Unfortunately, the applicability of RLHF is limited due to\nthe high cost and difficulty of obtaining human preference data. In light of\nthis cost, we investigate learning reward functions for complex tasks with less\nhuman effort; simply by ranking the importance of the reward factors. More\nspecifically, we propose a new RL framework -- HERON, which compares\ntrajectories using a hierarchical decision tree induced by the given ranking.\nThese comparisons are used to train a preference-based reward model, which is\nthen used for policy learning. We find that our framework can not only train\nhigh performing agents on a variety of difficult tasks, but also provide\nadditional benefits such as improved sample efficiency and robustness. 
Our code\nis available at https://github.com/abukharin3/HERON.\n","authors":["Alexander Bukharin","Yixiao Li","Pengcheng He","Weizhu Chen","Tuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.02632v1.pdf","comment":"28 Pages, 15 figures"},{"id":"http://arxiv.org/abs/2306.03933v3","updated":"2023-09-06T00:36:42Z","published":"2023-06-06T18:01:03Z","title":"High-dimensional and Permutation Invariant Anomaly Detection","summary":" Methods for anomaly detection of new physics processes are often limited to\nlow-dimensional spaces due to the difficulty of learning high-dimensional\nprobability densities. Particularly at the constituent level, incorporating\ndesirable properties such as permutation invariance and variable-length inputs\nbecomes difficult within popular density estimation methods. In this work, we\nintroduce a permutation-invariant density estimator for particle physics data\nbased on diffusion models, specifically designed to handle variable-length\ninputs. We demonstrate the efficacy of our methodology by utilizing the learned\ndensity as a permutation-invariant anomaly detection score, effectively\nidentifying jets with low likelihood under the background-only hypothesis. To\nvalidate our density estimation method, we investigate the ratio of learned\ndensities and compare to those obtained by a supervised classification\nalgorithm.\n","authors":["Vinicius Mikuni","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2306.03933v3.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2210.03505v3","updated":"2023-09-06T00:35:30Z","published":"2022-10-07T12:50:34Z","title":"Sample-Efficient Personalization: Modeling User Parameters as Low Rank\n Plus Sparse Components","summary":" Personalization of machine learning (ML) predictions for individual\nusers/domains/enterprises is critical for practical recommendation systems.\nStandard personalization approaches involve learning a user/domain specific\nembedding that is fed into a fixed global model which can be limiting. On the\nother hand, personalizing/fine-tuning model itself for each user/domain --\na.k.a meta-learning -- has high storage/infrastructure cost. Moreover, rigorous\ntheoretical studies of scalable personalization approaches have been very\nlimited. To address the above issues, we propose a novel meta-learning style\napproach that models network weights as a sum of low-rank and sparse\ncomponents. This captures common information from multiple individuals/users\ntogether in the low-rank part while sparse part captures user-specific\nidiosyncrasies. We then study the framework in the linear setting, where the\nproblem reduces to that of estimating the sum of a rank-$r$ and a $k$-column\nsparse matrix using a small number of linear measurements. We propose a\ncomputationally efficient alternating minimization method with iterative hard\nthresholding -- AMHT-LRS -- to learn the low-rank and sparse part.\nTheoretically, for the realizable Gaussian data setting, we show that AMHT-LRS\nsolves the problem efficiently with nearly optimal sample complexity. Finally,\na significant challenge in personalization is ensuring privacy of each user's\nsensitive data. 
We alleviate this problem by proposing a differentially private\nvariant of our method that also is equipped with strong generalization\nguarantees.\n","authors":["Soumyabrata Pal","Prateek Varshney","Prateek Jain","Abhradeep Guha Thakurta","Gagan Madan","Gaurav Aggarwal","Pradeep Shenoy","Gaurav Srivastava"],"pdf_url":"https://arxiv.org/pdf/2210.03505v3.pdf","comment":"104 pages, 7 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2203.09096v4","updated":"2023-09-06T00:30:04Z","published":"2022-03-17T05:42:00Z","title":"DeepAD: A Robust Deep Learning Model of Alzheimer's Disease Progression\n for Real-World Clinical Applications","summary":" The ability to predict the future trajectory of a patient is a key step\ntoward the development of therapeutics for complex diseases such as Alzheimer's\ndisease (AD). However, most machine learning approaches developed for\nprediction of disease progression are either single-task or single-modality\nmodels, which can not be directly adopted to our setting involving multi-task\nlearning with high dimensional images. Moreover, most of those approaches are\ntrained on a single dataset (i.e. cohort), which can not be generalized to\nother cohorts. We propose a novel multimodal multi-task deep learning model to\npredict AD progression by analyzing longitudinal clinical and neuroimaging data\nfrom multiple cohorts. Our proposed model integrates high dimensional MRI\nfeatures from a 3D convolutional neural network with other data modalities,\nincluding clinical and demographic information, to predict the future\ntrajectory of patients. Our model employs an adversarial loss to alleviate the\nstudy-specific imaging bias, in particular the inter-study domain shifts. In\naddition, a Sharpness-Aware Minimization (SAM) optimization technique is\napplied to further improve model generalization. The proposed model is trained\nand tested on various datasets in order to evaluate and validate the results.\nOur results showed that 1) our model yields significant improvement over the\nbaseline models, and 2) models using extracted neuroimaging features from 3D\nconvolutional neural network outperform the same models when applied to\nMRI-derived volumetric features.\n","authors":["Somaye Hashemifar","Claudia Iriondo","Evan Casey","Mohsen Hejrati"],"pdf_url":"https://arxiv.org/pdf/2203.09096v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04014v2","updated":"2023-09-06T23:13:07Z","published":"2023-08-08T03:18:18Z","title":"Continual Pre-Training of Large Language Models: How to (re)warm your\n model?","summary":" Large language models (LLMs) are routinely pre-trained on billions of tokens,\nonly to restart the process over again once new data becomes available. A much\ncheaper and more efficient solution would be to enable the continual\npre-training of these models, i.e. updating pre-trained models with new data\ninstead of re-training them from scratch. However, the distribution shift\ninduced by novel data typically results in degraded performance on past data.\nTaking a step towards efficient continual pre-training, in this work, we\nexamine the effect of different warm-up strategies. Our hypothesis is that the\nlearning rate must be re-increased to improve compute efficiency when training\non a new dataset. We study the warmup phase of models pre-trained on the Pile\n(upstream data, 300B tokens) as we continue to pre-train on SlimPajama\n(downstream data, 297B tokens), following a linear warmup and cosine decay\nschedule. 
We conduct all experiments on the Pythia 410M language model\narchitecture and evaluate performance through validation perplexity. We\nexperiment with different pre-training checkpoints, various maximum learning\nrates, and various warmup lengths. Our results show that while rewarming models\nfirst increases the loss on upstream and downstream data, in the longer run it\nimproves the downstream performance, outperforming models trained from\nscratch$\\unicode{x2013}$even for a large downstream dataset.\n","authors":["Kshitij Gupta","Benjamin Thérien","Adam Ibrahim","Mats L. Richter","Quentin Anthony","Eugene Belilovsky","Irina Rish","Timothée Lesort"],"pdf_url":"https://arxiv.org/pdf/2308.04014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.00115v5","updated":"2023-09-06T23:01:25Z","published":"2021-09-30T22:54:46Z","title":"Comparing Sequential Forecasters","summary":" Consider two forecasters, each making a single prediction for a sequence of\nevents over time. We ask a relatively basic question: how might we compare\nthese forecasters, either online or post-hoc, while avoiding unverifiable\nassumptions on how the forecasts and outcomes were generated? In this paper, we\npresent a rigorous answer to this question by designing novel sequential\ninference procedures for estimating the time-varying difference in forecast\nscores. To do this, we employ confidence sequences (CS), which are sequences of\nconfidence intervals that can be continuously monitored and are valid at\narbitrary data-dependent stopping times (\"anytime-valid\"). The widths of our\nCSs are adaptive to the underlying variance of the score differences.\nUnderlying their construction is a game-theoretic statistical framework, in\nwhich we further identify e-processes and p-processes for sequentially testing\na weak null hypothesis -- whether one forecaster outperforms another on average\n(rather than always). Our methods do not make distributional assumptions on the\nforecasts or outcomes; our main theorems apply to any bounded scores, and we\nlater provide alternative methods for unbounded scores. We empirically validate\nour approaches by comparing real-world baseball and weather forecasters.\n","authors":["Yo Joong Choe","Aaditya Ramdas"],"pdf_url":"https://arxiv.org/pdf/2110.00115v5.pdf","comment":"Accepted to Operations Research. Code and data sources available at\n https://github.com/yjchoe/ComparingForecasters"},{"id":"http://arxiv.org/abs/2306.08754v3","updated":"2023-09-06T22:56:03Z","published":"2023-06-14T21:26:31Z","title":"ClimSim: An open large-scale dataset for training high-resolution\n physics emulators in hybrid multi-scale climate simulators","summary":" Modern climate projections lack adequate spatial and temporal resolution due\nto computational constraints. A consequence is inaccurate and imprecise\npredictions of critical processes such as storms. Hybrid methods that combine\nphysics with machine learning (ML) have introduced a new generation of higher\nfidelity climate simulators that can sidestep Moore's Law by outsourcing\ncompute-hungry, short, high-resolution simulations to ML emulators. However,\nthis hybrid ML-physics simulation approach requires domain-specific treatment\nand has been inaccessible to ML experts because of lack of training data and\nrelevant, easy-to-use workflows. We present ClimSim, the largest-ever dataset\ndesigned for hybrid ML-physics research. It comprises multi-scale climate\nsimulations, developed by a consortium of climate scientists and ML\nresearchers. 
It consists of 5.7 billion pairs of multivariate input and output\nvectors that isolate the influence of locally-nested, high-resolution,\nhigh-fidelity physics on a host climate simulator's macro-scale physical state.\n The dataset is global in coverage, spans multiple years at high sampling\nfrequency, and is designed such that resulting emulators are compatible with\ndownstream coupling into operational climate simulators. We implement a range\nof deterministic and stochastic regression baselines to highlight the ML\nchallenges and their scoring. The data\n(https://huggingface.co/datasets/LEAP/ClimSim_high-res,\nhttps://huggingface.co/datasets/LEAP/ClimSim_low-res, and\nhttps://huggingface.co/datasets/LEAP/ClimSim_low-res_aqua-planet) and code\n(https://leap-stc.github.io/ClimSim) are released openly to support the\ndevelopment of hybrid ML-physics and high-fidelity climate simulations for the\nbenefit of science and society.\n","authors":["Sungduk Yu","Walter M. Hannah","Liran Peng","Jerry Lin","Mohamed Aziz Bhouri","Ritwik Gupta","Björn Lütjens","Justus C. Will","Gunnar Behrens","Julius J. M. Busecke","Nora Loose","Charles Stern","Tom Beucler","Bryce E. Harrop","Benjamin R. Hilman","Andrea M. Jenney","Savannah L. Ferretti","Nana Liu","Anima Anandkumar","Noah D. Brenowitz","Veronika Eyring","Nicholas Geneva","Pierre Gentine","Stephan Mandt","Jaideep Pathak","Akshay Subramaniam","Carl Vondrick","Rose Yu","Laure Zanna","Tian Zheng","Ryan P. Abernathey","Fiaz Ahmed","David C. Bader","Pierre Baldi","Elizabeth A. Barnes","Christopher S. Bretherton","Peter M. Caldwell","Wayne Chuang","Yilun Han","Yu Huang","Fernando Iglesias-Suarez","Sanket Jantre","Karthik Kashinath","Marat Khairoutdinov","Thorsten Kurth","Nicholas J. Lutsko","Po-Lun Ma","Griffin Mooers","J. David Neelin","David A. Randall","Sara Shamekh","Mark A. Taylor","Nathan M. Urban","Janni Yuval","Guang J. Zhang","Michael S. Pritchard"],"pdf_url":"https://arxiv.org/pdf/2306.08754v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01311v3","updated":"2023-09-06T22:22:50Z","published":"2023-04-03T19:28:43Z","title":"Knowledge Graphs in Practice: Characterizing their Users, Challenges,\n and Visualization Opportunities","summary":" This study presents insights from interviews with nineteen Knowledge Graph\n(KG) practitioners who work in both enterprise and academic settings on a wide\nvariety of use cases. Through this study, we identify critical challenges\nexperienced by KG practitioners when creating, exploring, and analyzing KGs\nthat could be alleviated through visualization design. Our findings reveal\nthree major personas among KG practitioners - KG Builders, Analysts, and\nConsumers - each of whom have their own distinct expertise and needs. We\ndiscover that KG Builders would benefit from schema enforcers, while KG\nAnalysts need customizable query builders that provide interim query results.\nFor KG Consumers, we identify a lack of efficacy for node-link diagrams, and\nthe need for tailored domain-specific visualizations to promote KG adoption and\ncomprehension. Lastly, we find that implementing KGs effectively in practice\nrequires both technical and social solutions that are not addressed with\ncurrent tools, technologies, and collaborative workflows. 
From the analysis of\nour interviews, we distill several visualization research directions to improve\nKG usability, including knowledge cards that balance digestibility and\ndiscoverability, timeline views to track temporal changes, interfaces that\nsupport organic discovery, and semantic explanations for AI and machine\nlearning predictions.\n","authors":["Harry Li","Gabriel Appleby","Camelia Daniela Brumar","Remco Chang","Ashley Suh"],"pdf_url":"https://arxiv.org/pdf/2304.01311v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03386v1","updated":"2023-09-06T22:16:58Z","published":"2023-09-06T22:16:58Z","title":"Community-Based Hierarchical Positive-Unlabeled (PU) Model Fusion for\n Chronic Disease Prediction","summary":" Positive-Unlabeled (PU) Learning is a challenge presented by binary\nclassification problems where there is an abundance of unlabeled data along\nwith a small number of positive data instances, which can be used to address\nchronic disease screening problem. State-of-the-art PU learning methods have\nresulted in the development of various risk estimators, yet they neglect the\ndifferences among distinct populations. To address this issue, we present a\nnovel Positive-Unlabeled Learning Tree (PUtree) algorithm. PUtree is designed\nto take into account communities such as different age or income brackets, in\ntasks of chronic disease prediction. We propose a novel approach for binary\ndecision-making, which hierarchically builds community-based PU models and then\naggregates their deliverables. Our method can explicate each PU model on the\ntree for the optimized non-leaf PU node splitting. Furthermore, a mask-recovery\ndata augmentation strategy enables sufficient training of the model in\nindividual communities. Additionally, the proposed approach includes an\nadversarial PU risk estimator to capture hierarchical PU-relationships, and a\nmodel fusion network that integrates data from each tree path, resulting in\nrobust binary classification results. We demonstrate the superior performance\nof PUtree as well as its variants on two benchmarks and a new\ndiabetes-prediction dataset.\n","authors":["Yang Wu","Xurui Li","Xuhong Zhang","Yangyang Kang","Changlong Sun","Xiaozhong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03386v1.pdf","comment":"Accepted by CIKM 2023 as a long paper"},{"id":"http://arxiv.org/abs/2212.06566v2","updated":"2023-09-06T21:57:15Z","published":"2022-12-10T04:05:54Z","title":"How to select an objective function using information theory","summary":" In machine learning or scientific computing, model performance is measured\nwith an objective function. But why choose one objective over another?\nInformation theory gives one answer: To maximize the information in the model,\nselect the most likely objective function or whichever represents the error in\nthe fewest bits. To evaluate different objectives, transform them into\nlikelihood functions. As likelihoods, their relative magnitudes represent how\nmuch we should prefer one objective versus another, and the log of their\nmagnitude represents the expected uncertainty of the model.\n","authors":["Timothy O. Hodson","Thomas M. Over","Tyler J. Smith","Lucy M. 
Marshall"],"pdf_url":"https://arxiv.org/pdf/2212.06566v2.pdf","comment":"8 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2103.12591v5","updated":"2023-09-06T21:24:10Z","published":"2021-03-23T14:46:09Z","title":"BoXHED2.0: Scalable boosting of dynamic survival analysis","summary":" Modern applications of survival analysis increasingly involve time-dependent\ncovariates. The Python package BoXHED2.0 is a tree-boosted hazard estimator\nthat is fully nonparametric, and is applicable to survival settings far more\ngeneral than right-censoring, including recurring events and competing risks.\nBoXHED2.0 is also scalable to the point of being on the same order of speed as\nparametric boosted survival models, in part because its core is written in C++\nand it also supports the use of GPUs and multicore CPUs. BoXHED2.0 is available\nfrom PyPI and also from www.github.com/BoXHED.\n","authors":["Arash Pakbin","Xiaochen Wang","Bobak J. Mortazavi","Donald K. K. Lee"],"pdf_url":"https://arxiv.org/pdf/2103.12591v5.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2206.02231v3","updated":"2023-09-06T21:13:28Z","published":"2022-06-05T17:58:02Z","title":"Models of human preference for learning reward functions","summary":" The utility of reinforcement learning is limited by the alignment of reward\nfunctions with the interests of human stakeholders. One promising method for\nalignment is to learn the reward function from human-generated preferences\nbetween pairs of trajectory segments, a type of reinforcement learning from\nhuman feedback (RLHF). These human preferences are typically assumed to be\ninformed solely by partial return, the sum of rewards along each segment. We\nfind this assumption to be flawed and propose modeling human preferences\ninstead as informed by each segment's regret, a measure of a segment's\ndeviation from optimal decision-making. Given infinitely many preferences\ngenerated according to regret, we prove that we can identify a reward function\nequivalent to the reward function that generated those preferences, and we\nprove that the previous partial return model lacks this identifiability\nproperty in multiple contexts. We empirically show that our proposed regret\npreference model outperforms the partial return preference model with finite\ntraining data in otherwise the same setting. Additionally, we find that our\nproposed regret preference model better predicts real human preferences and\nalso learns reward functions from these preferences that lead to policies that\nare better human-aligned. Overall, this work establishes that the choice of\npreference model is impactful, and our proposed regret preference model\nprovides an improvement upon a core assumption of recent research. We have open\nsourced our experimental code, the human preferences dataset we gathered, and\nour training and preference elicitation interfaces for gathering a such a\ndataset.\n","authors":["W. 
Bradley Knox","Stephane Hatgis-Kessell","Serena Booth","Scott Niekum","Peter Stone","Alessandro Allievi"],"pdf_url":"https://arxiv.org/pdf/2206.02231v3.pdf","comment":"16 pages (40 pages with references and appendix), 23 figures"},{"id":"http://arxiv.org/abs/2309.03360v1","updated":"2023-09-06T21:04:53Z","published":"2023-09-06T21:04:53Z","title":"ViewMix: Augmentation for Robust Representation in Self-Supervised\n Learning","summary":" Joint Embedding Architecture-based self-supervised learning methods have\nattributed the composition of data augmentations as a crucial factor for their\nstrong representation learning capabilities. While regional dropout strategies\nhave proven to guide models to focus on lesser indicative parts of the objects\nin supervised methods, it hasn't been adopted by self-supervised methods for\ngenerating positive pairs. This is because the regional dropout methods are not\nsuitable for the input sampling process of the self-supervised methodology.\nWhereas dropping informative pixels from the positive pairs can result in\ninefficient training, replacing patches of a specific object with a different\none can steer the model from maximizing the agreement between different\npositive pairs. Moreover, joint embedding representation learning methods have\nnot made robustness their primary training outcome. To this end, we propose the\nViewMix augmentation policy, specially designed for self-supervised learning,\nupon generating different views of the same image, patches are cut and pasted\nfrom one view to another. By leveraging the different views created by this\naugmentation strategy, multiple joint embedding-based self-supervised\nmethodologies obtained better localization capability and consistently\noutperformed their corresponding baseline methods. It is also demonstrated that\nincorporating ViewMix augmentation policy promotes robustness of the\nrepresentations in the state-of-the-art methods. Furthermore, our\nexperimentation and analysis of compute times suggest that ViewMix augmentation\ndoesn't introduce any additional overhead compared to other counterparts.\n","authors":["Arjon Das","Xin Zhong"],"pdf_url":"https://arxiv.org/pdf/2309.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03354v1","updated":"2023-09-06T20:38:04Z","published":"2023-09-06T20:38:04Z","title":"Ensemble linear interpolators: The role of ensembling","summary":" Interpolators are unstable. For example, the mininum $\\ell_2$ norm least\nsquare interpolator exhibits unbounded test errors when dealing with noisy\ndata. In this paper, we study how ensemble stabilizes and thus improves the\ngeneralization performance, measured by the out-of-sample prediction risk, of\nan individual interpolator. We focus on bagged linear interpolators, as bagging\nis a popular randomization-based ensemble method that can be implemented in\nparallel. We introduce the multiplier-bootstrap-based bagged least square\nestimator, which can then be formulated as an average of the sketched least\nsquare estimators. The proposed multiplier bootstrap encompasses the classical\nbootstrap with replacement as a special case, along with a more intriguing\nvariant which we call the Bernoulli bootstrap.\n Focusing on the proportional regime where the sample size scales\nproportionally with the feature dimensionality, we investigate the\nout-of-sample prediction risks of the sketched and bagged least square\nestimators in both underparametrized and overparameterized regimes. 
Our results\nreveal the statistical roles of sketching and bagging. In particular, sketching\nmodifies the aspect ratio and shifts the interpolation threshold of the minimum\n$\\ell_2$ norm estimator. However, the risk of the sketched estimator continues\nto be unbounded around the interpolation threshold due to excessive variance.\nIn stark contrast, bagging effectively mitigates this variance, leading to a\nbounded limiting out-of-sample prediction risk. To further understand this\nstability improvement property, we establish that bagging acts as a form of\nimplicit regularization, substantiated by the equivalence of the bagged\nestimator with its explicitly regularized counterpart. We also discuss several\nextensions.\n","authors":["Mingqi Wu","Qiang Sun"],"pdf_url":"https://arxiv.org/pdf/2309.03354v1.pdf","comment":"30-page main text including figures and tables, 50-page appendix"},{"id":"http://arxiv.org/abs/2309.03353v1","updated":"2023-09-06T20:36:17Z","published":"2023-09-06T20:36:17Z","title":"Source Camera Identification and Detection in Digital Videos through\n Blind Forensics","summary":" Source camera identification in digital videos is the problem of associating\nan unknown digital video with its source device, within a closed set of\npossible devices. The existing techniques in source detection of digital videos\ntry to find a fingerprint of the actual source in the video in form of PRNU\n(Photo Response Non--Uniformity), and match it against the SPN (Sensor Pattern\nNoise) of each possible device. The highest correlation indicates the correct\nsource. We investigate the problem of identifying a video source through a\nfeature based approach using machine learning. In this paper, we present a\nblind forensic technique of video source authentication and identification,\nbased on feature extraction, feature selection and subsequent source\nclassification. The main aim is to determine whether a claimed source for a\nvideo is actually its original source. If not, we identify its original source.\nOur experimental results prove the efficiency of the proposed method compared\nto traditional fingerprint based technique.\n","authors":["Venkata Udaya Sameer","Shilpa Mukhopadhyay","Ruchira Naskar","Ishaan Dali"],"pdf_url":"https://arxiv.org/pdf/2309.03353v1.pdf","comment":"Submitted to IEEE for inclusion in Xplore- Digital Library. Paper\n presented at the International Conference on Recent Trends in Computational\n Engineering & Technologies (ICRTCET 18)with Paper Id: ICRTCET-227"},{"id":"http://arxiv.org/abs/2304.06104v2","updated":"2023-09-06T20:35:11Z","published":"2023-04-12T18:37:52Z","title":"Primal-Dual Contextual Bayesian Optimization for Control System Online\n Optimization with Time-Average Constraints","summary":" This paper studies the problem of online performance optimization of\nconstrained closed-loop control systems, where both the objective and the\nconstraints are unknown black-box functions affected by exogenous time-varying\ncontextual disturbances. A primal-dual contextual Bayesian optimization\nalgorithm is proposed that achieves sublinear cumulative regret with respect to\nthe dynamic optimal solution under certain regularity conditions. 
Furthermore,\nthe algorithm achieves zero time-average constraint violation, ensuring that\nthe average value of the constraint function satisfies the desired constraint.\nThe method is applied to both sampled instances from Gaussian processes and a\ncontinuous stirred tank reactor parameter tuning problem; simulation results\nshow that the method simultaneously provides close-to-optimal performance and\nmaintains constraint feasibility on average. This contrasts current\nstate-of-the-art methods, which either suffer from large cumulative regret or\nsevere constraint violations for the case studies presented.\n","authors":["Wenjie Xu","Yuning Jiang","Bratislav Svetozarevic","Colin N. Jones"],"pdf_url":"https://arxiv.org/pdf/2304.06104v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03351v1","updated":"2023-09-06T20:24:13Z","published":"2023-09-06T20:24:13Z","title":"Using Neural Networks for Fast SAR Roughness Estimation of High\n Resolution Images","summary":" The analysis of Synthetic Aperture Radar (SAR) imagery is an important step\nin remote sensing applications, and it is a challenging problem due to its\ninherent speckle noise. One typical solution is to model the data using the\n$G_I^0$ distribution and extract its roughness information, which in turn can\nbe used in posterior imaging tasks, such as segmentation, classification and\ninterpretation. This leads to the need of quick and reliable estimation of the\nroughness parameter from SAR data, especially with high resolution images.\nUnfortunately, traditional parameter estimation procedures are slow and prone\nto estimation failures. In this work, we proposed a neural network-based\nestimation framework that first learns how to predict underlying parameters of\n$G_I^0$ samples and then can be used to estimate the roughness of unseen data.\nWe show that this approach leads to an estimator that is quicker, yields less\nestimation error and is less prone to failures than the traditional estimation\nprocedures for this problem, even when we use a simple network. More\nimportantly, we show that this same methodology can be generalized to handle\nimage inputs and, even if trained on purely synthetic data for a few seconds,\nis able to perform real time pixel-wise roughness estimation for high\nresolution real SAR imagery.\n","authors":["Li Fan","Jeova Farias Sales Rocha Neto"],"pdf_url":"https://arxiv.org/pdf/2309.03351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12191v4","updated":"2023-09-06T19:58:27Z","published":"2022-01-28T15:45:13Z","title":"Kernelized Concept Erasure","summary":" The representation space of neural models for textual data emerges in an\nunsupervised manner during training. Understanding how those representations\nencode human-interpretable concepts is a fundamental problem. One prominent\napproach for the identification of concepts in neural representations is\nsearching for a linear subspace whose erasure prevents the prediction of the\nconcept from the representations. However, while many linear erasure algorithms\nare tractable and interpretable, neural networks do not necessarily represent\nconcepts in a linear manner. To identify non-linearly encoded concepts, we\npropose a kernelization of a linear minimax game for concept erasure. We\ndemonstrate that it is possible to prevent specific non-linear adversaries from\npredicting the concept. However, the protection does not transfer to different\nnonlinear adversaries. 
Therefore, exhaustively erasing a non-linearly encoded\nconcept remains an open problem.\n","authors":["Shauli Ravfogel","Francisco Vargas","Yoav Goldberg","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2201.12191v4.pdf","comment":"Accepted as a long paper in EMNLP22"},{"id":"http://arxiv.org/abs/2308.08643v2","updated":"2023-09-06T19:24:44Z","published":"2023-08-16T19:36:01Z","title":"Towards Personalized Federated Learning via Heterogeneous Model\n Reassembly","summary":" This paper focuses on addressing the practical yet challenging problem of\nmodel heterogeneity in federated learning, where clients possess models with\ndifferent network structures. To track this problem, we propose a novel\nframework called pFedHR, which leverages heterogeneous model reassembly to\nachieve personalized federated learning. In particular, we approach the problem\nof heterogeneous model personalization as a model-matching optimization task on\nthe server side. Moreover, pFedHR automatically and dynamically generates\ninformative and diverse personalized candidates with minimal human\nintervention. Furthermore, our proposed heterogeneous model reassembly\ntechnique mitigates the adverse impact introduced by using public data with\ndifferent distributions from the client data to a certain extent. Experimental\nresults demonstrate that pFedHR outperforms baselines on three datasets under\nboth IID and Non-IID settings. Additionally, pFedHR effectively reduces the\nadverse impact of using different public data and dynamically generates diverse\npersonalized models in an automated manner.\n","authors":["Jiaqi Wang","Xingyi Yang","Suhan Cui","Liwei Che","Lingjuan Lyu","Dongkuan Xu","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2308.08643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03322v1","updated":"2023-09-06T19:05:31Z","published":"2023-09-06T19:05:31Z","title":"REBOOT: Reuse Data for Bootstrapping Efficient Real-World Dexterous\n Manipulation","summary":" Dexterous manipulation tasks involving contact-rich interactions pose a\nsignificant challenge for both model-based control systems and imitation\nlearning algorithms. The complexity arises from the need for multi-fingered\nrobotic hands to dynamically establish and break contacts, balance\nnon-prehensile forces, and control large degrees of freedom. Reinforcement\nlearning (RL) offers a promising approach due to its general applicability and\ncapacity to autonomously acquire optimal manipulation strategies. However, its\nreal-world application is often hindered by the necessity to generate a large\nnumber of samples, reset the environment, and obtain reward signals. In this\nwork, we introduce an efficient system for learning dexterous manipulation\nskills with RL to alleviate these challenges. The main idea of our approach is\nthe integration of recent advances in sample-efficient RL and replay buffer\nbootstrapping. This combination allows us to utilize data from different tasks\nor objects as a starting point for training new tasks, significantly improving\nlearning efficiency. Additionally, our system completes the real-world training\ncycle by incorporating learned resets via an imitation-based pickup policy as\nwell as learned reward functions, eliminating the need for manual resets and\nreward engineering. We demonstrate the benefits of reusing past data as replay\nbuffer initialization for new tasks, for instance, the fast acquisition of\nintricate manipulation skills in the real world on a four-fingered robotic\nhand. 
(Videos: https://sites.google.com/view/reboot-dexterous)\n","authors":["Zheyuan Hu","Aaron Rovinsky","Jianlan Luo","Vikash Kumar","Abhishek Gupta","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2309.03322v1.pdf","comment":"Accepted at CORL 2023. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2209.04521v2","updated":"2023-09-06T19:01:25Z","published":"2022-09-09T20:53:11Z","title":"The Space of Adversarial Strategies","summary":" Adversarial examples, inputs designed to induce worst-case behavior in\nmachine learning models, have been extensively studied over the past decade.\nYet, our understanding of this phenomenon stems from a rather fragmented pool\nof knowledge; at present, there are a handful of attacks, each with disparate\nassumptions in threat models and incomparable definitions of optimality. In\nthis paper, we propose a systematic approach to characterize worst-case (i.e.,\noptimal) adversaries. We first introduce an extensible decomposition of attacks\nin adversarial machine learning by atomizing attack components into surfaces\nand travelers. With our decomposition, we enumerate over components to create\n576 attacks (568 of which were previously unexplored). Next, we propose the\nPareto Ensemble Attack (PEA): a theoretical attack that upper-bounds attack\nperformance. With our new attacks, we measure performance relative to the PEA\non: both robust and non-robust models, seven datasets, and three extended\nlp-based threat models incorporating compute costs, formalizing the Space of\nAdversarial Strategies. From our evaluation we find that attack performance to\nbe highly contextual: the domain, model robustness, and threat model can have a\nprofound influence on attack efficacy. Our investigation suggests that future\nstudies measuring the security of machine learning should: (1) be\ncontextualized to the domain & threat models, and (2) go beyond the handful of\nknown attacks used today.\n","authors":["Ryan Sheatsley","Blaine Hoak","Eric Pauley","Patrick McDaniel"],"pdf_url":"https://arxiv.org/pdf/2209.04521v2.pdf","comment":"Accepted to the 32nd USENIX Security Symposium"},{"id":"http://arxiv.org/abs/2309.03318v1","updated":"2023-09-06T18:58:21Z","published":"2023-09-06T18:58:21Z","title":"Fitness Approximation through Machine Learning","summary":" We present a novel approach to performing fitness approximation in genetic\nalgorithms (GAs) using machine-learning (ML) models, focusing on evolutionary\nagents in Gymnasium (game) simulators -- where fitness computation is costly.\nMaintaining a dataset of sampled individuals along with their actual fitness\nscores, we continually update throughout an evolutionary run a\nfitness-approximation ML model. We compare different methods for: 1) switching\nbetween actual and approximate fitness, 2) sampling the population, and 3)\nweighting the samples. Experimental findings demonstrate significant\nimprovement in evolutionary runtimes, with fitness scores that are either\nidentical or slightly lower than that of the fully run GA -- depending on the\nratio of approximate-to-actual-fitness computation. Our approach is generic and\ncan be easily applied to many different domains.\n","authors":["Itai Tzruia","Tomer Halperin","Moshe Sipper","Achiya Elyasaf"],"pdf_url":"https://arxiv.org/pdf/2309.03318v1.pdf","comment":"9 pages, 5 tables, 2 figures. 
Submitted to IEEE Transactions on\n Emerging Topics in Computational Intelligence"},{"id":"http://arxiv.org/abs/2309.03315v1","updated":"2023-09-06T18:56:20Z","published":"2023-09-06T18:56:20Z","title":"Robotic Table Tennis: A Case Study into a High Speed Learning System","summary":" We present a deep-dive into a real-world robotic learning system that, in\nprevious work, was shown to be capable of hundreds of table tennis rallies with\na human and has the ability to precisely return the ball to desired targets.\nThis system puts together a highly optimized perception subsystem, a high-speed\nlow-latency robot controller, a simulation paradigm that can prevent damage in\nthe real world and also train policies for zero-shot transfer, and automated\nreal world environment resets that enable autonomous training and evaluation on\nphysical robots. We complement a complete system description, including\nnumerous design decisions that are typically not widely disseminated, with a\ncollection of studies that clarify the importance of mitigating various sources\nof latency, accounting for training and deployment distribution shifts,\nrobustness of the perception system, sensitivity to policy hyper-parameters,\nand choice of action space. A video demonstrating the components of the system\nand details of experimental results can be found at\nhttps://youtu.be/uFcnWjB42I0.\n","authors":["David B. D'Ambrosio","Jonathan Abelian","Saminda Abeyruwan","Michael Ahn","Alex Bewley","Justin Boyd","Krzysztof Choromanski","Omar Cortes","Erwin Coumans","Tianli Ding","Wenbo Gao","Laura Graesser","Atil Iscen","Navdeep Jaitly","Deepali Jain","Juhana Kangaspunta","Satoshi Kataoka","Gus Kouretas","Yuheng Kuang","Nevena Lazic","Corey Lynch","Reza Mahjourian","Sherry Q. Moore","Thinh Nguyen","Ken Oslund","Barney J Reed","Krista Reymann","Pannag R. Sanketi","Anish Shankar","Pierre Sermanet","Vikas Sindhwani","Avi Singh","Vincent Vanhoucke","Grace Vesom","Peng Xu"],"pdf_url":"https://arxiv.org/pdf/2309.03315v1.pdf","comment":"Published and presented at Robotics: Science and Systems (RSS2023)"}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.03472v2","updated":"2023-09-06T16:14:44Z","published":"2023-05-05T12:29:22Z","title":"Generative Steganography Diffusion","summary":" Generative steganography (GS) is an emerging technique that generates stego\nimages directly from secret data. Various GS methods based on GANs or Flow have\nbeen developed recently. However, existing GAN-based GS methods cannot\ncompletely recover the hidden secret data due to the lack of network\ninvertibility, while Flow-based methods produce poor image quality due to the\nstringent reversibility restriction in each module. To address this issue, we\npropose a novel GS scheme called \"Generative Steganography Diffusion\" (GSD) by\ndevising an invertible diffusion model named \"StegoDiffusion\". It not only\ngenerates realistic stego images but also allows for 100\\% recovery of the\nhidden secret data. The proposed StegoDiffusion model leverages a non-Markov\nchain with a fast sampling technique to achieve efficient stego image\ngeneration. By constructing an ordinary differential equation (ODE) based on\nthe transition probability of the generation process in StegoDiffusion, secret\ndata and stego images can be converted to each other through the approximate\nsolver of ODE -- Euler iteration formula, enabling the use of irreversible but\nmore expressive network structures to achieve model invertibility. 
Our proposed\nGSD has the advantages of both reversibility and high performance,\nsignificantly outperforming existing GS methods in all metrics.\n","authors":["Ping Wei","Qing Zhou","Zichi Wang","Zhenxing Qian","Xinpeng Zhang","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2305.03472v2.pdf","comment":"Shall not be reproduced without permission, rights reserved!"},{"id":"http://arxiv.org/abs/2309.03100v1","updated":"2023-09-06T15:40:33Z","published":"2023-09-06T15:40:33Z","title":"FArMARe: a Furniture-Aware Multi-task methodology for Recommending\n Apartments based on the user interests","summary":" Nowadays, many people frequently have to search for new accommodation\noptions. Searching for a suitable apartment is a time-consuming process,\nespecially because visiting them is often mandatory to assess the truthfulness\nof the advertisements found on the Web. While this process could be alleviated\nby visiting the apartments in the metaverse, the Web-based recommendation\nplatforms are not suitable for the task. To address this shortcoming, in this\npaper, we define a new problem called text-to-apartment recommendation, which\nrequires ranking the apartments based on their relevance to a textual query\nexpressing the user's interests. To tackle this problem, we introduce FArMARe,\na multi-task approach that supports cross-modal contrastive training with a\nfurniture-aware objective. Since public datasets related to indoor scenes do\nnot contain detailed descriptions of the furniture, we collect and annotate a\ndataset comprising more than 6000 apartments. A thorough experimentation with\nthree different methods and two raw feature extraction procedures reveals the\neffectiveness of FArMARe in dealing with the problem at hand.\n","authors":["Ali Abdari","Alex Falcon","Giuseppe Serra"],"pdf_url":"https://arxiv.org/pdf/2309.03100v1.pdf","comment":"accepted for presentation at the ICCV2023 CV4Metaverse workshop"},{"id":"http://arxiv.org/abs/2309.03071v1","updated":"2023-09-06T15:18:35Z","published":"2023-09-06T15:18:35Z","title":"Disarming Steganography Attacks Inside Neural Network Models","summary":" Similar to the revolution of open source code sharing, Artificial\nIntelligence (AI) model sharing is gaining increased popularity. However, the\nfast adaptation in the industry, lack of awareness, and ability to exploit the\nmodels make them significant attack vectors. By embedding malware in neurons,\nthe malware can be delivered covertly, with minor or no impact on the neural\nnetwork's performance. The covert attack will use the Least Significant Bits\n(LSB) weight attack since LSB has a minimal effect on the model accuracy, and\nas a result, the user will not notice it. Since there are endless ways to hide\nthe attacks, we focus on a zero-trust prevention strategy based on AI model\nattack disarm and reconstruction. We proposed three types of model\nsteganography weight disarm defense mechanisms. The first two are based on\nrandom bit substitution noise, and the other on model weight quantization. 
We\ndemonstrate a 100\% prevention rate while the methods introduce a minimal\ndecrease in model accuracy based on Qint8 and K-LRBP methods, which is an\nessential factor for improving AI security.\n","authors":["Ran Dubin"],"pdf_url":"https://arxiv.org/pdf/2309.03071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.14806v2","updated":"2023-09-06T15:13:07Z","published":"2022-03-28T14:44:52Z","title":"Extraction of Visual Information to Predict Crowdfunding Success","summary":" Researchers have increasingly turned to crowdfunding platforms to gain\ninsights into entrepreneurial activity and dynamics. While previous studies\nhave explored various factors influencing crowdfunding success, such as\ntechnology, communication, and marketing strategies, the role of visual\nelements that can be automatically extracted from images has received less\nattention. This is surprising, considering that crowdfunding platforms\nemphasize the importance of attention-grabbing and high-resolution images, and\nprevious research has shown that image characteristics can significantly impact\nproduct evaluations. Indeed, we conducted a comprehensive review of empirical articles (n =\n202) that utilized Kickstarter data, focusing on the incorporation of visual\ninformation in their analyses. Our findings reveal that only 29.70% controlled\nfor the number of images, and less than 12% considered any image details. In\nthis manuscript, we review the literature on image processing and its relevance\nto the business domain, highlighting two types of visual variables: visual\ncounts (number of pictures and number of videos) and image details. Building\nupon previous work that discussed the role of color, composition and\nfigure-ground relationships, we introduce visual scene elements that have not\nyet been explored in crowdfunding, including the number of faces, the number of\nconcepts depicted, and the ease of identifying those concepts. To demonstrate\nthe predictive value of visual counts and image details, we analyze Kickstarter\ndata. Our results highlight that visual count features are two of the top three\npredictors of success. Our results also show that simple image detail features\nsuch as color matter a lot, and our proposed measures of visual scene elements\ncan also be useful. We supplement our article with R and Python codes that help\nauthors extract image details (https://osf.io/ujnzp/).\n","authors":["S. J. Blanchard","T. J. Noseworthy","E. Pancer","M. Poole"],"pdf_url":"https://arxiv.org/pdf/2203.14806v2.pdf","comment":"32 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.17125v2","updated":"2023-09-06T07:30:33Z","published":"2023-06-29T17:31:33Z","title":"Ducho: A Unified Framework for the Extraction of Multimodal Features in\n Recommendation","summary":" In multimodal-aware recommendation, the extraction of meaningful multimodal\nfeatures is at the basis of high-quality recommendations. Generally, each\nrecommendation framework implements its multimodal extraction procedures with\nspecific strategies and tools. This is limiting for two reasons: (i) different\nextraction strategies do not ease the interdependence among multimodal\nrecommendation frameworks; thus, they cannot be efficiently and fairly\ncompared; (ii) given the large plethora of pre-trained deep learning models\nmade available by different open source tools, model designers do not have\naccess to shared interfaces to extract features. 
Motivated by the outlined\naspects, we propose Ducho, a unified framework for the extraction of\nmultimodal features in recommendation. By integrating three widely-adopted deep\nlearning libraries as backends, namely, TensorFlow, PyTorch, and Transformers,\nwe provide a shared interface to extract and process features where each\nbackend's specific methods are abstracted to the end user. Notably, the\nextraction pipeline is easily configurable with a YAML-based file where the\nuser can specify, for each modality, the list of models (and their specific\nbackends/parameters) to perform the extraction. Finally, to make Ducho\naccessible to the community, we build a public Docker image equipped with a\nready-to-use CUDA environment and propose three demos to test its\nfunctionalities for different scenarios and tasks. The GitHub repository and\nthe documentation are accessible at this link:\nhttps://github.com/sisinflab/Ducho.\n","authors":["Daniele Malitesta","Giuseppe Gassi","Claudio Pomo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2306.17125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00462v2","updated":"2023-09-06T04:10:25Z","published":"2023-08-01T11:38:50Z","title":"Context-Aware Talking-Head Video Editing","summary":" Talking-head video editing aims to efficiently insert, delete, and substitute\nthe word of a pre-recorded video through a text transcript editor. The key\nchallenge for this task is obtaining an editing model that generates new\ntalking-head video clips which simultaneously have accurate lip synchronization\nand motion smoothness. Previous approaches, including 3DMM-based (3D Morphable\nModel) methods and NeRF-based (Neural Radiance Field) methods, are sub-optimal\nin that they either require minutes of source videos and days of training time\nor lack the disentangled control of verbal (e.g., lip motion) and non-verbal\n(e.g., head pose and expression) representations for video clip insertion. In\nthis work, we fully utilize the video context to design a novel framework for\ntalking-head video editing, which achieves efficiency, disentangled motion\ncontrol, and sequential smoothness. Specifically, we decompose this framework\nto motion prediction and motion-conditioned rendering: (1) We first design an\nanimation prediction module that efficiently obtains smooth and lip-sync motion\nsequences conditioned on the driven speech. This module adopts a\nnon-autoregressive network to obtain context prior and improve the prediction\nefficiency, and it learns a speech-animation mapping prior with better\ngeneralization to novel speech from a multi-identity video dataset. (2) We then\nintroduce a neural rendering module to synthesize the photo-realistic and\nfull-head video frames given the predicted motion sequence. This module adopts\na pre-trained head topology and uses only a few frames for efficient fine-tuning\nto obtain a person-specific rendering model. 
Extensive experiments demonstrate\nthat our method efficiently achieves smoother editing results with higher image\nquality and lip accuracy using less data than previous methods.\n","authors":["Songlin Yang","Wei Wang","Jun Ling","Bo Peng","Xu Tan","Jing Dong"],"pdf_url":"https://arxiv.org/pdf/2308.00462v2.pdf","comment":"needs some improvements"},{"id":"http://arxiv.org/abs/2208.05318v2","updated":"2023-09-06T02:29:01Z","published":"2022-08-10T12:55:56Z","title":"Generative Action Description Prompts for Skeleton-based Action\n Recognition","summary":" Skeleton-based action recognition has recently received considerable\nattention. Current approaches to skeleton-based action recognition are\ntypically formulated as one-hot classification tasks and do not fully exploit\nthe semantic relations between actions. For example, \"make victory sign\" and\n\"thumb up\" are two actions of hand gestures, whose major difference lies in the\nmovement of hands. This information is agnostic from the categorical one-hot\nencoding of action classes but could be unveiled from the action description.\nTherefore, utilizing action description in training could potentially benefit\nrepresentation learning. In this work, we propose a Generative\nAction-description Prompts (GAP) approach for skeleton-based action\nrecognition. More specifically, we employ a pre-trained large-scale language\nmodel as the knowledge engine to automatically generate text descriptions for\nbody parts movements of actions, and propose a multi-modal training scheme by\nutilizing the text encoder to generate feature vectors for different body parts\nand supervise the skeleton encoder for action representation learning.\nExperiments show that our proposed GAP method achieves noticeable improvements\nover various baseline models without extra computation cost at inference. GAP\nachieves new state-of-the-arts on popular skeleton-based action recognition\nbenchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is\navailable at https://github.com/MartinXM/GAP.\n","authors":["Wangmeng Xiang","Chao Li","Yuxuan Zhou","Biao Wang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.05318v2.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2309.03340v1","updated":"2023-09-06T19:42:52Z","published":"2023-09-06T19:42:52Z","title":"Parameter Efficient Audio Captioning With Faithful Guidance Using\n Audio-text Shared Latent Representation","summary":" There has been significant research on developing pretrained transformer\narchitectures for multimodal-to-text generation tasks. Albeit performance\nimprovements, such models are frequently overparameterized, hence suffer from\nhallucination and large memory footprint making them challenging to deploy on\nedge devices. In this paper, we address both these issues for the application\nof automated audio captioning. First, we propose a data augmentation technique\nfor generating hallucinated audio captions and show that similarity based on an\naudio-text shared latent space is suitable for detecting hallucination. Then,\nwe propose a parameter efficient inference time faithful decoding algorithm\nthat enables smaller audio captioning models with performance equivalent to\nlarger models trained with more data. During the beam decoding step, the\nsmaller model utilizes an audio-text shared latent representation to\nsemantically align the generated text with corresponding input audio. 
Faithful\nguidance is introduced into the beam probability by incorporating the cosine\nsimilarity between latent representation projections of greedy rolled out\nintermediate beams and audio clip. We show the efficacy of our algorithm on\nbenchmark datasets and evaluate the proposed scheme against baselines using\nconventional audio captioning and semantic similarity metrics while\nillustrating tradeoffs between performance and complexity.\n","authors":["Arvind Krishna Sridhar","Yinyi Guo","Erik Visser","Rehana Mahfuz"],"pdf_url":"https://arxiv.org/pdf/2309.03340v1.pdf","comment":"5 pages, 5 tables, 1 figure"},{"id":"http://arxiv.org/abs/2309.03326v1","updated":"2023-09-06T19:17:46Z","published":"2023-09-06T19:17:46Z","title":"Detecting False Alarms and Misses in Audio Captions","summary":" Metrics to evaluate audio captions simply provide a score without much\nexplanation regarding what may be wrong in case the score is low. Manual human\nintervention is needed to find any shortcomings of the caption. In this work,\nwe introduce a metric which automatically identifies the shortcomings of an\naudio caption by detecting the misses and false alarms in a candidate caption\nwith respect to a reference caption, and reports the recall, precision and\nF-score. Such a metric is very useful in profiling the deficiencies of an audio\ncaptioning model, which is a milestone towards improving the quality of audio\ncaptions.\n","authors":["Rehana Mahfuz","Yinyi Guo","Arvind Krishna Sridhar","Erik Visser"],"pdf_url":"https://arxiv.org/pdf/2309.03326v1.pdf","comment":null}]},"2023-09-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.03905v1","updated":"2023-09-07T17:59:45Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. 
Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v1.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2308.16898v2","updated":"2023-09-07T17:50:52Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. (3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v2.pdf","comment":"minor edits and update global convergence figure"},{"id":"http://arxiv.org/abs/2309.03886v1","updated":"2023-09-07T17:47:26Z","published":"2023-09-07T17:47:26Z","title":"A Function Interpretation Benchmark for Evaluating Interpretability\n Methods","summary":" Labeling neural network submodules with human-legible descriptions is useful\nfor many downstream tasks: such descriptions can surface failures, guide\ninterventions, and perhaps even explain important model behaviors. To date,\nmost mechanistic descriptions of trained networks have involved small models,\nnarrowly delimited phenomena, and large amounts of human labor. Labeling all\nhuman-interpretable sub-computations in models of increasing size and\ncomplexity will almost certainly require tools that can generate and validate\ndescriptions automatically. 
Recently, techniques that use learned models\nin-the-loop for labeling have begun to gain traction, but methods for\nevaluating their efficacy are limited and ad-hoc. How should we validate and\ncompare open-ended labeling tools? This paper introduces FIND (Function\nINterpretation and Description), a benchmark suite for evaluating the building\nblocks of automated interpretability methods. FIND contains functions that\nresemble components of trained neural networks, and accompanying descriptions\nof the kind we seek to generate. The functions are procedurally constructed\nacross textual and numeric domains, and involve a range of real-world\ncomplexities, including noise, composition, approximation, and bias. We\nevaluate new and existing methods that use language models (LMs) to produce\ncode-based and language descriptions of function behavior. We find that an\noff-the-shelf LM augmented with only black-box access to functions can\nsometimes infer their structure, acting as a scientist by forming hypotheses,\nproposing experiments, and updating descriptions in light of new data. However,\nLM-based descriptions tend to capture global function behavior and miss local\ncorruptions. These results show that FIND will be useful for characterizing the\nperformance of more sophisticated interpretability methods before they are\napplied to real-world models.\n","authors":["Sarah Schwettmann","Tamar Rott Shaham","Joanna Materzynska","Neil Chowdhury","Shuang Li","Jacob Andreas","David Bau","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2309.03886v1.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.03884v1","updated":"2023-09-07T17:45:58Z","published":"2023-09-07T17:45:58Z","title":"Zero-Shot Audio Captioning via Audibility Guidance","summary":" The task of audio captioning is similar in essence to tasks such as image and\nvideo captioning. However, it has received much less attention. We propose\nthree desiderata for captioning audio -- (i) fluency of the generated text,\n(ii) faithfulness of the generated text to the input audio, and the somewhat\nrelated (iii) audibility, which is the quality of being able to be perceived\nbased only on audio. Our method is a zero-shot method, i.e., we do not learn to\nperform captioning. Instead, captioning occurs as an inference process that\ninvolves three networks that correspond to the three desired qualities: (i) A\nLarge Language Model, in our case, for reasons of convenience, GPT-2, (ii) A\nmodel that provides a matching score between an audio file and a text, for\nwhich we use a multimodal matching network called ImageBind, and (iii) A text\nclassifier, trained using a dataset we collected automatically by instructing\nGPT-4 with prompts designed to direct the generation of both audible and\ninaudible sentences. We present our results on the AudioCap dataset,\ndemonstrating that audibility guidance significantly enhances performance\ncompared to the baseline, which lacks this objective.\n","authors":["Tal Shaharabany","Ariel Shaulov","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2309.03884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03883v1","updated":"2023-09-07T17:45:31Z","published":"2023-09-07T17:45:31Z","title":"DoLa: Decoding by Contrasting Layers Improves Factuality in Large\n Language Models","summary":" Despite their impressive capabilities, large language models (LLMs) are prone\nto hallucinations, i.e., generating content that deviates from facts seen\nduring pretraining. 
We propose a simple decoding strategy for reducing\nhallucinations with pretrained LLMs that does not require conditioning on\nretrieved external knowledge nor additional fine-tuning. Our approach obtains\nthe next-token distribution by contrasting the differences in logits obtained\nfrom projecting the later layers versus earlier layers to the vocabulary space,\nexploiting the fact that factual knowledge in an LLM has generally been shown\nto be localized to particular transformer layers. We find that this Decoding by\nContrasting Layers (DoLa) approach is able to better surface factual knowledge\nand reduce the generation of incorrect facts. DoLa consistently improves\ntruthfulness across multiple-choice tasks and open-ended generation tasks, for\nexample improving the performance of LLaMA family models on TruthfulQA by\n12-17% absolute points, demonstrating its potential in making LLMs reliably\ngenerate truthful facts.\n","authors":["Yung-Sung Chuang","Yujia Xie","Hongyin Luo","Yoon Kim","James Glass","Pengcheng He"],"pdf_url":"https://arxiv.org/pdf/2309.03883v1.pdf","comment":"The source code is available at https://github.com/voidism/DoLa"},{"id":"http://arxiv.org/abs/2309.03882v1","updated":"2023-09-07T17:44:56Z","published":"2023-09-07T17:44:56Z","title":"On Large Language Models' Selection Bias in Multi-Choice Questions","summary":" Multi-choice questions (MCQs) serve as a common yet important task format in\nthe research of large language models (LLMs). Our work shows that LLMs exhibit\nan inherent \"selection bias\" in MCQs, which refers to LLMs' preferences to\nselect options located at specific positions (like \"Option C\"). This bias is\nprevalent across various LLMs, making their performance vulnerable to option\nposition changes in MCQs. We identify that one primary cause resulting in\nselection bias is option numbering, i.e., the ID symbols A/B/C/D associated\nwith the options. To mitigate selection bias, we propose a new method called\nPriDe. PriDe first decomposes the observed model prediction distribution into\nan intrinsic prediction over option contents and a prior distribution over\noption IDs. It then estimates the prior by permuting option contents on a\nsmall number of test samples, which is used to debias the subsequent test\nsamples. We demonstrate that, as a label-free, inference-time method, PriDe\nachieves a more effective and computation-efficient debiasing than strong\nbaselines. We further show that the priors estimated by PriDe generalize well\nacross different domains, highlighting its practical potential in broader\nscenarios.\n","authors":["Chujie Zheng","Hao Zhou","Fandong Meng","Jie Zhou","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2309.03882v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.03877v1","updated":"2023-09-07T17:41:41Z","published":"2023-09-07T17:41:41Z","title":"Introducing \"Forecast Utterance\" for Conversational Data Science","summary":" Envision an intelligent agent capable of assisting users in conducting\nforecasting tasks through intuitive, natural conversations, without requiring\nin-depth knowledge of the underlying machine learning (ML) processes. A\nsignificant challenge for the agent in this endeavor is to accurately\ncomprehend the user's prediction goals and, consequently, formulate precise ML\ntasks. 
In this paper, we take a pioneering step towards this ambitious goal by\nintroducing a new concept called Forecast Utterance and then focus on the\nautomatic and accurate interpretation of users' prediction goals from these\nutterances. Specifically, we frame the task as a slot-filling problem, where\neach slot corresponds to a specific aspect of the goal prediction task. We then\nemploy two zero-shot methods for solving the slot-filling task, namely: 1)\nEntity Extraction (EE), and 2) Question-Answering (QA) techniques. Our\nexperiments, conducted with three meticulously crafted data sets, validate the\nviability of our ambitious goal and demonstrate the effectiveness of both EE\nand QA techniques in interpreting Forecast Utterances.\n","authors":["Md Mahadi Hassan","Alex Knipper","Shubhra Kanti Karmaker"],"pdf_url":"https://arxiv.org/pdf/2309.03877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03876v1","updated":"2023-09-07T17:41:01Z","published":"2023-09-07T17:41:01Z","title":"OpinionGPT: Modelling Explicit Biases in Instruction-Tuned LLMs","summary":" Instruction-tuned Large Language Models (LLMs) have recently showcased\nremarkable ability to generate fitting responses to natural language\ninstructions. However, an open research question concerns the inherent biases\nof trained models and their responses. For instance, if the data used to tune\nan LLM is dominantly written by persons with a specific political bias, we\nmight expect generated answers to share this bias. Current research work seeks\nto de-bias such models, or suppress potentially biased answers. With this\ndemonstration, we take a different view on biases in instruction-tuning: Rather\nthan aiming to suppress them, we aim to make them explicit and transparent. To\nthis end, we present OpinionGPT, a web demo in which users can ask questions\nand select all biases they wish to investigate. The demo will answer this\nquestion using a model fine-tuned on text representing each of the selected\nbiases, allowing side-by-side comparison. To train the underlying model, we\nidentified 11 different biases (political, geographic, gender, age) and derived\nan instruction-tuning corpus in which each answer was written by members of one\nof these demographics. This paper presents OpinionGPT, illustrates how we\ntrained the bias-aware model and showcases the web application (available at\nhttps://opiniongpt.informatik.hu-berlin.de).\n","authors":["Patrick Haller","Ansar Aynetdinov","Alan Akbik"],"pdf_url":"https://arxiv.org/pdf/2309.03876v1.pdf","comment":"6 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2306.13596v3","updated":"2023-09-07T17:24:08Z","published":"2023-06-23T16:35:46Z","title":"Max-Margin Token Selection in Attention Mechanism","summary":" Attention mechanism is a central component of the transformer architecture\nwhich led to the phenomenal success of large language models. However, the\ntheoretical principles underlying the attention mechanism are poorly\nunderstood, especially its nonconvex optimization dynamics. In this work, we\nexplore the seminal softmax-attention model $f(\\boldsymbol{X})=\\langle\n\\boldsymbol{Xv}, \\texttt{softmax}(\\boldsymbol{XWp})\\rangle$, where\n$\\boldsymbol{X}$ is the token sequence and\n$(\\boldsymbol{v},\\boldsymbol{W},\\boldsymbol{p})$ are trainable parameters. 
We\nprove that running gradient descent on $\boldsymbol{p}$, or equivalently\n$\boldsymbol{W}$, converges in direction to a max-margin solution that\nseparates $\textit{locally-optimal}$ tokens from non-optimal ones. This clearly\nformalizes attention as an optimal token selection mechanism. Remarkably, our\nresults are applicable to general data and precisely characterize\n$\textit{optimality}$ of tokens in terms of the value embeddings\n$\boldsymbol{Xv}$ and problem geometry. We also provide a broader\nregularization path analysis that establishes the margin maximizing nature of\nattention even for nonlinear prediction heads. When optimizing $\boldsymbol{v}$\nand $\boldsymbol{p}$ simultaneously with logistic loss, we identify conditions\nunder which the regularization paths directionally converge to their respective\nhard-margin SVM solutions where $\boldsymbol{v}$ separates the input features\nbased on their labels. Interestingly, the SVM formulation of $\boldsymbol{p}$\nis influenced by the support vector geometry of $\boldsymbol{v}$. Finally, we\nverify our theoretical findings via numerical experiments and provide insights.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Xuechen Zhang","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2306.13596v3.pdf","comment":"minor edits and update convergence analysis figure"},{"id":"http://arxiv.org/abs/2309.03852v1","updated":"2023-09-07T17:07:36Z","published":"2023-09-07T17:07:36Z","title":"FLM-101B: An Open LLM and How to Train It with $100K Budget","summary":" Large language models (LLMs) have achieved remarkable success in NLP and\nmultimodal tasks. Despite these successes, their development faces two main\nchallenges: (i) high computational cost; and (ii) difficulty in conducting fair\nand objective evaluations. LLMs are prohibitively expensive, making it feasible\nfor only a few major players to undertake their training, thereby constraining\nboth research and application opportunities. This underscores the importance of\ncost-effective LLM training. In this paper, we utilize a growth strategy to\nsignificantly reduce LLM training cost. We demonstrate that an LLM with 101B\nparameters and 0.31TB tokens can be trained on a $100K budget. We also adopt a\nsystematic evaluation paradigm for the IQ evaluation of LLMs, in complement to\nexisting evaluations that focus more on knowledge-oriented abilities. We\nintroduce our benchmark including evaluations on important aspects of\nintelligence including symbolic mapping, rule understanding, pattern mining,\nand anti-interference. Such evaluations minimize the potential impact of\nmemorization. Experimental results show that our model FLM-101B, trained with a\nbudget of $100K, achieves comparable performance to powerful and well-known\nmodels, e.g., GPT-3 and GLM-130B, especially in the IQ benchmark evaluations with\ncontexts unseen in training data. 
The checkpoint of FLM-101B will be\nopen-sourced at https://huggingface.co/CofeAI/FLM-101B.\n","authors":["Xiang Li","Yiqun Yao","Xin Jiang","Xuezhi Fang","Xuying Meng","Siqi Fan","Peng Han","Jing Li","Li Du","Bowen Qin","Zheng Zhang","Aixin Sun","Yequan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03831v1","updated":"2023-09-07T16:45:42Z","published":"2023-09-07T16:45:42Z","title":"Uncovering Drift in Textual Data: An Unsupervised Method for Detecting\n and Mitigating Drift in Machine Learning Models","summary":" Drift in machine learning refers to the phenomenon where the statistical\nproperties of data or context, in which the model operates, change over time\nleading to a decrease in its performance. Therefore, maintaining a constant\nmonitoring process for machine learning model performance is crucial in order\nto proactively prevent any potential performance regression. However,\nsupervised drift detection methods require human annotation and consequently\nlead to a longer time to detect and mitigate the drift. In our proposed\nunsupervised drift detection method, we follow a two step process. Our first\nstep involves encoding a sample of production data as the target distribution,\nand the model training data as the reference distribution. In the second step,\nwe employ a kernel-based statistical test that utilizes the maximum mean\ndiscrepancy (MMD) distance metric to compare the reference and target\ndistributions and estimate any potential drift. Our method also identifies the\nsubset of production data that is the root cause of the drift. The models\nretrained using these identified high drift samples show improved performance\non online customer experience quality metrics.\n","authors":["Saeed Khaki","Akhouri Abhinav Aditya","Zohar Karnin","Lan Ma","Olivia Pan","Samarth Marudheri Chandrashekar"],"pdf_url":"https://arxiv.org/pdf/2309.03831v1.pdf","comment":"8 pages, Accepted in 2023 Amazon Internal Machine Learning Conference"},{"id":"http://arxiv.org/abs/2309.03787v1","updated":"2023-09-07T15:35:00Z","published":"2023-09-07T15:35:00Z","title":"USA: Universal Sentiment Analysis Model & Construction of Japanese\n Sentiment Text Classification and Part of Speech Dataset","summary":" Sentiment analysis is a pivotal task in the domain of natural language\nprocessing. It encompasses both text-level sentiment polarity classification\nand word-level Part of Speech(POS) sentiment polarity determination. Such\nanalysis challenges models to understand text holistically while also\nextracting nuanced information. With the rise of Large Language Models(LLMs),\nnew avenues for sentiment analysis have opened. This paper proposes enhancing\nperformance by leveraging the Mutual Reinforcement Effect(MRE) between\nindividual words and the overall text. It delves into how word polarity\ninfluences the overarching sentiment of a passage. To support our research, we\nannotated four novel Sentiment Text Classification and Part of Speech(SCPOS)\ndatasets, building upon existing sentiment classification datasets.\nFurthermore, we developed a Universal Sentiment Analysis(USA) model, with a\n7-billion parameter size. Experimental results revealed that our model\nsurpassed the performance of gpt-3.5-turbo across all four datasets,\nunderscoring the significance of MRE in sentiment analysis.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2309.03787v1.pdf","comment":"10 pages, 5 figures. 
Model and Dataset will release soon"},{"id":"http://arxiv.org/abs/2308.16137v3","updated":"2023-09-07T15:04:04Z","published":"2023-08-30T16:47:51Z","title":"LM-Infinite: Simple On-the-Fly Length Generalization for Large Language\n Models","summary":" In recent years, there have been remarkable advancements in the performance\nof Transformer-based Large Language Models (LLMs) across various domains. As\nthese LLMs are deployed for increasingly complex tasks, they often face the\nneed to conduct longer reasoning processes or understand larger contexts. In\nthese situations, the length generalization failure of LLMs on long sequences\nbecomes more prominent. Most pre-training schemes truncate training sequences\nto a fixed length. LLMs often struggle to generate fluent and coherent texts,\nlet alone carry out downstream tasks, after longer contexts, even with relative\npositional encoding designed to cope with this problem. Common solutions such\nas finetuning on longer corpora often involve daunting hardware and time costs\nand require careful training process design. To more efficiently leverage the\ngeneration capacity of existing LLMs, we theoretically and empirically\ninvestigate the main out-of-distribution (OOD) factors contributing to this\nproblem. Inspired by this diagnosis, we propose a simple yet effective solution\nfor on-the-fly length generalization, LM-Infinite. It involves only a\n$\\Lambda$-shaped attention mask (to avoid excessive attended tokens) and a\ndistance limit (to avoid unseen distances) while requiring no parameter updates\nor learning. We find it applicable to a variety of LLMs using relative-position\nencoding methods. LM-Infinite is computationally efficient with $O(n)$ time and\nspace, and demonstrates consistent text generation fluency and quality to as\nlong as 32k tokens on ArXiv and OpenWebText2 datasets, with 2.72x decoding\nspeedup. On downstream tasks such as passkey retrieval, it continues to work on\ninputs much longer than training lengths where vanilla models fail immediately.\n","authors":["Chi Han","Qifan Wang","Wenhan Xiong","Yu Chen","Heng Ji","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.16137v3.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.03748v1","updated":"2023-09-07T14:43:17Z","published":"2023-09-07T14:43:17Z","title":"Enhancing Pipeline-Based Conversational Agents with Large Language\n Models","summary":" The latest advancements in AI and deep learning have led to a breakthrough in\nlarge language model (LLM)-based agents such as GPT-4. However, many commercial\nconversational agent development tools are pipeline-based and have limitations\nin holding a human-like conversation. This paper investigates the capabilities\nof LLMs to enhance pipeline-based conversational agents during two phases: 1)\nin the design and development phase and 2) during operations. In 1) LLMs can\naid in generating training data, extracting entities and synonyms,\nlocalization, and persona design. In 2) LLMs can assist in contextualization,\nintent classification to prevent conversational breakdown and handle\nout-of-scope questions, auto-correcting utterances, rephrasing responses,\nformulating disambiguation questions, summarization, and enabling closed\nquestion-answering capabilities. We conducted informal experiments with GPT-4\nin the private banking domain to demonstrate the scenarios above with a\npractical example. 
Companies may be hesitant to replace their pipeline-based\nagents with LLMs entirely due to privacy concerns and the need for deep\nintegration within their existing ecosystems. A hybrid approach in which LLMs'\nare integrated into the pipeline-based agents allows them to save time and\ncosts of building and running agents by capitalizing on the capabilities of\nLLMs while retaining the integration and privacy safeguards of their existing\nsystems.\n","authors":["Mina Foosherian","Hendrik Purwins","Purna Rathnayake","Touhidul Alam","Rui Teimao","Klaus-Dieter Thoben"],"pdf_url":"https://arxiv.org/pdf/2309.03748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03747v1","updated":"2023-09-07T14:42:35Z","published":"2023-09-07T14:42:35Z","title":"The Daunting Dilemma with Sentence Encoders: Success on Standard\n Benchmarks, Failure in Capturing Basic Semantic Properties","summary":" In this paper, we adopted a retrospective approach to examine and compare\nfive existing popular sentence encoders, i.e., Sentence-BERT, Universal\nSentence Encoder (USE), LASER, InferSent, and Doc2vec, in terms of their\nperformance on downstream tasks versus their capability to capture basic\nsemantic properties. Initially, we evaluated all five sentence encoders on the\npopular SentEval benchmark and found that multiple sentence encoders perform\nquite well on a variety of popular downstream tasks. However, being unable to\nfind a single winner in all cases, we designed further experiments to gain a\ndeeper understanding of their behavior. Specifically, we proposed four semantic\nevaluation criteria, i.e., Paraphrasing, Synonym Replacement, Antonym\nReplacement, and Sentence Jumbling, and evaluated the same five sentence\nencoders using these criteria. We found that the Sentence-Bert and USE models\npass the paraphrasing criterion, with SBERT being the superior between the two.\nLASER dominates in the case of the synonym replacement criterion.\nInterestingly, all the sentence encoders failed the antonym replacement and\njumbling criteria. These results suggest that although these popular sentence\nencoders perform quite well on the SentEval benchmark, they still struggle to\ncapture some basic semantic properties, thus, posing a daunting dilemma in NLP\nresearch.\n","authors":["Yash Mahajan","Naman Bansal","Shubhra Kanti Karmaker"],"pdf_url":"https://arxiv.org/pdf/2309.03747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03713v1","updated":"2023-09-07T13:42:05Z","published":"2023-09-07T13:42:05Z","title":"Word segmentation granularity in Korean","summary":" This paper describes word {segmentation} granularity in Korean language\nprocessing. From a word separated by blank space, which is termed an eojeol, to\na sequence of morphemes in Korean, there are multiple possible levels of word\nsegmentation granularity in Korean. For specific language processing and corpus\nannotation tasks, several different granularity levels have been proposed and\nutilized, because the agglutinative languages including Korean language have a\none-to-one mapping between functional morpheme and syntactic category. Thus, we\nanalyze these different granularity levels, presenting the examples of Korean\nlanguage processing systems for future reference. Interestingly, the\ngranularity by separating only functional morphemes including case markers and\nverbal endings, and keeping other suffixes for morphological derivation results\nin the optimal performance for phrase structure parsing. 
This contradicts\nprevious best practices for Korean language processing, which has been the de\nfacto standard for various applications that require separating all morphemes.\n","authors":["Jungyeul Park","Mija Kim"],"pdf_url":"https://arxiv.org/pdf/2309.03713v1.pdf","comment":"Accepted for publication in Korean Linguistics (Benjamins)"},{"id":"http://arxiv.org/abs/2212.05798v2","updated":"2023-09-07T12:22:43Z","published":"2022-12-12T09:49:02Z","title":"BigText-QA: Question Answering over a Large-Scale Hybrid Knowledge Graph","summary":" Answering complex questions over textual resources remains a challenge,\nparticularly when dealing with nuanced relationships between multiple entities\nexpressed within natural-language sentences. To this end, curated knowledge\nbases (KBs) like YAGO, DBpedia, Freebase, and Wikidata have been widely used\nand gained great acceptance for question-answering (QA) applications in the\npast decade. While these KBs offer a structured knowledge representation, they\nlack the contextual diversity found in natural-language sources. To address\nthis limitation, BigText-QA introduces an integrated QA approach, which is able\nto answer questions based on a more redundant form of a knowledge graph (KG)\nthat organizes both structured and unstructured (i.e., \"hybrid\") knowledge in a\nunified graphical representation. Thereby, BigText-QA is able to combine the\nbest of both worlds$\\unicode{x2013}$a canonical set of named entities, mapped\nto a structured background KB (such as YAGO or Wikidata), as well as an open\nset of textual clauses providing highly diversified relational paraphrases with\nrich context information. Our experimental results demonstrate that BigText-QA\noutperforms DrQA, a neural-network-based QA system, and achieves competitive\nresults to QUEST, a graph-based unsupervised QA system.\n","authors":["Jingjing Xu","Maria Biryukov","Martin Theobald","Vinu Ellampallil Venugopal"],"pdf_url":"https://arxiv.org/pdf/2212.05798v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05301v2","updated":"2023-09-07T12:20:45Z","published":"2023-06-08T15:46:32Z","title":"ToolAlpaca: Generalized Tool Learning for Language Models with 3000\n Simulated Cases","summary":" Enabling large language models to utilize real-world tools effectively is\ncrucial for achieving embodied intelligence. Existing approaches to tool\nlearning have either primarily relied on extremely large language models, such\nas GPT-4, to attain generalized tool-use abilities in a zero-shot manner, or\nutilized supervised learning to train limited scopes of tools on compact\nmodels. However, it remains uncertain whether smaller language models can\nachieve generalized tool-use abilities without tool-specific training. To\naddress this question, this paper introduces ToolAlpaca, a novel framework\ndesigned to automatically generate a diverse tool-use corpus and learn\ngeneralized tool-use abilities on compact language models with minimal human\nintervention. Specifically, ToolAlpaca first automatically creates a highly\ndiversified tool-use corpus by building a multi-agent simulation environment.\nThe corpus contains 3938 tool-use instances from more than 400 real-world tool\nAPIs spanning 50 distinct categories. Subsequently, the constructed corpus is\nemployed to fine-tune compact language models, resulting in two models, namely\nToolAlpaca-7B and ToolAlpaca-13B, respectively. Finally, we evaluate the\nability of these models to utilize previously unseen tools without specific\ntraining. 
Experimental results demonstrate that ToolAlpaca achieves effective\ngeneralized tool-use capabilities comparable to those of extremely large\nlanguage models like GPT-3.5, demonstrating that learning generalized tool-use\nability is feasible for compact language models.\n","authors":["Qiaoyu Tang","Ziliang Deng","Hongyu Lin","Xianpei Han","Qiao Liang","Boxi Cao","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2306.05301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03667v1","updated":"2023-09-07T12:10:47Z","published":"2023-09-07T12:10:47Z","title":"Exploring an LM to generate Prolog Predicates from Mathematics Questions","summary":" Recently, there has been a surge in interest in NLP driven by ChatGPT.\nChatGPT, a transformer-based generative language model of substantial scale,\nexhibits versatility in performing various tasks based on natural language.\nNevertheless, large language models often exhibit poor performance in solving\nmathematics questions that require reasoning. Prior research has demonstrated\nthe effectiveness of chain-of-thought prompting in enhancing reasoning\ncapabilities. Now, we aim to investigate whether fine-tuning a model for the\ngeneration of Prolog codes, a logic language, and subsequently passing these\ncodes to a compiler can further improve accuracy. Consequently, we employ\nchain-of-thought to fine-tune LLaMA7B as a baseline model and develop other\nfine-tuned LLaMA7B models for the generation of Prolog code, Prolog code +\nchain-of-thought, and chain-of-thought + Prolog code, respectively. The results\nreveal that the Prolog generation model surpasses the baseline in performance,\nwhile the combination generation models do not yield significant improvements.\nThe Prolog corpus based on GSM8K and the correspondingly finetuned Prolog\ngeneration model based on LLaMA7B are released to the research community.\n","authors":["Xiaocheng Yang","Yik-Cheung Tam"],"pdf_url":"https://arxiv.org/pdf/2309.03667v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.03658v1","updated":"2023-09-07T11:55:11Z","published":"2023-09-07T11:55:11Z","title":"BNS-Net: A Dual-channel Sarcasm Detection Method Considering\n Behavior-level and Sentence-level Conflicts","summary":" Sarcasm detection is a binary classification task that aims to determine\nwhether a given utterance is sarcastic. Over the past decade, sarcasm detection\nhas evolved from classical pattern recognition to deep learning approaches,\nwhere features such as user profile, punctuation and sentiment words have been\ncommonly employed for sarcasm detection. In real-life sarcastic expressions,\nbehaviors without explicit sentimental cues often serve as carriers of implicit\nsentimental meanings. Motivated by this observation, we proposed a dual-channel\nsarcasm detection model named BNS-Net. The model considers behavior and\nsentence conflicts in two channels. Channel 1: Behavior-level Conflict Channel\nreconstructs the text based on core verbs while leveraging the modified\nattention mechanism to highlight conflict information. Channel 2:\nSentence-level Conflict Channel introduces external sentiment knowledge to\nsegment the text into explicit and implicit sentences, capturing conflicts\nbetween them. To validate the effectiveness of BNS-Net, several comparative and\nablation experiments are conducted on three public sarcasm datasets. 
The\nanalysis and evaluation of experimental results demonstrate that the BNS-Net\neffectively identifies sarcasm in text and achieves the state-of-the-art\nperformance.\n","authors":["Liming Zhou","Xiaowei Xu","Xiaodong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03658v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.06599v3","updated":"2023-09-07T11:39:07Z","published":"2023-05-11T06:43:37Z","title":"Structured Chain-of-Thought Prompting for Code Generation","summary":" Large Language Models (LLMs) (e.g., ChatGPT) have shown impressive\nperformance in code generation. LLMs take prompts as inputs, and\nChain-of-Thought (CoT) prompting is the state-of-the-art prompting technique.\nCoT prompting asks LLMs first to generate CoTs (i.e., intermediate natural\nlanguage reasoning steps) and then output the code. However, CoT prompting is\ndesigned for natural language generation and has low accuracy in code\ngeneration.\n In this paper, we propose Structured CoTs (SCoTs) and present a novel\nprompting technique for code generation, named SCoT prompting. Our motivation\nis source code contains rich structural information and any code can be\ncomposed of three program structures (i.e., sequence, branch, and loop\nstructures). Intuitively, structured intermediate reasoning steps make for\nstructured source code. Thus, we ask LLMs to use program structures to build\nCoTs, obtaining SCoTs. Then, LLMs generate the final code based on SCoTs.\nCompared to CoT prompting, SCoT prompting explicitly constrains LLMs to think\nabout how to solve requirements from the view of source code and further the\nperformance of LLMs in code generation. We apply SCoT prompting to two LLMs\n(i.e., ChatGPT and Codex) and evaluate it on three benchmarks (i.e., HumanEval,\nMBPP, and MBCPP). (1) SCoT prompting outperforms the state-of-the-art baseline\n- CoT prompting by up to 13.79% in Pass@1. (2) Human evaluation shows human\ndevelopers prefer programs from SCoT prompting. (3) SCoT prompting is robust to\nexamples and achieves substantial improvements.\n","authors":["Jia Li","Ge Li","Yongmin Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2305.06599v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.17780"},{"id":"http://arxiv.org/abs/2308.13754v2","updated":"2023-09-07T11:22:59Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how designing an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. 
Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages meanwhile are\ndiacritical for different types of clones. To evaluate our approach, we conduct\nextensive experiments on four representative cross-language clone detection\ndatasets. Experimental results show that ZC3 outperforms the state-of-the-art\nbaselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively.\nWe further investigate the representational distribution of different languages\nand discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v2.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2308.13775v2","updated":"2023-09-07T11:19:30Z","published":"2023-08-26T05:48:57Z","title":"EditSum: A Retrieve-and-Edit Framework for Source Code Summarization","summary":" Existing studies show that code summaries help developers understand and\nmaintain source code. Unfortunately, these summaries are often missing or\noutdated in software projects. Code summarization aims to generate natural\nlanguage descriptions automatically for source code. Code summaries are highly\nstructured and have repetitive patterns. Besides the patternized words, a code\nsummary also contains important keywords, which are the key to reflecting the\nfunctionality of the code. However, the state-of-the-art approaches perform\npoorly on predicting the keywords, which leads to the generated summaries\nsuffering a loss in informativeness. To alleviate this problem, this paper\nproposes a novel retrieve-and-edit approach named EditSum for code\nsummarization. Specifically, EditSum first retrieves a similar code snippet\nfrom a pre-defined corpus and treats its summary as a prototype summary to\nlearn the pattern. Then, EditSum edits the prototype automatically to combine\nthe pattern in the prototype with the semantic information of input code. Our\nmotivation is that the retrieved prototype provides a good start-point for\npost-generation because the summaries of similar code snippets often have the\nsame pattern. The post-editing process further reuses the patternized words in\nthe prototype and generates keywords based on the semantic information of input\ncode. We conduct experiments on a large-scale Java corpus and experimental\nresults demonstrate that EditSum outperforms the state-of-the-art approaches by\na substantial margin. The human evaluation also proves the summaries generated\nby EditSum are more informative and useful. We also verify that EditSum\nperforms well on predicting the patternized words and keywords.\n","authors":["Jia Li","Yongmin Li","Ge Li","Xing Hu","Xin Xia","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2308.13775v2.pdf","comment":"Accepted by the 36th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2021)"},{"id":"http://arxiv.org/abs/2309.03613v1","updated":"2023-09-07T10:13:09Z","published":"2023-09-07T10:13:09Z","title":"Evaluating ChatGPT as a Recommender System: A Rigorous Approach","summary":" Recent popularity surrounds large AI language models due to their impressive\nnatural language capabilities. They contribute significantly to\nlanguage-related tasks, including prompt-based learning, making them valuable\nfor various specific tasks. 
This approach unlocks their full potential,\nenhancing precision and generalization. Research communities are actively\nexploring their applications, with ChatGPT receiving recognition. Despite\nextensive research on large language models, their potential in recommendation\nscenarios still needs to be explored. This study aims to fill this gap by\ninvestigating ChatGPT's capabilities as a zero-shot recommender system. Our\ngoals include evaluating its ability to use user preferences for\nrecommendations, reordering existing recommendation lists, leveraging\ninformation from similar users, and handling cold-start situations. We assess\nChatGPT's performance through comprehensive experiments using three datasets\n(MovieLens Small, Last.FM, and Facebook Book). We compare ChatGPT's performance\nagainst standard recommendation algorithms and other large language models,\nsuch as GPT-3.5 and PaLM-2. To measure recommendation effectiveness, we employ\nwidely-used evaluation metrics like Mean Average Precision (MAP), Recall,\nPrecision, F1, normalized Discounted Cumulative Gain (nDCG), Item Coverage,\nExpected Popularity Complement (EPC), Average Coverage of Long Tail (ACLT),\nAverage Recommendation Popularity (ARP), and Popularity-based Ranking-based\nEqual Opportunity (PopREO). Through thoroughly exploring ChatGPT's abilities in\nrecommender systems, our study aims to contribute to the growing body of\nresearch on the versatility and potential applications of large language\nmodels. Our experiment code is available on the GitHub repository:\nhttps://github.com/sisinflab/Recommender-ChatGPT\n","authors":["Dario Di Palma","Giovanni Maria Biancofiore","Vito Walter Anelli","Fedelucio Narducci","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2309.03613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03595v1","updated":"2023-09-07T09:40:12Z","published":"2023-09-07T09:40:12Z","title":"Loquacity and Visible Emotion: ChatGPT as a Policy Advisor","summary":" ChatGPT, a software seeking to simulate human conversational abilities, is\nattracting increasing attention. It is sometimes portrayed as a groundbreaking\nproductivity aid, including for creative work. In this paper, we run an\nexperiment to assess its potential in complex writing tasks. We ask the\nsoftware to compose a policy brief for the Board of the Bank of Italy. We find\nthat ChatGPT can accelerate workflows by providing well-structured content\nsuggestions, and by producing extensive, linguistically correct text in a\nmatter of seconds. It does, however, require a significant amount of expert\nsupervision, which partially offsets productivity gains. If the app is used\nnaively, output can be incorrect, superficial, or irrelevant. Superficiality is\nan especially problematic limitation in the context of policy advice intended\nfor high-level audiences.\n","authors":["Claudia Biancotti","Carolina Camassa"],"pdf_url":"https://arxiv.org/pdf/2309.03595v1.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2308.16763v2","updated":"2023-09-07T09:15:24Z","published":"2023-08-31T14:31:48Z","title":"Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection","summary":" Stance detection aims to identify the attitude expressed in a document\ntowards a given target. Techniques such as Chain-of-Thought (CoT) prompting\nhave advanced this task, enhancing a model's reasoning capabilities through the\nderivation of intermediate rationales. 
However, CoT relies primarily on a\nmodel's pre-trained internal knowledge during reasoning, thereby neglecting the\nvaluable external information that is previously unknown to the model. This\nomission, especially within the unsupervised reasoning process, can affect the\nmodel's overall performance. Moreover, while CoT enhances Large Language Models\n(LLMs), smaller LMs, though efficient operationally, face challenges in\ndelivering nuanced reasoning. In response to these identified gaps, we\nintroduce the Ladder-of-Thought (LoT) for the stance detection task.\nConstructed through a dual-phase Progressive Optimization Framework, LoT\ndirects the small LMs to assimilate high-quality external knowledge, refining\nthe intermediate rationales produced. These bolstered rationales subsequently\nserve as the foundation for more precise predictions - akin to how a ladder\nfacilitates reaching elevated goals. LoT achieves a balance between efficiency\nand performance. Our empirical evaluations underscore LoT's efficacy, marking a\n16% improvement over GPT-3.5 and a 10% enhancement compared to GPT-3.5 with CoT\non stance detection task.\n","authors":["Kairui Hu","Ming Yan","Joey Tianyi Zhou","Ivor W. Tsang","Wen Haw Chong","Yong Keong Yap"],"pdf_url":"https://arxiv.org/pdf/2308.16763v2.pdf","comment":"5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2212.08913v2","updated":"2023-09-07T09:01:20Z","published":"2022-12-17T16:30:27Z","title":"Claim Optimization in Computational Argumentation","summary":" An optimal delivery of arguments is key to persuasion in any debate, both for\nhumans and for AI systems. This requires the use of clear and fluent claims\nrelevant to the given debate. Prior work has studied the automatic assessment\nof argument quality extensively. Yet, no approach actually improves the quality\nso far. To fill this gap, this paper proposes the task of claim optimization:\nto rewrite argumentative claims in order to optimize their delivery. As\nmultiple types of optimization are possible, we approach this task by first\ngenerating a diverse set of candidate claims using a large language model, such\nas BART, taking into account contextual information. Then, the best candidate\nis selected using various quality metrics. In automatic and human evaluation on\nan English-language corpus, our quality-based candidate selection outperforms\nseveral baselines, improving 60% of all claims (worsening 16% only). Follow-up\nanalyses reveal that, beyond copy editing, our approach often specifies claims\nwith details, whereas it adds less evidence than humans do. Moreover, its\ncapabilities generalize well to other domains, such as instructional texts.\n","authors":["Gabriella Skitalinskaya","Maximilian Spliethöver","Henning Wachsmuth"],"pdf_url":"https://arxiv.org/pdf/2212.08913v2.pdf","comment":"Accepted as a long paper at INLG 2023"},{"id":"http://arxiv.org/abs/2309.03564v1","updated":"2023-09-07T08:50:46Z","published":"2023-09-07T08:50:46Z","title":"Evaluating the Efficacy of Supervised Learning vs Large Language Models\n for Identifying Cognitive Distortions and Suicidal Risks in Chinese Social\n Media","summary":" Large language models, particularly those akin to the rapidly progressing GPT\nseries, are gaining traction for their expansive influence. While there is keen\ninterest in their applicability within medical domains such as psychology,\ntangible explorations on real-world data remain scant. 
Concurrently, users on\nsocial media platforms are increasingly vocalizing personal sentiments; under\nspecific thematic umbrellas, these sentiments often manifest as negative\nemotions, sometimes escalating to suicidal inclinations. Timely discernment of\nsuch cognitive distortions and suicidal risks is crucial to effectively\nintervene and potentially avert dire circumstances. Our study ventured into\nthis realm by experimenting on two pivotal tasks: suicidal risk and cognitive\ndistortion identification on Chinese social media platforms. Using supervised\nlearning as a baseline, we examined and contrasted the efficacy of large\nlanguage models via three distinct strategies: zero-shot, few-shot, and\nfine-tuning. Our findings revealed a discernible performance gap between the\nlarge language models and traditional supervised learning approaches, primarily\nattributed to the models' inability to fully grasp subtle categories. Notably,\nwhile GPT-4 outperforms its counterparts in multiple scenarios, GPT-3.5 shows\nsignificant enhancement in suicide risk classification after fine-tuning. To\nour knowledge, this investigation stands as the maiden attempt at gauging large\nlanguage models on Chinese social media tasks. This study underscores the\nforward-looking and transformative implications of using large language models\nin the field of psychology. It lays the groundwork for future applications in\npsychological research and practice.\n","authors":["Hongzhi Qi","Qing Zhao","Changwei Song","Wei Zhai","Dan Luo","Shuo Liu","Yi Jing Yu","Fan Wang","Huijing Zou","Bing Xiang Yang","Jianqiang Li","Guanghui Fu"],"pdf_url":"https://arxiv.org/pdf/2309.03564v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.03563v1","updated":"2023-09-07T08:50:45Z","published":"2023-09-07T08:50:45Z","title":"All Labels Together: Low-shot Intent Detection with an Efficient Label\n Semantic Encoding Paradigm","summary":" In intent detection tasks, leveraging meaningful semantic information from\nintent labels can be particularly beneficial for few-shot scenarios. However,\nexisting few-shot intent detection methods either ignore the intent labels,\n(e.g. treating intents as indices) or do not fully utilize this information\n(e.g. only using part of the intent labels). In this work, we present an\nend-to-end One-to-All system that enables the comparison of an input utterance\nwith all label candidates. The system can then fully utilize label semantics in\nthis way. Experiments on three few-shot intent detection tasks demonstrate that\nOne-to-All is especially effective when the training resource is extremely\nscarce, achieving state-of-the-art performance in 1-, 3- and 5-shot settings.\nMoreover, we present a novel pretraining strategy for our model that utilizes\nindirect supervision from paraphrasing, enabling zero-shot cross-domain\ngeneralization on intent detection tasks. Our code is at\nhttps://github.com/jiangshdd/AllLablesTogethe.\n","authors":["Jiangshu Du","Congying Xia","Wenpeng Yin","Tingting Liang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2309.03563v1.pdf","comment":"Accepted by IJCNLP-AACL 2023"},{"id":"http://arxiv.org/abs/2309.03559v1","updated":"2023-09-07T08:42:40Z","published":"2023-09-07T08:42:40Z","title":"An Anchor Learning Approach for Citation Field Learning","summary":" Citation field learning is to segment a citation string into fields of\ninterest such as author, title, and venue. 
Extracting such fields from\ncitations is crucial for citation indexing, researcher profile analysis, etc.\nUser-generated resources like academic homepages and Curriculum Vitae provide\nrich citation field information. However, extracting fields from these\nresources is challenging due to inconsistent citation styles, incomplete\nsentence syntax, and insufficient training data. To address these challenges,\nwe propose a novel algorithm, CIFAL (citation field learning by anchor\nlearning), to boost the citation field learning performance. CIFAL leverages\nthe anchor learning, which is model-agnostic for any Pre-trained Language\nModel, to help capture citation patterns from the data of different citation\nstyles. The experiments demonstrate that CIFAL outperforms state-of-the-art\nmethods in citation field learning, achieving a 2.83% improvement in\nfield-level F1-scores. Extensive analysis of the results further confirms the\neffectiveness of CIFAL quantitatively and qualitatively.\n","authors":["Zilin Yuan","Borun Chen","Yimeng Dai","Yinghui Li","Hai-Tao Zheng","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03559v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2306.00526v4","updated":"2023-09-07T08:40:16Z","published":"2023-06-01T10:28:12Z","title":"Layout and Task Aware Instruction Prompt for Zero-shot Document Image\n Question Answering","summary":" Layout-aware pre-trained models have achieved significant progress on document\nimage question answering. They introduce extra learnable modules into existing\nlanguage models to capture layout information within document images from text\nbounding box coordinates obtained by OCR tools. However, extra modules\nnecessitate pre-training on extensive document images. This prevents these\nmethods from directly utilizing off-the-shelf instruction-tuning language\nfoundation models, which have recently shown promising potential in zero-shot\nlearning. Instead, in this paper, we find that instruction-tuning language\nmodels like Claude and ChatGPT can understand layout by spaces and line breaks.\nBased on this observation, we propose the LAyout and Task aware Instruction\nPrompt (LATIN-Prompt), which consists of layout-aware document content and\ntask-aware instruction. Specifically, the former uses appropriate spaces and\nline breaks to recover the layout information among text segments obtained by\nOCR tools, and the latter ensures that generated answers adhere to formatting\nrequirements. Moreover, we propose the LAyout and Task aware Instruction Tuning\n(LATIN-Tuning) to improve the performance of small instruction-tuning models\nlike Alpaca. Experimental results show that LATIN-Prompt enables zero-shot\nperformance of Claude and ChatGPT to be comparable to the fine-tuning\nperformance of SOTAs on document image question answering, and LATIN-Tuning\nenhances the zero-shot performance of Alpaca significantly. For example,\nLATIN-Prompt improves the performance of Claude and ChatGPT on DocVQA by 263%\nand 20% respectively. LATIN-Tuning improves the performance of Alpaca on DocVQA\nby 87.7%. Quantitative and qualitative analyses demonstrate the effectiveness\nof LATIN-Prompt and LATIN-Tuning. We provide the code in supplementary and will\nrelease it to facilitate future research.\n","authors":["Wenjin Wang","Yunhao Li","Yixin Ou","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.00526v4.pdf","comment":"Add the LATIN-Tuning for Alpaca. 
Code is available at\n https://github.com/WenjinW/LATIN-Prompt"},{"id":"http://arxiv.org/abs/2308.11432v2","updated":"2023-09-07T04:42:48Z","published":"2023-08-22T13:30:37Z","title":"A Survey on Large Language Model based Autonomous Agents","summary":" Autonomous agents have long been a prominent research focus in both academic\nand industry communities. Previous research in this field often focuses on\ntraining agents with limited knowledge within isolated environments, which\ndiverges significantly from human learning processes, and thus makes the agents\nhard to achieve human-like decisions. Recently, through the acquisition of vast\namounts of web knowledge, large language models (LLMs) have demonstrated\nremarkable potential in achieving human-level intelligence. This has sparked an\nupsurge in studies investigating LLM-based autonomous agents. In this paper, we\npresent a comprehensive survey of these studies, delivering a systematic review\nof the field of LLM-based autonomous agents from a holistic perspective. More\nspecifically, we first discuss the construction of LLM-based autonomous agents,\nfor which we propose a unified framework that encompasses a majority of the\nprevious work. Then, we present a comprehensive overview of the diverse\napplications of LLM-based autonomous agents in the fields of social science,\nnatural science, and engineering. Finally, we delve into the evaluation\nstrategies commonly used for LLM-based autonomous agents. Based on the previous\nstudies, we also present several challenges and future directions in this\nfield. To keep track of this field and continuously update our survey, we\nmaintain a repository of relevant references at\nhttps://github.com/Paitesanshi/LLM-Agent-Survey.\n","authors":["Lei Wang","Chen Ma","Xueyang Feng","Zeyu Zhang","Hao Yang","Jingsen Zhang","Zhiyuan Chen","Jiakai Tang","Xu Chen","Yankai Lin","Wayne Xin Zhao","Zhewei Wei","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.11432v2.pdf","comment":"35 pages, 5 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.11764v3","updated":"2023-09-07T04:16:54Z","published":"2023-08-22T20:12:49Z","title":"Halo: Estimation and Reduction of Hallucinations in Open-Source Weak\n Large Language Models","summary":" Large Language Models (LLMs) have revolutionized Natural Language Processing\n(NLP). Although convenient for research and practical applications, open-source\nLLMs with fewer parameters often suffer from severe hallucinations compared to\ntheir larger counterparts. This paper focuses on measuring and reducing\nhallucinations in BLOOM 7B, a representative of such weaker open-source LLMs\nthat are publicly available for research and commercial applications. We\nintroduce HaloCheck, a lightweight BlackBox knowledge-free framework designed\nto quantify the severity of hallucinations in LLMs. Additionally, we explore\ntechniques like knowledge injection and teacher-student approaches to alleviate\nhallucinations in low-parameter LLMs. 
Our experiments effectively demonstrate\nthe reduction of hallucinations in challenging domains for these LLMs.\n","authors":["Mohamed Elaraby","Mengyin Lu","Jacob Dunn","Xueying Zhang","Yu Wang","Shizhu Liu","Pingchuan Tian","Yuping Wang","Yuxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11764v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03470v1","updated":"2023-09-07T04:04:01Z","published":"2023-09-07T04:04:01Z","title":"Machine Learning for Tangible Effects: Natural Language Processing for\n Uncovering the Illicit Massage Industry & Computer Vision for Tactile Sensing","summary":" I explore two questions in this thesis: how can computer science be used to\nfight human trafficking? And how can computer vision create a sense of touch?\n I use natural language processing (NLP) to monitor the United States illicit\nmassage industry (IMI), a multi-billion dollar industry that offers not just\ntherapeutic massages but also commercial sexual services. Employees of this\nindustry are often immigrant women with few job opportunities, leaving them\nvulnerable to fraud, coercion, and other facets of human trafficking.\nMonitoring spatiotemporal trends helps prevent trafficking in the IMI. By\ncreating datasets with three publicly-accessible websites: Google Places,\nRubmaps, and AMPReviews, combined with NLP techniques such as bag-of-words and\nWord2Vec, I show how to derive insights into the labor pressures and language\nbarriers that employees face, as well as the income, demographics, and societal\npressures affecting sex buyers. I include a call-to-action to other researchers\ngiven these datasets. I also consider how to create synthetic financial data,\nwhich can aid with counter-trafficking in the banking sector. I use an\nagent-based model to create both tabular and payee-recipient graph data.\n I then consider the role of computer vision in making tactile sensors. I\nreport on a novel sensor, the Digger Finger, that adapts the Gelsight sensor to\nfinding objects in granular media. Changes include using a wedge shape to\nfacilitate digging, replacing the internal lighting LEDs with fluorescent\npaint, and adding a vibrator motor to counteract jamming. Finally, I also show\nhow to use a webcam and a printed reference marker, or fiducial, to create a\nlow-cost six-axis force-torque sensor. This sensor is up to a hundred times\nless expensive than commercial sensors, allowing for a wider range of\napplications. For this and earlier chapters I release design files and code as\nopen source.\n","authors":["Rui Ouyang"],"pdf_url":"https://arxiv.org/pdf/2309.03470v1.pdf","comment":"PhD thesis"},{"id":"http://arxiv.org/abs/2303.13592v3","updated":"2023-09-07T03:20:41Z","published":"2023-03-23T18:16:30Z","title":"Prompting Multilingual Large Language Models to Generate Code-Mixed\n Texts: The Case of South East Asian Languages","summary":" While code-mixing is a common linguistic practice in many parts of the world,\ncollecting high-quality and low-cost code-mixed data remains a challenge for\nnatural language processing (NLP) research. The recent proliferation of Large\nLanguage Models (LLMs) compels one to ask: how capable are these systems in\ngenerating code-mixed data? In this paper, we explore prompting multilingual\nLLMs in a zero-shot manner to generate code-mixed data for seven languages in\nSouth East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese,\nTamil, and Singlish. 
We find that publicly available multilingual\ninstruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of\nproducing texts with phrases or clauses from different languages. ChatGPT\nexhibits inconsistent capabilities in generating code-mixed texts, wherein its\nperformance varies depending on the prompt template and language pairing. For\ninstance, ChatGPT generates fluent and natural Singlish texts (an English-based\ncreole spoken in Singapore), but for English-Tamil language pair, the system\nmostly produces grammatically incorrect or semantically meaningless utterances.\nFurthermore, it may erroneously introduce languages not specified in the\nprompt. Based on our investigation, existing multilingual LLMs exhibit a wide\nrange of proficiency in code-mixed data generation for SEA languages. As such,\nwe advise against using LLMs in this context without extensive human checks.\n","authors":["Zheng-Xin Yong","Ruochen Zhang","Jessica Zosa Forde","Skyler Wang","Samuel Cahyawijaya","Holy Lovenia","Genta Indra Winata","Lintang Sutawika","Jan Christian Blaise Cruz","Long Phan","Yin Lin Tan","Thamar Solorio","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2303.13592v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03450v1","updated":"2023-09-07T02:20:03Z","published":"2023-09-07T02:20:03Z","title":"XGen-7B Technical Report","summary":" Large Language Models (LLMs) have become ubiquitous across various domains,\ntransforming the way we interact with information and conduct research.\nHowever, most high-performing LLMs remain confined behind proprietary walls,\nhindering scientific progress. Most open-source LLMs, on the other hand, are\nlimited in their ability to support longer sequence lengths, which is a key\nrequirement for many tasks that require inference over an input context. To\naddress this, we have trained XGen, a series of 7B parameter models on up to 8K\nsequence length for up to 1.5T tokens. We have also finetuned the XGen models\non public-domain instructional data, creating their instruction-tuned\ncounterparts (XGen-Inst). We open-source our models for both research\nadvancements and commercial applications. Our evaluation on standard benchmarks\nshows that XGen models achieve comparable or better results when compared with\nstate-of-the-art open-source LLMs. Our targeted evaluation on long sequence\nmodeling tasks shows the benefits of our 8K-sequence models over 2K-sequence\nopen-source LLMs.\n","authors":["Erik Nijkamp","Tian Xie","Hiroaki Hayashi","Bo Pang","Congying Xia","Chen Xing","Jesse Vig","Semih Yavuz","Philippe Laban","Ben Krause","Senthil Purushwalkam","Tong Niu","Wojciech Kryściński","Lidiya Murakhovs'ka","Prafulla Kumar Choubey","Alex Fabbri","Ye Liu","Rui Meng","Lifu Tu","Meghana Bhat","Chien-Sheng Wu","Silvio Savarese","Yingbo Zhou","Shafiq Joty","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2309.03450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02884v2","updated":"2023-09-07T01:52:33Z","published":"2023-09-06T10:20:06Z","title":"Aligning Large Language Models for Clinical Tasks","summary":" Large Language Models (LLMs) have demonstrated remarkable adaptability,\nshowcasing their capacity to excel in tasks for which they were not explicitly\ntrained. However, despite their impressive natural language processing (NLP)\ncapabilities, effective alignment of LLMs remains a crucial challenge when\ndeploying them for specific clinical applications. 
The ability to generate\nresponses with factually accurate content and to engage in non-trivial\nreasoning steps are crucial for the LLMs to be eligible for applications in\nclinical medicine. Employing a combination of techniques including\ninstruction-tuning and in-prompt strategies like few-shot and chain-of-thought\nprompting has significantly enhanced the performance of LLMs. Our proposed\nalignment strategy for medical question-answering, known as\n'expand-guess-refine', offers a parameter and data-efficient solution. A\npreliminary analysis of this method demonstrated outstanding performance,\nachieving a score of 70.63% on a subset of questions sourced from the USMLE\ndataset.\n","authors":["Supun Manathunga","Isuru Hettigoda"],"pdf_url":"https://arxiv.org/pdf/2309.02884v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.01398v2","updated":"2023-09-07T01:36:08Z","published":"2023-09-04T07:00:26Z","title":"Zero-shot information extraction from radiological reports using ChatGPT","summary":" Electronic health records contain an enormous amount of valuable information,\nbut many are recorded in free text. Information extraction is the strategy to\ntransform the sequence of characters into structured data, which can be\nemployed for secondary analysis. However, the traditional information\nextraction components, such as named entity recognition and relation\nextraction, require annotated data to optimize the model parameters, which has\nbecome one of the major bottlenecks in building information extraction systems.\nWith the large language models achieving good performances on various\ndownstream NLP tasks without parameter tuning, it becomes possible to use large\nlanguage models for zero-shot information extraction. In this study, we aim to\nexplore whether the most popular large language model, ChatGPT, can extract\nuseful information from the radiological reports. We first design the prompt\ntemplate for the interested information in the CT reports. Then, we generate\nthe prompts by combining the prompt template with the CT reports as the inputs\nof ChatGPT to obtain the responses. A post-processing module is developed to\ntransform the responses into structured extraction results. We conducted the\nexperiments with 847 CT reports collected from Peking University Cancer\nHospital. The experimental results indicate that ChatGPT can achieve\ncompetitive performances for some extraction tasks compared with the baseline\ninformation extraction system, but some limitations need to be further\nimproved.\n","authors":["Danqing Hu","Bing Liu","Xiaofeng Zhu","Xudong Lu","Nan Wu"],"pdf_url":"https://arxiv.org/pdf/2309.01398v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03433v1","updated":"2023-09-07T01:35:24Z","published":"2023-09-07T01:35:24Z","title":"Improving Open Information Extraction with Large Language Models: A\n Study on Demonstration Uncertainty","summary":" Open Information Extraction (OIE) task aims at extracting structured facts\nfrom unstructured text, typically in the form of (subject, relation, object)\ntriples. Despite the potential of large language models (LLMs) like ChatGPT as\na general task solver, they lag behind state-of-the-art (supervised) methods in\nOIE tasks due to two key issues. First, LLMs struggle to distinguish irrelevant\ncontext from relevant relations and generate structured output due to the\nrestrictions on fine-tuning the model. 
Second, LLMs generate responses\nautoregressively based on probability, which makes the predicted relations lack\nconfidence. In this paper, we assess the capabilities of LLMs in improving the\nOIE task. Particularly, we propose various in-context learning strategies to\nenhance LLM's instruction-following ability and a demonstration uncertainty\nquantification module to enhance the confidence of the generated relations. Our\nexperiments on three OIE benchmark datasets show that our approach holds its\nown against established supervised methods, both quantitatively and\nqualitatively.\n","authors":["Chen Ling","Xujiang Zhao","Xuchao Zhang","Yanchi Liu","Wei Cheng","Haoyu Wang","Zhengzhang Chen","Takao Osaki","Katsushi Matsuda","Haifeng Chen","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.03433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02706v2","updated":"2023-09-07T01:01:24Z","published":"2023-09-06T04:38:16Z","title":"HAE-RAE Bench: Evaluation of Korean Knowledge in Language Models","summary":" Large Language Models (LLMs) pretrained on massive corpora exhibit remarkable\ncapabilities across a wide range of tasks; however, the attention given to\nnon-English languages has been limited in this field of research. To address\nthis gap and assess the proficiency of language models in the Korean language\nand culture, we present HAE-RAE Bench, covering 6 tasks including vocabulary,\nhistory, and general knowledge. Our evaluation of language models on this\nbenchmark highlights the potential advantages of employing Large\nLanguage-Specific Models (LLSMs) over a comprehensive, universal model like\nGPT-3.5. Remarkably, our study reveals that models approximately 13 times\nsmaller than GPT-3.5 can exhibit similar performance levels in terms of\nlanguage-specific knowledge retrieval. This observation underscores the\nimportance of homogeneous corpora for training professional-level\nlanguage-specific models. On the contrary, we also observe a perplexing\nperformance dip in these smaller LMs when they are tasked to generate\nstructured answers.\n","authors":["Guijin Son","Hanwool Lee","Suwan Kim","Huiseo Kim","Jaecheol Lee","Je Won Yeom","Jihyu Jung","Jung Woo Kim","Songseong Kim"],"pdf_url":"https://arxiv.org/pdf/2309.02706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02553v2","updated":"2023-09-07T00:23:34Z","published":"2023-09-05T19:40:45Z","title":"Automating Behavioral Testing in Machine Translation","summary":" Behavioral testing in NLP allows fine-grained evaluation of systems by\nexamining their linguistic capabilities through the analysis of input-output\nbehavior. Unfortunately, existing work on behavioral testing in Machine\nTranslation (MT) is currently restricted to largely handcrafted tests covering\na limited range of capabilities and languages. To address this limitation, we\npropose to use Large Language Models (LLMs) to generate a diverse set of source\nsentences tailored to test the behavior of MT models in a range of situations.\nWe can then verify whether the MT model exhibits the expected behavior through\nmatching candidate sets that are also generated using LLMs. Our approach aims\nto make behavioral testing of MT systems practical while requiring only minimal\nhuman effort. 
In our experiments, we apply our proposed evaluation framework to\nassess multiple available MT systems, revealing that while in general\npass-rates follow the trends observable from traditional accuracy-based\nmetrics, our method was able to uncover several important differences and\npotential bugs that go unnoticed when relying only on accuracy.\n","authors":["Javier Ferrando","Matthias Sperber","Hendra Setiawan","Dominic Telaar","Saša Hasan"],"pdf_url":"https://arxiv.org/pdf/2309.02553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03412v1","updated":"2023-09-07T00:14:37Z","published":"2023-09-07T00:14:37Z","title":"From Base to Conversational: Japanese Instruction Dataset and Tuning\n Large Language Models","summary":" Instruction tuning is essential for large language models (LLMs) to become\ninteractive. While many instruction tuning datasets exist in English, there is\na noticeable lack in other languages. Also, their effectiveness has not been\nwell verified in non-English languages. We construct a Japanese instruction\ndataset by expanding and filtering existing datasets and apply the dataset to a\nJapanese pre-trained base model. We performed Low-Rank Adaptation (LoRA) tuning\non both Japanese and English existing models using our instruction dataset. We\nevaluated these models from both quantitative and qualitative perspectives. As\na result, the effectiveness of Japanese instruction datasets is confirmed. The\nresults also indicate that even with relatively small LLMs, performances in\ndownstream tasks would be improved through instruction tuning. Our instruction\ndataset, tuned models, and implementation are publicly available online.\n","authors":["Masahiro Suzuki","Masanori Hirano","Hiroki Sakaji"],"pdf_url":"https://arxiv.org/pdf/2309.03412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03409v1","updated":"2023-09-07T00:07:15Z","published":"2023-09-07T00:07:15Z","title":"Large Language Models as Optimizers","summary":" Optimization is ubiquitous. While derivative-based algorithms have been\npowerful tools for various problems, the absence of gradient imposes challenges\non many real-world applications. In this work, we propose Optimization by\nPROmpting (OPRO), a simple and effective approach to leverage large language\nmodels (LLMs) as optimizers, where the optimization task is described in\nnatural language. In each optimization step, the LLM generates new solutions\nfrom the prompt that contains previously generated solutions with their values,\nthen the new solutions are evaluated and added to the prompt for the next\noptimization step. We first showcase OPRO on linear regression and traveling\nsalesman problems, then move on to prompt optimization where the goal is to\nfind instructions that maximize the task accuracy. With a variety of LLMs, we\ndemonstrate that the best prompts optimized by OPRO outperform human-designed\nprompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks.\n","authors":["Chengrun Yang","Xuezhi Wang","Yifeng Lu","Hanxiao Liu","Quoc V. Le","Denny Zhou","Xinyun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03409v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.03905v1","updated":"2023-09-07T17:59:45Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. 
Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v1.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2309.03904v1","updated":"2023-09-07T17:59:43Z","published":"2023-09-07T17:59:43Z","title":"Exploring Sparse MoE in GANs for Text-conditioned Image Synthesis","summary":" Due to the difficulty in scaling up, generative adversarial networks (GANs)\nseem to be falling from grace on the task of text-conditioned image synthesis.\nSparsely-activated mixture-of-experts (MoE) has recently been demonstrated as a\nvalid solution to training large-scale models with limited computational\nresources. Inspired by such a philosophy, we present Aurora, a GAN-based\ntext-to-image generator that employs a collection of experts to learn feature\nprocessing, together with a sparse router to help select the most suitable\nexpert for each feature point. To faithfully decode the sampling stochasticity\nand the text condition to the final synthesis, our router adaptively makes its\ndecision by taking into account the text-integrated global latent code. At\n64x64 image resolution, our model trained on LAION2B-en and COYO-700M achieves\n6.2 zero-shot FID on MS COCO. We release the code and checkpoints to facilitate\nthe community for further development.\n","authors":["Jiapeng Zhu","Ceyuan Yang","Kecheng Zheng","Yinghao Xu","Zifan Shi","Yujun Shen"],"pdf_url":"https://arxiv.org/pdf/2309.03904v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2309.03903v1","updated":"2023-09-07T17:59:41Z","published":"2023-09-07T17:59:41Z","title":"Tracking Anything with Decoupled Video Segmentation","summary":" Training data for video segmentation are expensive to annotate. This impedes\nextensions of end-to-end algorithms to new video segmentation tasks, especially\nin large-vocabulary settings. 
To 'track anything' without training on video\ndata for every individual task, we develop a decoupled video segmentation\napproach (DEVA), composed of task-specific image-level segmentation and\nclass/task-agnostic bi-directional temporal propagation. Due to this design, we\nonly need an image-level model for the target task (which is cheaper to train)\nand a universal temporal propagation model which is trained once and\ngeneralizes across tasks. To effectively combine these two modules, we use\nbi-directional propagation for (semi-)online fusion of segmentation hypotheses\nfrom different frames to generate a coherent segmentation. We show that this\ndecoupled formulation compares favorably to end-to-end approaches in several\ndata-scarce tasks including large-vocabulary video panoptic segmentation,\nopen-world video segmentation, referring video segmentation, and unsupervised\nvideo object segmentation. Code is available at:\nhttps://hkchengrex.github.io/Tracking-Anything-with-DEVA\n","authors":["Ho Kei Cheng","Seoung Wug Oh","Brian Price","Alexander Schwing","Joon-Young Lee"],"pdf_url":"https://arxiv.org/pdf/2309.03903v1.pdf","comment":"Accepted to ICCV 2023. Project page:\n https://hkchengrex.github.io/Tracking-Anything-with-DEVA"},{"id":"http://arxiv.org/abs/2309.03900v1","updated":"2023-09-07T17:59:03Z","published":"2023-09-07T17:59:03Z","title":"Learning Continuous Exposure Value Representations for Single-Image HDR\n Reconstruction","summary":" Deep learning is commonly used to reconstruct HDR images from LDR images. LDR\nstack-based methods are used for single-image HDR reconstruction, generating an\nHDR image from a deep learning-generated LDR stack. However, current methods\ngenerate the stack with predetermined exposure values (EVs), which may limit\nthe quality of HDR reconstruction. To address this, we propose the continuous\nexposure value representation (CEVR), which uses an implicit function to\ngenerate LDR images with arbitrary EVs, including those unseen during training.\nOur approach generates a continuous stack with more images containing diverse\nEVs, significantly improving HDR reconstruction. We use a cycle training\nstrategy to supervise the model in generating continuous EV LDR images without\ncorresponding ground truths. Our CEVR model outperforms existing methods, as\ndemonstrated by experimental results.\n","authors":["Su-Kai Chen","Hung-Lin Yen","Yu-Lun Liu","Min-Hung Chen","Hou-Ning Hu","Wen-Hsiao Peng","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2309.03900v1.pdf","comment":"ICCV 2023. Project page: https://skchen1993.github.io/CEVR_web/"},{"id":"http://arxiv.org/abs/2309.03899v1","updated":"2023-09-07T17:58:05Z","published":"2023-09-07T17:58:05Z","title":"The Making and Breaking of Camouflage","summary":" Not all camouflages are equally effective, as even a partially visible\ncontour or a slight color difference can make the animal stand out and break\nits camouflage. In this paper, we address the question of what makes a\ncamouflage successful, by proposing three scores for automatically assessing\nits effectiveness. In particular, we show that camouflage can be measured by\nthe similarity between background and foreground features and boundary\nvisibility. We use these camouflage scores to assess and compare all available\ncamouflage datasets. We also incorporate the proposed camouflage score into a\ngenerative model as an auxiliary loss and show that effective camouflage images\nor videos can be synthesised in a scalable manner. 
The generated synthetic\ndataset is used to train a transformer-based model for segmenting camouflaged\nanimals in videos. Experimentally, we demonstrate state-of-the-art camouflage\nbreaking performance on the public MoCA-Mask benchmark.\n","authors":["Hala Lamdouar","Weidi Xie","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2309.03899v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.03897v1","updated":"2023-09-07T17:57:29Z","published":"2023-09-07T17:57:29Z","title":"ProPainter: Improving Propagation and Transformer for Video Inpainting","summary":" Flow-based propagation and spatiotemporal Transformer are two mainstream\nmechanisms in video inpainting (VI). Despite the effectiveness of these\ncomponents, they still suffer from some limitations that affect their\nperformance. Previous propagation-based approaches are performed separately\neither in the image or feature domain. Global image propagation isolated from\nlearning may cause spatial misalignment due to inaccurate optical flow.\nMoreover, memory or computational constraints limit the temporal range of\nfeature propagation and video Transformer, preventing exploration of\ncorrespondence information from distant frames. To address these issues, we\npropose an improved framework, called ProPainter, which involves enhanced\nProPagation and an efficient Transformer. Specifically, we introduce\ndual-domain propagation that combines the advantages of image and feature\nwarping, exploiting global correspondences reliably. We also propose a\nmask-guided sparse video Transformer, which achieves high efficiency by\ndiscarding unnecessary and redundant tokens. With these components, ProPainter\noutperforms prior arts by a large margin of 1.46 dB in PSNR while maintaining\nappealing efficiency.\n","authors":["Shangchen Zhou","Chongyi Li","Kelvin C. K. Chan","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2309.03897v1.pdf","comment":"Accepted by ICCV 2023. Code: https://github.com/sczhou/ProPainter"},{"id":"http://arxiv.org/abs/2309.03895v1","updated":"2023-09-07T17:56:57Z","published":"2023-09-07T17:56:57Z","title":"InstructDiffusion: A Generalist Modeling Interface for Vision Tasks","summary":" We present InstructDiffusion, a unifying and generic framework for aligning\ncomputer vision tasks with human instructions. Unlike existing approaches that\nintegrate prior knowledge and pre-define the output space (e.g., categories and\ncoordinates) for each vision task, we cast diverse vision tasks into a\nhuman-intuitive image-manipulating process whose output space is a flexible and\ninteractive pixel space. Concretely, the model is built upon the diffusion\nprocess and is trained to predict pixels according to user instructions, such\nas encircling the man's left shoulder in red or applying a blue mask to the\nleft car. InstructDiffusion could handle a variety of vision tasks, including\nunderstanding tasks (such as segmentation and keypoint detection) and\ngenerative tasks (such as editing and enhancement). 
It even exhibits the\nability to handle unseen tasks and outperforms prior methods on novel datasets.\nThis represents a significant step towards a generalist modeling interface for\nvision tasks, advancing artificial general intelligence in the field of\ncomputer vision.\n","authors":["Zigang Geng","Binxin Yang","Tiankai Hang","Chen Li","Shuyang Gu","Ting Zhang","Jianmin Bao","Zheng Zhang","Han Hu","Dong Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2309.03895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03893v1","updated":"2023-09-07T17:55:01Z","published":"2023-09-07T17:55:01Z","title":"DiffusionEngine: Diffusion Model is Scalable Data Engine for Object\n Detection","summary":" Data is the cornerstone of deep learning. This paper reveals that the\nrecently developed Diffusion Model is a scalable data engine for object\ndetection. Existing methods for scaling up detection-oriented data often\nrequire manual collection or generative models to obtain target images,\nfollowed by data augmentation and labeling to produce training pairs, which are\ncostly, complex, or lacking diversity. To address these issues, we\npresent DiffusionEngine (DE), a data scaling-up engine that provides\nhigh-quality detection-oriented training pairs in a single stage. DE consists\nof a pre-trained diffusion model and an effective Detection-Adapter,\ncontributing to generating scalable, diverse and generalizable detection data\nin a plug-and-play manner. Detection-Adapter is learned to align the implicit\nsemantic and location knowledge in off-the-shelf diffusion models with\ndetection-aware signals to make better bounding-box predictions. Additionally,\nwe contribute two datasets, i.e., COCO-DE and VOC-DE, to scale up existing\ndetection benchmarks for facilitating follow-up research. Extensive experiments\ndemonstrate that data scaling-up via DE can achieve significant improvements in\ndiverse scenarios, such as various detection algorithms, self-supervised\npre-training, data-sparse, label-scarce, cross-domain, and semi-supervised\nlearning. For example, when using DE with a DINO-based adapter to scale up\ndata, mAP is improved by 3.1% on COCO, 7.6% on VOC, and 11.5% on Clipart.\n","authors":["Manlin Zhang","Jie Wu","Yuxi Ren","Ming Li","Jie Qin","Xuefeng Xiao","Wei Liu","Rui Wang","Min Zheng","Andy J. Ma"],"pdf_url":"https://arxiv.org/pdf/2309.03893v1.pdf","comment":"Code and Models are publicly available. Project Page:\n https://mettyz.github.io/DiffusionEngine"},{"id":"http://arxiv.org/abs/2309.03891v1","updated":"2023-09-07T17:53:20Z","published":"2023-09-07T17:53:20Z","title":"ArtiGrasp: Physically Plausible Synthesis of Bi-Manual Dexterous\n Grasping and Articulation","summary":" We present ArtiGrasp, a novel method to synthesize bi-manual hand-object\ninteractions that include grasping and articulation. This task is challenging\ndue to the diversity of the global wrist motions and the precise finger control\nthat are necessary to articulate objects. ArtiGrasp leverages reinforcement\nlearning and physics simulations to train a policy that controls the global and\nlocal hand pose. Our framework unifies grasping and articulation within a\nsingle policy guided by a single hand pose reference. Moreover, to facilitate\nthe training of the precise finger control required for articulation, we\npresent a learning curriculum with increasing difficulty. 
It starts with\nsingle-hand manipulation of stationary objects and continues with multi-agent\ntraining including both hands and non-stationary objects. To evaluate our\nmethod, we introduce Dynamic Object Grasping and Articulation, a task that\ninvolves bringing an object into a target articulated pose. This task requires\ngrasping, relocation, and articulation. We show our method's efficacy towards\nthis task. We further demonstrate that our method can generate motions with\nnoisy hand-object pose estimates from an off-the-shelf image-based regressor.\n","authors":["Hui Zhang","Sammy Christen","Zicong Fan","Luocheng Zheng","Jemin Hwangbo","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2309.03891v1.pdf","comment":"Project page: https://eth-ait.github.io/artigrasp/"},{"id":"http://arxiv.org/abs/2309.03879v1","updated":"2023-09-07T17:44:18Z","published":"2023-09-07T17:44:18Z","title":"Better Practices for Domain Adaptation","summary":" Distribution shifts are all too common in real-world applications of machine\nlearning. Domain adaptation (DA) aims to address this by providing various\nframeworks for adapting models to the deployment data without using labels.\nHowever, the domain shift scenario raises a second more subtle challenge: the\ndifficulty of performing hyperparameter optimisation (HPO) for these adaptation\nalgorithms without access to a labelled validation set. The unclear validation\nprotocol for DA has led to bad practices in the literature, such as performing\nHPO using the target test labels when, in real-world scenarios, they are not\navailable. This has resulted in over-optimism about DA research progress\ncompared to reality. In this paper, we analyse the state of DA when using good\nevaluation practice, by benchmarking a suite of candidate validation criteria\nand using them to assess popular adaptation algorithms. We show that there are\nchallenges across all three branches of domain adaptation methodology including\nUnsupervised Domain Adaptation (UDA), Source-Free Domain Adaptation (SFDA), and\nTest Time Adaptation (TTA). While the results show that realistically\nachievable performance is often worse than expected, they also show that using\nproper validation splits is beneficial, as well as showing that some previously\nunexplored validation metrics provide the best options to date. Altogether, our\nimproved practices covering data, training, validation and hyperparameter\noptimisation form a new rigorous pipeline to improve benchmarking, and hence\nresearch progress, within this important field going forward.\n","authors":["Linus Ericsson","Da Li","Timothy M. Hospedales"],"pdf_url":"https://arxiv.org/pdf/2309.03879v1.pdf","comment":"AutoML 2023 (Best paper award)"},{"id":"http://arxiv.org/abs/2309.03874v1","updated":"2023-09-07T17:36:02Z","published":"2023-09-07T17:36:02Z","title":"Box-based Refinement for Weakly Supervised and Unsupervised Localization\n Tasks","summary":" It has been established that training a box-based detector network can\nenhance the localization performance of weakly supervised and unsupervised\nmethods. Moreover, we extend this understanding by demonstrating that these\ndetectors can be utilized to improve the original network, paving the way for\nfurther advancements. To accomplish this, we train the detectors on top of the\nnetwork output instead of the image data and apply suitable loss\nbackpropagation. 
Our findings reveal a significant improvement in phrase\ngrounding for the ``what is where by looking'' task, as well as various methods\nof unsupervised object discovery. Our code is available at\nhttps://github.com/eyalgomel/box-based-refinement.\n","authors":["Eyal Gomel","Tal Shaharabany","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2309.03874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03869v1","updated":"2023-09-07T17:30:36Z","published":"2023-09-07T17:30:36Z","title":"Text-to-feature diffusion for audio-visual few-shot learning","summary":" Training deep learning models for video classification from audio-visual data\ncommonly requires immense amounts of labeled training data collected via a\ncostly process. A challenging and underexplored, yet much cheaper, setup is\nfew-shot learning from video data. In particular, the inherently multi-modal\nnature of video data with sound and visual information has not been leveraged\nextensively for the few-shot video classification task. Therefore, we introduce\na unified audio-visual few-shot video classification benchmark on three\ndatasets, i.e. the VGGSound-FSL, UCF-FSL, ActivityNet-FSL datasets, where we\nadapt and compare ten methods. In addition, we propose AV-DIFF, a\ntext-to-feature diffusion framework, which first fuses the temporal and\naudio-visual features via cross-modal attention and then generates multi-modal\nfeatures for the novel classes. We show that AV-DIFF obtains state-of-the-art\nperformance on our proposed benchmark for audio-visual (generalised) few-shot\nlearning. Our benchmark paves the way for effective audio-visual classification\nwhen only limited labeled data is available. Code and data are available at\nhttps://github.com/ExplainableML/AVDIFF-GFSL.\n","authors":["Otniel-Bogdan Mercea","Thomas Hummel","A. Sophia Koepke","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2309.03869v1.pdf","comment":"DAGM GCPR 2023"},{"id":"http://arxiv.org/abs/2211.15341v3","updated":"2023-09-07T17:18:52Z","published":"2022-11-24T18:47:30Z","title":"Non-inferiority of Deep Learning Acute Ischemic Stroke Segmentation on\n Non-Contrast CT Compared to Expert Neuroradiologists","summary":" To determine if a convolutional neural network (CNN) deep learning model can\naccurately segment acute ischemic changes on non-contrast CT compared to\nneuroradiologists. Non-contrast CT (NCCT) examinations from 232 acute ischemic\nstroke patients who were enrolled in the DEFUSE 3 trial were included in this\nstudy. Three experienced neuroradiologists independently segmented hypodensity\nthat reflected the ischemic core on each scan. The neuroradiologist with the\nmost experience (expert A) served as the ground truth for deep learning model\ntraining. Two additional neuroradiologists (experts B and C) segmentations were\nused for data testing. The 232 studies were randomly split into training and\ntest sets. The training set was further randomly divided into 5 folds with\ntraining and validation sets. A 3-dimensional CNN architecture was trained and\noptimized to predict the segmentations of expert A from NCCT. The performance\nof the model was assessed using a set of volume, overlap, and distance metrics\nusing non-inferiority thresholds of 20%, 3ml, and 3mm. The optimized model\ntrained on expert A was compared to test experts B and C. We used a one-sided\nWilcoxon signed-rank test to test for the non-inferiority of the model-expert\ncompared to the inter-expert agreement. 
The final model performance for the\nischemic core segmentation task reached a performance of 0.46+-0.09 Surface\nDice at Tolerance 5mm and 0.47+-0.13 Dice when trained on expert A. Compared to\nthe two test neuroradiologists the model-expert agreement was non-inferior to\nthe inter-expert agreement, p < 0.05. The CNN accurately delineates the\nhypodense ischemic core on NCCT in acute ischemic stroke patients with an\naccuracy comparable to neuroradiologists.\n","authors":["Sophie Ostmeier","Brian Axelrod","Benjamin F. J. Verhaaren","Soren Christensen","Abdelkader Mahammedi","Yongkai Liu","Benjamin Pulli","Li-Jia Li","Greg Zaharchuk","Jeremy J. Heit"],"pdf_url":"https://arxiv.org/pdf/2211.15341v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03851v1","updated":"2023-09-07T17:07:33Z","published":"2023-09-07T17:07:33Z","title":"CenTime: Event-Conditional Modelling of Censoring in Survival Analysis","summary":" Survival analysis is a valuable tool for estimating the time until specific\nevents, such as death or cancer recurrence, based on baseline observations.\nThis is particularly useful in healthcare to prognostically predict clinically\nimportant events based on patient data. However, existing approaches often have\nlimitations; some focus only on ranking patients by survivability, neglecting\nto estimate the actual event time, while others treat the problem as a\nclassification task, ignoring the inherent time-ordered structure of the\nevents. Furthermore, the effective utilization of censored samples - training\ndata points where the exact event time is unknown - is essential for improving\nthe predictive accuracy of the model. In this paper, we introduce CenTime, a\nnovel approach to survival analysis that directly estimates the time to event.\nOur method features an innovative event-conditional censoring mechanism that\nperforms robustly even when uncensored data is scarce. We demonstrate that our\napproach forms a consistent estimator for the event model parameters, even in\nthe absence of uncensored data. Furthermore, CenTime is easily integrated with\ndeep learning models with no restrictions on batch size or the number of\nuncensored samples. We compare our approach with standard survival analysis\nmethods, including the Cox proportional-hazard model and DeepHit. Our results\nindicate that CenTime offers state-of-the-art performance in predicting\ntime-to-death while maintaining comparable ranking performance. Our\nimplementation is publicly available at\nhttps://github.com/ahmedhshahin/CenTime.\n","authors":["Ahmed H. Shahin","An Zhao","Alexander C. Whitehead","Daniel C. Alexander","Joseph Jacob","David Barber"],"pdf_url":"https://arxiv.org/pdf/2309.03851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03837v1","updated":"2023-09-07T16:50:40Z","published":"2023-09-07T16:50:40Z","title":"Cross-Task Attention Network: Improving Multi-Task Learning for Medical\n Imaging Applications","summary":" Multi-task learning (MTL) is a powerful approach in deep learning that\nleverages the information from multiple tasks during training to improve model\nperformance. In medical imaging, MTL has shown great potential to solve various\ntasks. However, existing MTL architectures in medical imaging are limited in\nsharing information across tasks, reducing the potential performance\nimprovements of MTL. In this study, we introduce a novel attention-based MTL\nframework to better leverage inter-task interactions for various tasks from\npixel-level to image-level predictions. 
Specifically, we propose a Cross-Task\nAttention Network (CTAN) which utilizes cross-task attention mechanisms to\nincorporate information by interacting across tasks. We validated CTAN on four\nmedical imaging datasets that span different domains and tasks including:\nradiation treatment planning prediction using planning CT images of two\ndifferent target cancers (Prostate, OpenKBP); pigmented skin lesion\nsegmentation and diagnosis using dermatoscopic images (HAM10000); and COVID-19\ndiagnosis and severity prediction using chest CT scans (STOIC). Our study\ndemonstrates the effectiveness of CTAN in improving the accuracy of medical\nimaging tasks. Compared to standard single-task learning (STL), CTAN\ndemonstrated a 4.67% improvement in performance and outperformed both widely\nused MTL baselines: hard parameter sharing (HPS) with an average performance\nimprovement of 3.22%; and multi-task attention network (MTAN) with a relative\ndecrease of 5.38%. These findings highlight the significance of our proposed\nMTL framework in solving medical imaging tasks and its potential to improve\ntheir accuracy across domains.\n","authors":["Sangwook Kim","Thomas G. Purdie","Chris McIntosh"],"pdf_url":"https://arxiv.org/pdf/2309.03837v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.03827v1","updated":"2023-09-07T16:40:49Z","published":"2023-09-07T16:40:49Z","title":"ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation","summary":" High Dynamic Range (HDR) content creation has become an important topic for\nmodern media and entertainment sectors, gaming and Augmented/Virtual Reality\nindustries. Many methods have been proposed to recreate the HDR counterparts of\ninput Low Dynamic Range (LDR) images/videos given a single exposure or\nmulti-exposure LDRs. The state-of-the-art methods focus primarily on the\npreservation of the reconstruction's structural similarity and the pixel-wise\naccuracy. However, these conventional approaches do not emphasize preserving\nthe artistic intent of the images in terms of human visual perception, which is\nan essential element in media, entertainment and gaming. In this paper, we\nattempt to study and fill this gap. We propose an architecture called\nArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR\nfeatures as input. Experimental results show that ArtHDR-Net can achieve\nstate-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean\nopinion score index) while reaching competitive performance in terms of PSNR\nand SSIM.\n","authors":["Hrishav Bakul Barua","Ganesh Krishnasamy","KokSheik Wong","Kalin Stefanov","Abhinav Dhall"],"pdf_url":"https://arxiv.org/pdf/2309.03827v1.pdf","comment":"Accepted in Asia Pacific Signal and Information Processing\n Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan"},{"id":"http://arxiv.org/abs/2209.13008v4","updated":"2023-09-07T16:34:17Z","published":"2022-09-26T20:40:02Z","title":"USE-Evaluator: Performance Metrics for Medical Image Segmentation Models\n with Uncertain, Small or Empty Reference Annotations","summary":" Performance metrics for medical image segmentation models are used to measure\nthe agreement between the reference annotation and the predicted segmentation.\nUsually, overlap metrics, such as the Dice, are used as a metric to evaluate\nthe performance of these models in order for results to be comparable. 
However,\nthere is a mismatch between the distributions of cases and difficulty level of\nsegmentation tasks in public data sets compared to clinical practice. Common\nmetrics fail to measure the impact of this mismatch, especially for clinical\ndata sets that include low signal pathologies, a difficult segmentation task,\nand uncertain, small, or empty reference annotations. This limitation may\nresult in ineffective research of machine learning practitioners in designing\nand optimizing models. Dimensions of evaluating clinical value include\nconsideration of the uncertainty of reference annotations, independence from\nreference annotation volume size, and evaluation of classification of empty\nreference annotations. We study how uncertain, small, and empty reference\nannotations influence the value of metrics for medical image segmentation on an\nin-house data set regardless of the model. We examine metrics behavior on the\npredictions of a standard deep learning framework in order to identify metrics\nwith clinical value. We compare to a public benchmark data set (BraTS 2019)\nwith a high-signal pathology and certain, larger, and no empty reference\nannotations. We may show machine learning practitioners, how uncertain, small,\nor empty reference annotations require a rethinking of the evaluation and\noptimizing procedures. The evaluation code was released to encourage further\nanalysis of this topic.\nhttps://github.com/SophieOstmeier/UncertainSmallEmpty.git\n","authors":["Sophie Ostmeier","Brian Axelrod","Jeroen Bertels","Fabian Isensee","Maarten G. Lansberg","Soren Christensen","Gregory W. Albers","Li-Jia Li","Jeremy J. Heit"],"pdf_url":"https://arxiv.org/pdf/2209.13008v4.pdf","comment":"16 pages, 10 figures, Published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2304.10226v5","updated":"2023-09-07T16:16:10Z","published":"2023-04-20T11:40:21Z","title":"Domain Generalization for Mammographic Image Analysis with Contrastive\n Learning","summary":" The deep learning technique has been shown to be effectively addressed\nseveral image analysis tasks in the computer-aided diagnosis scheme for\nmammography. The training of an efficacious deep learning model requires large\ndata with diverse styles and qualities. The diversity of data often comes from\nthe use of various scanners of vendors. But, in practice, it is impractical to\ncollect a sufficient amount of diverse data for training. To this end, a novel\ncontrastive learning is developed to equip the deep learning models with better\nstyle generalization capability. Specifically, the multi-style and multi-view\nunsupervised self-learning scheme is carried out to seek robust feature\nembedding against style diversity as a pretrained model. Afterward, the\npretrained network is further fine-tuned to the downstream tasks, e.g., mass\ndetection, matching, BI-RADS rating, and breast density classification. The\nproposed method has been evaluated extensively and rigorously with mammograms\nfrom various vendor style domains and several public datasets. 
The experimental\nresults suggest that the proposed domain generalization method can effectively\nimprove performance of four mammographic image tasks on the data from both seen\nand unseen domains, and outperform many state-of-the-art (SOTA) generalization\nmethods.\n","authors":["Zheren Li","Zhiming Cui","Lichi Zhang","Sheng Wang","Chenjin Lei","Xi Ouyang","Dongdong Chen","Xiangyu Zhao","Yajia Gu","Zaiyi Liu","Chunling Liu","Dinggang Shen","Jie-Zhi Cheng"],"pdf_url":"https://arxiv.org/pdf/2304.10226v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2111.10827"},{"id":"http://arxiv.org/abs/2309.03815v1","updated":"2023-09-07T16:12:06Z","published":"2023-09-07T16:12:06Z","title":"T2IW: Joint Text to Image & Watermark Generation","summary":" Recent developments in text-conditioned image generative models have\nrevolutionized the production of realistic results. Unfortunately, this has\nalso led to an increase in privacy violations and the spread of false\ninformation, which requires the need for traceability, privacy protection, and\nother security measures. However, existing text-to-image paradigms lack the\ntechnical capabilities to link traceable messages with image generation. In\nthis study, we introduce a novel task for the joint generation of text to image\nand watermark (T2IW). This T2IW scheme ensures minimal damage to image quality\nwhen generating a compound image by forcing the semantic feature and the\nwatermark signal to be compatible in pixels. Additionally, by utilizing\nprinciples from Shannon information theory and non-cooperative game theory, we\nare able to separate the revealed image and the revealed watermark from the\ncompound image. Furthermore, we strengthen the watermark robustness of our\napproach by subjecting the compound image to various post-processing attacks,\nwith minimal pixel distortion observed in the revealed watermark. Extensive\nexperiments have demonstrated remarkable achievements in image quality,\nwatermark invisibility, and watermark robustness, supported by our proposed set\nof evaluation metrics.\n","authors":["An-An Liu","Guokai Zhang","Yuting Su","Ning Xu","Yongdong Zhang","Lanjun Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03812v1","updated":"2023-09-07T16:09:06Z","published":"2023-09-07T16:09:06Z","title":"AnthroNet: Conditional Generation of Humans via Anthropometrics","summary":" We present a novel human body model formulated by an extensive set of\nanthropocentric measurements, which is capable of generating a wide range of\nhuman body shapes and poses. The proposed model enables direct modeling of\nspecific human identities through a deep generative architecture, which can\nproduce humans in any arbitrary pose. It is the first of its kind to have been\ntrained end-to-end using only synthetically generated data, which not only\nprovides highly accurate human mesh representations but also allows for precise\nanthropometry of the body. Moreover, using a highly diverse animation library,\nwe articulated our synthetic humans' body and hands to maximize the diversity\nof the learnable priors for model training. Our model was trained on a dataset\nof $100k$ procedurally-generated posed human meshes and their corresponding\nanthropometric measurements. 
Our synthetic data generator can be used to\ngenerate millions of unique human identities and poses for non-commercial\nacademic research purposes.\n","authors":["Francesco Picetti","Shrinath Deshpande","Jonathan Leban","Soroosh Shahtalebi","Jay Patel","Peifeng Jing","Chunpu Wang","Charles Metze III","Cameron Sun","Cera Laidlaw","James Warren","Kathy Huynh","River Page","Jonathan Hogins","Adam Crespi","Sujoy Ganguly","Salehe Erfanian Ebadi"],"pdf_url":"https://arxiv.org/pdf/2309.03812v1.pdf","comment":"AnthroNet's Unity data generator source code is available at:\n https://unity-technologies.github.io/AnthroNet/"},{"id":"http://arxiv.org/abs/2309.03811v1","updated":"2023-09-07T16:07:31Z","published":"2023-09-07T16:07:31Z","title":"Panoramas from Photons","summary":" Scene reconstruction in the presence of high-speed motion and low\nillumination is important in many applications such as augmented and virtual\nreality, drone navigation, and autonomous robotics. Traditional motion\nestimation techniques fail in such conditions, suffering from too much blur in\nthe presence of high-speed motion and strong noise in low-light conditions.\nSingle-photon cameras have recently emerged as a promising technology capable\nof capturing hundreds of thousands of photon frames per second thanks to their\nhigh speed and extreme sensitivity. Unfortunately, traditional computer vision\ntechniques are not well suited for dealing with the binary-valued photon data\ncaptured by these cameras because these are corrupted by extreme Poisson noise.\nHere we present a method capable of estimating extreme scene motion under\nchallenging conditions, such as low light or high dynamic range, from a\nsequence of high-speed image frames such as those captured by a single-photon\ncamera. Our method relies on iteratively improving a motion estimate by\ngrouping and aggregating frames after-the-fact, in a stratified manner. We\ndemonstrate the creation of high-quality panoramas under fast motion and\nextremely low light, and super-resolution results using a custom single-photon\ncamera prototype. For code and supplemental material see our\n$\\href{https://wisionlab.com/project/panoramas-from-photons/}{\\text{project\nwebpage}}$.\n","authors":["Sacha Jungerman","Atul Ingle","Mohit Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.03811v1.pdf","comment":"Proc. ICCV 2023"},{"id":"http://arxiv.org/abs/2309.03809v1","updated":"2023-09-07T16:02:40Z","published":"2023-09-07T16:02:40Z","title":"SimNP: Learning Self-Similarity Priors Between Neural Points","summary":" Existing neural field representations for 3D object reconstruction either (1)\nutilize object-level representations, but suffer from low-quality details due\nto conditioning on a global latent code, or (2) are able to perfectly\nreconstruct the observations, but fail to utilize object-level prior knowledge\nto infer unobserved regions. We present SimNP, a method to learn category-level\nself-similarities, which combines the advantages of both worlds by connecting\nneural point radiance fields with a category-level self-similarity\nrepresentation. Our contribution is two-fold. (1) We design the first neural\npoint representation on a category level by utilizing the concept of coherent\npoint clouds. The resulting neural point radiance fields store a high level of\ndetail for locally supported object regions. 
(2) We learn how information is\nshared between neural points in an unconstrained and unsupervised fashion,\nwhich allows to derive unobserved regions of an object during the\nreconstruction process from given observations. We show that SimNP is able to\noutperform previous methods in reconstructing symmetric unseen object regions,\nsurpassing methods that build upon category-level or pixel-aligned radiance\nfields, while providing semantic correspondences between instances\n","authors":["Christopher Wewer","Eddy Ilg","Bernt Schiele","Jan Eric Lenssen"],"pdf_url":"https://arxiv.org/pdf/2309.03809v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2212.01448v2","updated":"2023-09-07T16:01:15Z","published":"2022-12-02T21:16:39Z","title":"PGFed: Personalize Each Client's Global Objective for Federated Learning","summary":" Personalized federated learning has received an upsurge of attention due to\nthe mediocre performance of conventional federated learning (FL) over\nheterogeneous data. Unlike conventional FL which trains a single global\nconsensus model, personalized FL allows different models for different clients.\nHowever, existing personalized FL algorithms only implicitly transfer the\ncollaborative knowledge across the federation by embedding the knowledge into\nthe aggregated model or regularization. We observed that this implicit\nknowledge transfer fails to maximize the potential of each client's empirical\nrisk toward other clients. Based on our observation, in this work, we propose\nPersonalized Global Federated Learning (PGFed), a novel personalized FL\nframework that enables each client to personalize its own global objective by\nexplicitly and adaptively aggregating the empirical risks of itself and other\nclients. To avoid massive (O(N^2)) communication overhead and potential privacy\nleakage while achieving this, each client's risk is estimated through a\nfirst-order approximation for other clients' adaptive risk aggregation. On top\nof PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently\nutilize clients' empirical risks. Our extensive experiments on four datasets\nunder different federated settings show consistent improvements of PGFed over\nprevious state-of-the-art methods. The code is publicly available at\nhttps://github.com/ljaiverson/pgfed.\n","authors":["Jun Luo","Matias Mendieta","Chen Chen","Shandong Wu"],"pdf_url":"https://arxiv.org/pdf/2212.01448v2.pdf","comment":"ICCV 2023 oral"},{"id":"http://arxiv.org/abs/2309.03799v1","updated":"2023-09-07T15:51:31Z","published":"2023-09-07T15:51:31Z","title":"FisheyePP4AV: A privacy-preserving method for autonomous vehicles on\n fisheye camera images","summary":" In many parts of the world, the use of vast amounts of data collected on\npublic roadways for autonomous driving has increased. In order to detect and\nanonymize pedestrian faces and nearby car license plates in actual road-driving\nscenarios, there is an urgent need for effective solutions. As more data is\ncollected, privacy concerns regarding it increase, including but not limited to\npedestrian faces and surrounding vehicle license plates. Normal and fisheye\ncameras are the two common camera types that are typically mounted on\ncollection vehicles. With complex camera distortion models, fisheye camera\nimages were deformed in contrast to regular images. It causes computer vision\ntasks to perform poorly when using numerous deep learning models. 
In this work,\nwe pay particular attention to protecting privacy while yet adhering to several\nlaws for fisheye camera photos taken by driverless vehicles. First, we suggest\na framework for extracting face and plate identification knowledge from several\nteacher models. Our second suggestion is to transform both the image and the\nlabel from a regular image to fisheye-like data using a varied and realistic\nfisheye transformation. Finally, we run a test using the open-source PP4AV\ndataset. The experimental findings demonstrated that our model outperformed\nbaseline methods when trained on data from autonomous vehicles, even when the\ndata were softly labeled. The implementation code is available at our github:\nhttps://github.com/khaclinh/FisheyePP4AV.\n","authors":["Linh Trinh","Bach Ha","Tu Tran"],"pdf_url":"https://arxiv.org/pdf/2309.03799v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.07590v2","updated":"2023-09-07T15:32:50Z","published":"2022-11-14T18:16:36Z","title":"Stain-invariant self supervised learning for histopathology image\n analysis","summary":" We present a self-supervised algorithm for several classification tasks\nwithin hematoxylin and eosin (H&E) stained images of breast cancer. Our method\nis robust to stain variations inherent to the histology images acquisition\nprocess, which has limited the applicability of automated analysis tools. We\naddress this problem by imposing constraints a learnt latent space which\nleverages stain normalization techniques during training. At every iteration,\nwe select an image as a normalization target and generate a version of every\nimage in the batch normalized to that target. We minimize the distance between\nthe embeddings that correspond to the same image under different staining\nvariations while maximizing the distance between other samples. We show that\nour method not only improves robustness to stain variations across multi-center\ndata, but also classification performance through extensive experiments on\nvarious normalization targets and methods. Our method achieves the\nstate-of-the-art performance on several publicly available breast cancer\ndatasets ranging from tumor classification (CAMELYON17) and subtyping (BRACS)\nto HER2 status classification and treatment response prediction.\n","authors":["Alexandre Tiard","Alex Wong","David Joon Ho","Yangchao Wu","Eliram Nof","Alvin C. Goh","Stefano Soatto","Saad Nadeem"],"pdf_url":"https://arxiv.org/pdf/2211.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03774v1","updated":"2023-09-07T15:25:47Z","published":"2023-09-07T15:25:47Z","title":"Deep Learning Safety Concerns in Automated Driving Perception","summary":" Recent advances in the field of deep learning and impressive performance of\ndeep neural networks (DNNs) for perception have resulted in an increased demand\nfor their use in automated driving (AD) systems. The safety of such systems is\nof utmost importance and thus requires to consider the unique properties of\nDNNs.\n In order to achieve safety of AD systems with DNN-based perception components\nin a systematic and comprehensive approach, so-called safety concerns have been\nintroduced as a suitable structuring element. On the one hand, the concept of\nsafety concerns is -- by design -- well aligned to existing standards relevant\nfor safety of AD systems such as ISO 21448 (SOTIF). 
On the other hand, it has\nalready inspired several academic publications and upcoming standards on AI\nsafety such as ISO PAS 8800.\n While the concept of safety concerns has been previously introduced, this\npaper extends and refines it, leveraging feedback from various domain and\nsafety experts in the field. In particular, this paper introduces an additional\ncategorization for a better understanding as well as enabling cross-functional\nteams to jointly address the concerns.\n","authors":["Stephanie Abrecht","Alexander Hirsch","Shervin Raafatnia","Matthias Woehrle"],"pdf_url":"https://arxiv.org/pdf/2309.03774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03764v1","updated":"2023-09-07T15:08:12Z","published":"2023-09-07T15:08:12Z","title":"$L_{2,1}$-Norm Regularized Quaternion Matrix Completion Using Sparse\n Representation and Quaternion QR Decomposition","summary":" Color image completion is a challenging problem in computer vision, but\nrecent research has shown that quaternion representations of color images\nperform well in many areas. These representations consider the entire color\nimage and effectively utilize coupling information between the three color\nchannels. Consequently, low-rank quaternion matrix completion (LRQMC)\nalgorithms have gained significant attention. We propose a method based on\nquaternion Qatar Riyal decomposition (QQR) and quaternion $L_{2,1}$-norm called\nQLNM-QQR. This new approach reduces computational complexity by avoiding the\nneed to calculate the QSVD of large quaternion matrices. We also present two\nimprovements to the QLNM-QQR method: an enhanced version called IRQLNM-QQR that\nuses iteratively reweighted quaternion $L_{2,1}$-norm minimization and a method\ncalled QLNM-QQR-SR that integrates sparse regularization. Our experiments on\nnatural color images and color medical images show that IRQLNM-QQR outperforms\nQLNM-QQR and that the proposed QLNM-QQR-SR method is superior to several\nstate-of-the-art methods.\n","authors":["Juan Han","Kit Ian Kou","Jifei Miao","Lizhi Liu","Haojiang Li"],"pdf_url":"https://arxiv.org/pdf/2309.03764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03763v1","updated":"2023-09-07T15:05:35Z","published":"2023-09-07T15:05:35Z","title":"dacl1k: Real-World Bridge Damage Dataset Putting Open-Source Data to the\n Test","summary":" Recognising reinforced concrete defects (RCDs) is a crucial element for\ndetermining the structural integrity, traffic safety and durability of bridges.\nHowever, most of the existing datasets in the RCD domain are derived from a\nsmall number of bridges acquired in specific camera poses, lighting conditions\nand with fixed hardware. These limitations question the usability of models\ntrained on such open-source data in real-world scenarios. We address this\nproblem by testing such models on our \"dacl1k\" dataset, a highly diverse RCD\ndataset for multi-label classification based on building inspections including\n1,474 images. Thereby, we trained the models on different combinations of\nopen-source data (meta datasets) which were subsequently evaluated both\nextrinsically and intrinsically. During extrinsic evaluation, we report metrics\non dacl1k and the meta datasets. The performance analysis on dacl1k shows\npractical usability of the meta data, where the best model shows an Exact Match\nRatio of 32%. 
Additionally, we conduct an intrinsic evaluation by clustering\nthe bottleneck features of the best model derived from the extrinsic evaluation\nin order to find out, if the model has learned distinguishing datasets or the\nclasses (RCDs) which is the aspired goal. The dacl1k dataset and our trained\nmodels will be made publicly available, enabling researchers and practitioners\nto put their models to the real-world test.\n","authors":["Johannes Flotzinger","Philipp J. Rösch","Norbert Oswald","Thomas Braml"],"pdf_url":"https://arxiv.org/pdf/2309.03763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03759v1","updated":"2023-09-07T15:00:58Z","published":"2023-09-07T15:00:58Z","title":"M(otion)-mode Based Prediction of Ejection Fraction using\n Echocardiograms","summary":" Early detection of cardiac dysfunction through routine screening is vital for\ndiagnosing cardiovascular diseases. An important metric of cardiac function is\nthe left ventricular ejection fraction (EF), where lower EF is associated with\ncardiomyopathy. Echocardiography is a popular diagnostic tool in cardiology,\nwith ultrasound being a low-cost, real-time, and non-ionizing technology.\nHowever, human assessment of echocardiograms for calculating EF is\ntime-consuming and expertise-demanding, raising the need for an automated\napproach. In this work, we propose using the M(otion)-mode of echocardiograms\nfor estimating the EF and classifying cardiomyopathy. We generate multiple\nartificial M-mode images from a single echocardiogram and combine them using\noff-the-shelf model architectures. Additionally, we extend contrastive learning\n(CL) to cardiac imaging to learn meaningful representations from exploiting\nstructures in unlabeled data allowing the model to achieve high accuracy, even\nwith limited annotations. Our experiments show that the supervised setting\nconverges with only ten modes and is comparable to the baseline method while\nbypassing its cumbersome training process and being computationally much more\nefficient. Furthermore, CL using M-mode images is helpful for limited data\nscenarios, such as having labels for only 200 patients, which is common in\nmedical applications.\n","authors":["Ece Ozkan","Thomas M. Sutter","Yurong Hu","Sebastian Balzer","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2309.03759v1.pdf","comment":"Accepted at GCPR 2023"},{"id":"http://arxiv.org/abs/2309.03750v1","updated":"2023-09-07T14:45:41Z","published":"2023-09-07T14:45:41Z","title":"PBP: Path-based Trajectory Prediction for Autonomous Driving","summary":" Trajectory prediction plays a crucial role in the autonomous driving stack by\nenabling autonomous vehicles to anticipate the motion of surrounding agents.\nGoal-based prediction models have gained traction in recent years for\naddressing the multimodal nature of future trajectories. Goal-based prediction\nmodels simplify multimodal prediction by first predicting 2D goal locations of\nagents and then predicting trajectories conditioned on each goal. However, a\nsingle 2D goal location serves as a weak inductive bias for predicting the\nwhole trajectory, often leading to poor map compliance, i.e., part of the\ntrajectory going off-road or breaking traffic rules. In this paper, we improve\nupon goal-based prediction by proposing the Path-based prediction (PBP)\napproach. PBP predicts a discrete probability distribution over reference paths\nin the HD map using the path features and predicts trajectories in the\npath-relative Frenet frame. 
We applied the PBP trajectory decoder on top of the\nHiVT scene encoder and report results on the Argoverse dataset. Our experiments\nshow that PBP achieves competitive performance on the standard trajectory\nprediction metrics, while significantly outperforming state-of-the-art\nbaselines in terms of map compliance.\n","authors":["Sepideh Afshar","Nachiket Deo","Akshay Bhagat","Titas Chakraborty","Yunming Shao","Balarama Raju Buddharaju","Adwait Deshpande","Henggang Cui"],"pdf_url":"https://arxiv.org/pdf/2309.03750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16215v3","updated":"2023-09-07T14:41:22Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,\ndespite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While, both standard video\ncodes and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v3.pdf","comment":"22 pages, 26 figures, 6 tables"},{"id":"http://arxiv.org/abs/2307.12900v2","updated":"2023-09-07T14:40:25Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Spiking\n Neurons","summary":" Event-based sensors, distinguished by their high temporal resolution of 1\n{\\mu}s and a dynamic range of 120 dB, stand out as ideal tools for deployment\nin fast-paced settings like vehicles and drones. Traditional object detection\ntechniques that utilize Artificial Neural Networks (ANNs) face challenges due\nto the sparse and asynchronous nature of the events these sensors capture. In\ncontrast, Spiking Neural Networks (SNNs) offer a promising alternative,\nproviding a temporal representation that is inherently aligned with event-based\ndata. This paper explores the unique membrane potential dynamics of SNNs and\ntheir ability to modulate sparse events. We introduce an innovative\nspike-triggered adaptive threshold mechanism designed for stable training.\nBuilding on these insights, we present a specialized spiking feature pyramid\nnetwork (SpikeFPN) optimized for automotive event based object detection.\nComprehensive evaluations demonstrate that SpikeFPN surpasses both traditional\nSNNs and advanced ANNs enhanced with attention mechanisms. 
Evidently, SpikeFPN\nachieves a mean Average Precision (mAP) of 0.477 on the GEN1 Automotive\nDetection (GAD) benchmark dataset, marking a significant increase of 9.7% over\nthe previous best SNN. Moreover, the efficient design of SpikeFPN ensures\nrobust performance while optimizing computational resources, attributed to its\ninnate sparse computation capabilities.\n","authors":["Hu Zhang","Yanchen Li","Luziwei Leng","Kaiwei Che","Qian Liu","Qinghai Guo","Jianxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03744v1","updated":"2023-09-07T14:37:50Z","published":"2023-09-07T14:37:50Z","title":"Label-efficient Contrastive Learning-based model for nuclei detection\n and classification in 3D Cardiovascular Immunofluorescent Images","summary":" Recently, deep learning-based methods achieved promising performance in\nnuclei detection and classification applications. However, training deep\nlearning-based methods requires a large amount of pixel-wise annotated data,\nwhich is time-consuming and labor-intensive, especially in 3D images. An\nalternative approach is to adapt weak-annotation methods, such as labeling each\nnucleus with a point, but this method does not extend from 2D histopathology\nimages (for which it was originally developed) to 3D immunofluorescent images.\nThe reason is that 3D images contain multiple channels (z-axis) for nuclei and\ndifferent markers separately, which makes training using point annotations\ndifficult. To address this challenge, we propose the Label-efficient\nContrastive learning-based (LECL) model to detect and classify various types of\nnuclei in 3D immunofluorescent images. Previous methods use Maximum Intensity\nProjection (MIP) to convert immunofluorescent images with multiple slices to 2D\nimages, which can cause signals from different z-stacks to falsely appear\nassociated with each other. To overcome this, we devised an Extended Maximum\nIntensity Projection (EMIP) approach that addresses issues using MIP.\nFurthermore, we performed a Supervised Contrastive Learning (SCL) approach for\nweakly supervised settings. We conducted experiments on cardiovascular datasets\nand found that our proposed framework is effective and efficient in detecting\nand classifying various types of nuclei in 3D immunofluorescent images.\n","authors":["Nazanin Moradinasab","Rebecca A. Deaton","Laura S. Shankman","Gary K. Owens","Donald E. Brown"],"pdf_url":"https://arxiv.org/pdf/2309.03744v1.pdf","comment":"11 pages, 5 figures, MICCAI Workshop Conference 2023"},{"id":"http://arxiv.org/abs/2309.03734v1","updated":"2023-09-07T14:23:47Z","published":"2023-09-07T14:23:47Z","title":"ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D\n Object Detection in Autonomous Vehicles","summary":" Thanks to the complementary nature of millimeter wave radar and camera, deep\nlearning-based radar-camera 3D object detection methods may reliably produce\naccurate detections even in low-visibility conditions. This makes them\npreferable to use in autonomous vehicles' perception systems, especially as the\ncombined cost of both sensors is cheaper than the cost of a lidar. Recent\nradar-camera methods commonly perform feature-level fusion which often involves\nprojecting the radar points onto the same plane as the image features and\nfusing the extracted features from both modalities. 
While performing fusion on\nthe image plane is generally simpler and faster, projecting radar points onto\nthe image plane flattens the depth dimension of the point cloud which might\nlead to information loss and makes extracting the spatial features of the point\ncloud harder. We proposed ClusterFusion, an architecture that leverages the\nlocal spatial features of the radar point cloud by clustering the point cloud\nand performing feature extraction directly on the point cloud clusters before\nprojecting the features onto the image plane. ClusterFusion achieved the\nstate-of-the-art performance among all radar-monocular camera methods on the\ntest slice of the nuScenes dataset with 48.7% nuScenes detection score (NDS).\nWe also investigated the performance of different radar feature extraction\nstrategies on point cloud clusters: a handcrafted strategy, a learning-based\nstrategy, and a combination of both, and found that the handcrafted strategy\nyielded the best performance. The main goal of this work is to explore the use\nof radar's local spatial and point-wise features by extracting them directly\nfrom radar point cloud clusters for a radar-monocular camera 3D object\ndetection method that performs cross-modal feature fusion on the image plane.\n","authors":["Irfan Tito Kurniawan","Bambang Riyanto Trilaksono"],"pdf_url":"https://arxiv.org/pdf/2309.03734v1.pdf","comment":"Submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2303.13606v2","updated":"2023-09-07T14:21:10Z","published":"2023-03-23T18:40:17Z","title":"Adaptive Similarity Bootstrapping for Self-Distillation based\n Representation Learning","summary":" Most self-supervised methods for representation learning leverage a\ncross-view consistency objective i.e., they maximize the representation\nsimilarity of a given image's augmented views. Recent work NNCLR goes beyond\nthe cross-view paradigm and uses positive pairs from different images obtained\nvia nearest neighbor bootstrapping in a contrastive setting. We empirically\nshow that as opposed to the contrastive learning setting which relies on\nnegative samples, incorporating nearest neighbor bootstrapping in a\nself-distillation scheme can lead to a performance drop or even collapse. We\nscrutinize the reason for this unexpected behavior and provide a solution. We\npropose to adaptively bootstrap neighbors based on the estimated quality of the\nlatent space. We report consistent improvements compared to the naive\nbootstrapping approach and the original baselines. Our approach leads to\nperformance improvements for various self-distillation method/backbone\ncombinations and standard downstream tasks. Our code is publicly available at\nhttps://github.com/tileb1/AdaSim.\n","authors":["Tim Lebailly","Thomas Stegmüller","Behzad Bozorgtabar","Jean-Philippe Thiran","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2303.13606v2.pdf","comment":"ICCV 2023. * denotes equal contribution"},{"id":"http://arxiv.org/abs/2309.03729v1","updated":"2023-09-07T14:14:11Z","published":"2023-09-07T14:14:11Z","title":"Phasic Content Fusing Diffusion Model with Directional Distribution\n Consistency for Few-Shot Model Adaption","summary":" Training a generative model with limited number of samples is a challenging\ntask. Current methods primarily rely on few-shot model adaption to train the\nnetwork. 
However, in scenarios where data is extremely limited (less than 10),\nthe generative network tends to overfit and suffers from content degradation.\nTo address these problems, we propose a novel phasic content fusing few-shot\ndiffusion model with directional distribution consistency loss, which targets\ndifferent learning objectives at distinct training stages of the diffusion\nmodel. Specifically, we design a phasic training strategy with phasic content\nfusion to help our model learn content and style information when t is large,\nand learn local details of target domain when t is small, leading to an\nimprovement in the capture of content, style and local details. Furthermore, we\nintroduce a novel directional distribution consistency loss that ensures the\nconsistency between the generated and source distributions more efficiently and\nstably than the prior methods, preventing our model from overfitting. Finally,\nwe propose a cross-domain structure guidance strategy that enhances structure\nconsistency during domain adaptation. Theoretical analysis, qualitative and\nquantitative experiments demonstrate the superiority of our approach in\nfew-shot generative model adaption tasks compared to state-of-the-art methods.\nThe source code is available at:\nhttps://github.com/sjtuplayer/few-shot-diffusion.\n","authors":["Teng Hu","Jiangning Zhang","Liang Liu","Ran Yi","Siqi Kou","Haokun Zhu","Xu Chen","Yabiao Wang","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2309.03729v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.03726v1","updated":"2023-09-07T14:12:31Z","published":"2023-09-07T14:12:31Z","title":"Interpretable Visual Question Answering via Reasoning Supervision","summary":" Transformer-based architectures have recently demonstrated remarkable\nperformance in the Visual Question Answering (VQA) task. However, such models\nare likely to disregard crucial visual cues and often rely on multimodal\nshortcuts and inherent biases of the language modality to predict the correct\nanswer, a phenomenon commonly referred to as lack of visual grounding. In this\nwork, we alleviate this shortcoming through a novel architecture for visual\nquestion answering that leverages common sense reasoning as a supervisory\nsignal. Reasoning supervision takes the form of a textual justification of the\ncorrect answer, with such annotations being already available on large-scale\nVisual Common Sense Reasoning (VCR) datasets. The model's visual attention is\nguided toward important elements of the scene through a similarity loss that\naligns the learned attention distributions guided by the question and the\ncorrect reasoning. We demonstrate both quantitatively and qualitatively that\nthe proposed approach can boost the model's visual perception capability and\nlead to performance increase, without requiring training on explicit grounding\nannotations.\n","authors":["Maria Parelli","Dimitrios Mallis","Markos Diomataris","Vassilis Pitsikalis"],"pdf_url":"https://arxiv.org/pdf/2309.03726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03722v1","updated":"2023-09-07T13:58:31Z","published":"2023-09-07T13:58:31Z","title":"A boundary-aware point clustering approach in Euclidean and embedding\n spaces for roof plane segmentation","summary":" Roof plane segmentation from airborne LiDAR point clouds is an important\ntechnology for 3D building model reconstruction. 
One of the key issues of plane\nsegmentation is how to design powerful features that can exactly distinguish\nadjacent planar patches. The quality of point feature directly determines the\naccuracy of roof plane segmentation. Most of existing approaches use\nhandcrafted features to extract roof planes. However, the abilities of these\nfeatures are relatively low, especially in boundary area. To solve this\nproblem, we propose a boundary-aware point clustering approach in Euclidean and\nembedding spaces constructed by a multi-task deep network for roof plane\nsegmentation. We design a three-branch network to predict semantic labels,\npoint offsets and extract deep embedding features. In the first branch, we\nclassify the input data as non-roof, boundary and plane points. In the second\nbranch, we predict point offsets for shifting each point toward its respective\ninstance center. In the third branch, we constrain that points of the same\nplane instance should have the similar embeddings. We aim to ensure that points\nof the same plane instance are close as much as possible in both Euclidean and\nembedding spaces. However, although deep network has strong feature\nrepresentative ability, it is still hard to accurately distinguish points near\nplane instance boundary. Therefore, we first group plane points into many\nclusters in the two spaces, and then we assign the rest boundary points to\ntheir closest clusters to generate final complete roof planes. In this way, we\ncan effectively reduce the influence of unreliable boundary points. In\naddition, we construct a synthetic dataset and a real dataset to train and\nevaluate our approach. The experiments results show that the proposed approach\nsignificantly outperforms the existing state-of-the-art approaches.\n","authors":["Li Li","Qingqing Li","Guozheng Xu","Pengwei Zhou","Jingmin Tu","Jie Li","Jian Yao"],"pdf_url":"https://arxiv.org/pdf/2309.03722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01728v2","updated":"2023-09-07T13:40:14Z","published":"2023-09-04T17:22:10Z","title":"Generative-based Fusion Mechanism for Multi-Modal Tracking","summary":" Generative models (GMs) have received increasing research interest for their\nremarkable capacity to achieve comprehensive understanding. However, their\npotential application in the domain of multi-modal tracking has remained\nrelatively unexplored. In this context, we seek to uncover the potential of\nharnessing generative techniques to address the critical challenge, information\nfusion, in multi-modal tracking. In this paper, we delve into two prominent GM\ntechniques, namely, Conditional Generative Adversarial Networks (CGANs) and\nDiffusion Models (DMs). Different from the standard fusion process where the\nfeatures from each modality are directly fed into the fusion block, we\ncondition these multi-modal features with random noise in the GM framework,\neffectively transforming the original training samples into harder instances.\nThis design excels at extracting discriminative clues from the features,\nenhancing the ultimate tracking performance. To quantitatively gauge the\neffectiveness of our approach, we conduct extensive experiments across two\nmulti-modal tracking tasks, three baseline methods, and three challenging\nbenchmarks. 
The experimental results demonstrate that the proposed\ngenerative-based fusion mechanism achieves state-of-the-art performance,\nsetting new records on LasHeR and RGBD1K.\n","authors":["Zhangyong Tang","Tianyang Xu","Xuefeng Zhu","Xiao-Jun Wu","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2309.01728v2.pdf","comment":"10 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.03702v1","updated":"2023-09-07T13:28:36Z","published":"2023-09-07T13:28:36Z","title":"DiffDefense: Defending against Adversarial Attacks via Diffusion Models","summary":" This paper presents a novel reconstruction method that leverages Diffusion\nModels to protect machine learning classifiers against adversarial attacks, all\nwithout requiring any modifications to the classifiers themselves. The\nsusceptibility of machine learning models to minor input perturbations renders\nthem vulnerable to adversarial attacks. While diffusion-based methods are\ntypically disregarded for adversarial defense due to their slow reverse\nprocess, this paper demonstrates that our proposed method offers robustness\nagainst adversarial threats while preserving clean accuracy, speed, and\nplug-and-play compatibility. Code at:\nhttps://github.com/HondamunigePrasannaSilva/DiffDefence.\n","authors":["Hondamunige Prasanna Silva","Lorenzo Seidenari","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2309.03702v1.pdf","comment":"Paper published at ICIAP23"},{"id":"http://arxiv.org/abs/2302.08272v2","updated":"2023-09-07T13:21:40Z","published":"2023-02-16T13:04:59Z","title":"Revisiting Hidden Representations in Transfer Learning for Medical\n Imaging","summary":" While a key component to the success of deep learning is the availability of\nmassive amounts of training data, medical image datasets are often limited in\ndiversity and size. Transfer learning has the potential to bridge the gap\nbetween related yet different domains. For medical applications, however, it\nremains unclear whether it is more beneficial to pre-train on natural or\nmedical images. We aim to shed light on this problem by comparing\ninitialization on ImageNet and RadImageNet on seven medical classification\ntasks. Our work includes a replication study, which yields results contrary to\npreviously published findings. In our experiments, ResNet50 models pre-trained\non ImageNet tend to outperform those trained on RadImageNet. To gain further\ninsights, we investigate the learned representations using Canonical\nCorrelation Analysis (CCA) and compare the predictions of the different models.\nOur results indicate that, contrary to intuition, ImageNet and RadImageNet may\nconverge to distinct intermediate representations, which appear to diverge\nfurther during fine-tuning. Despite these distinct representations, the\npredictions of the models remain similar. 
Our findings show that the similarity\nbetween networks before and after fine-tuning does not correlate with\nperformance gains, suggesting that the advantages of transfer learning might\nnot solely originate from the reuse of features in the early layers of a\nconvolutional neural network.\n","authors":["Dovile Juodelyte","Amelia Jiménez-Sánchez","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2302.08272v2.pdf","comment":"Submitted to TMLR"},{"id":"http://arxiv.org/abs/2309.03696v1","updated":"2023-09-07T13:10:06Z","published":"2023-09-07T13:10:06Z","title":"Efficient Adaptive Human-Object Interaction Detection with\n Concept-guided Memory","summary":" Human Object Interaction (HOI) detection aims to localize and infer the\nrelationships between a human and an object. Arguably, training supervised\nmodels for this task from scratch presents challenges due to the performance\ndrop over rare classes and the high computational cost and time required to\nhandle long-tailed distributions of HOIs in complex HOI scenes in realistic\nsettings. This observation motivates us to design an HOI detector that can be\ntrained even with long-tailed labeled data and can leverage existing knowledge\nfrom pre-trained models. Inspired by the powerful generalization ability of the\nlarge Vision-Language Models (VLM) on classification and retrieval tasks, we\npropose an efficient Adaptive HOI Detector with Concept-guided Memory (ADA-CM).\nADA-CM has two operating modes. The first mode makes it tunable without\nlearning new parameters in a training-free paradigm. Its second mode\nincorporates an instance-aware adapter mechanism that can further efficiently\nboost performance if updating a lightweight set of parameters can be afforded.\nOur proposed method achieves competitive results with state-of-the-art on the\nHICO-DET and V-COCO datasets with much less training time. Code can be found at\nhttps://github.com/ltttpku/ADA-CM.\n","authors":["Ting Lei","Fabian Caba","Qingchao Chen","Hailin Jin","Yuxin Peng","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13455v3","updated":"2023-09-07T13:01:27Z","published":"2023-06-23T11:53:43Z","title":"DreamEditor: Text-Driven 3D Scene Editing with Neural Fields","summary":" Neural fields have achieved impressive advancements in view synthesis and\nscene reconstruction. However, editing these neural fields remains challenging\ndue to the implicit encoding of geometry and texture information. In this\npaper, we propose DreamEditor, a novel framework that enables users to perform\ncontrolled editing of neural fields using text prompts. By representing scenes\nas mesh-based neural fields, DreamEditor allows localized editing within\nspecific regions. DreamEditor utilizes the text encoder of a pretrained\ntext-to-Image diffusion model to automatically identify the regions to be\nedited based on the semantics of the text prompts. Subsequently, DreamEditor\noptimizes the editing region and aligns its geometry and texture with the text\nprompts through score distillation sampling [29]. Extensive experiments have\ndemonstrated that DreamEditor can accurately edit neural fields of real-world\nscenes according to the given text prompts while ensuring consistency in\nirrelevant areas. 
DreamEditor generates highly realistic textures and geometry,\nsignificantly surpassing previous works in both quantitative and qualitative\nevaluations.\n","authors":["Jingyu Zhuang","Chen Wang","Lingjie Liu","Liang Lin","Guanbin Li"],"pdf_url":"https://arxiv.org/pdf/2306.13455v3.pdf","comment":"Accepted by SIGGRAPH Asia 2023"},{"id":"http://arxiv.org/abs/2309.03686v1","updated":"2023-09-07T13:00:27Z","published":"2023-09-07T13:00:27Z","title":"MS-UNet-v2: Adaptive Denoising Method and Training Strategy for Medical\n Image Segmentation with Small Training Data","summary":" Models based on U-like structures have improved the performance of medical\nimage segmentation. However, the single-layer decoder structure of U-Net is too\n\"thin\" to exploit enough information, resulting in large semantic differences\nbetween the encoder and decoder parts. Things get worse if the number of\ntraining sets of data is not sufficiently large, which is common in medical\nimage processing tasks where annotated data are more difficult to obtain than\nother tasks. Based on this observation, we propose a novel U-Net model named\nMS-UNet for the medical image segmentation task in this study. Instead of the\nsingle-layer U-Net decoder structure used in Swin-UNet and TransUnet, we\nspecifically design a multi-scale nested decoder based on the Swin Transformer\nfor U-Net. The proposed multi-scale nested decoder structure allows the feature\nmapping between the decoder and encoder to be semantically closer, thus\nenabling the network to learn more detailed features. In addition, we propose a\nnovel edge loss and a plug-and-play fine-tuning Denoising module, which not\nonly effectively improves the segmentation performance of MS-UNet, but could\nalso be applied to other models individually. Experimental results show that\nMS-UNet could effectively improve the network performance with more efficient\nfeature learning capability and exhibit more advanced performance, especially\nin the extreme case with a small amount of training data, and the proposed Edge\nloss and Denoising module could significantly enhance the segmentation\nperformance of MS-UNet.\n","authors":["Haoyuan Chen","Yufei Han","Pin Xu","Yanyi Li","Kuan Li","Jianping Yin"],"pdf_url":"https://arxiv.org/pdf/2309.03686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06145v2","updated":"2023-09-07T12:56:49Z","published":"2023-06-09T10:34:18Z","title":"LDMRes-Net: Enabling Efficient Medical Image Segmentation on IoT and\n Edge Platforms","summary":" In this study, we propose LDMRes-Net, a lightweight dual-multiscale residual\nblock-based computational neural network tailored for medical image\nsegmentation on IoT and edge platforms. Conventional U-Net-based models face\nchallenges in meeting the speed and efficiency demands of real-time clinical\napplications, such as disease monitoring, radiation therapy, and image-guided\nsurgery. LDMRes-Net overcomes these limitations with its remarkably low number\nof learnable parameters (0.072M), making it highly suitable for\nresource-constrained devices. The model's key innovation lies in its dual\nmulti-residual block architecture, which enables the extraction of refined\nfeatures on multiple scales, enhancing overall segmentation performance. 
To\nfurther optimize efficiency, the number of filters is carefully selected to\nprevent overlap, reduce training time, and improve computational efficiency.\nThe study includes comprehensive evaluations, focusing on segmentation of the\nretinal image of vessels and hard exudates crucial for the diagnosis and\ntreatment of ophthalmology. The results demonstrate the robustness,\ngeneralizability, and high segmentation accuracy of LDMRes-Net, positioning it\nas an efficient tool for accurate and rapid medical image segmentation in\ndiverse clinical applications, particularly on IoT and edge platforms. Such\nadvances hold significant promise for improving healthcare outcomes and\nenabling real-time medical image analysis in resource-limited settings.\n","authors":["Shahzaib Iqbal","Tariq M. Khan","Syed S. Naqvi","Muhammad Usman","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2306.06145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01582v2","updated":"2023-09-07T12:30:06Z","published":"2023-09-04T13:10:11Z","title":"Improving Visual Quality and Transferability of Adversarial Attacks on\n Face Recognition Simultaneously with Adversarial Restoration","summary":" Adversarial face examples possess two critical properties: Visual Quality and\nTransferability. However, existing approaches rarely address these properties\nsimultaneously, leading to subpar results. To address this issue, we propose a\nnovel adversarial attack technique known as Adversarial Restoration\n(AdvRestore), which enhances both visual quality and transferability of\nadversarial face examples by leveraging a face restoration prior. In our\napproach, we initially train a Restoration Latent Diffusion Model (RLDM)\ndesigned for face restoration. Subsequently, we employ the inference process of\nRLDM to generate adversarial face examples. The adversarial perturbations are\napplied to the intermediate features of RLDM. Additionally, by treating RLDM\nface restoration as a sibling task, the transferability of the generated\nadversarial face examples is further improved. Our experimental results\nvalidate the effectiveness of the proposed attack method.\n","authors":["Fengfan Zhou","Hefei Ling","Yuxuan Shi","Jiazhong Chen","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2309.01582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00452v2","updated":"2023-09-07T12:22:28Z","published":"2023-08-01T11:05:13Z","title":"A Majority Invariant Approach to Patch Robustness Certification for Deep\n Learning Models","summary":" Patch robustness certification ensures no patch within a given bound on a\nsample can manipulate a deep learning model to predict a different label.\nHowever, existing techniques cannot certify samples that cannot meet their\nstrict bars at the classifier or patch region levels. This paper proposes\nMajorCert. MajorCert firstly finds all possible label sets manipulatable by the\nsame patch region on the same sample across the underlying classifiers, then\nenumerates their combinations element-wise, and finally checks whether the\nmajority invariant of all these combinations is intact to certify samples.\n","authors":["Qilin Zhou","Zhengyuan Wei","Haipeng Wang","W. K. 
Chan"],"pdf_url":"https://arxiv.org/pdf/2308.00452v2.pdf","comment":"5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track"},{"id":"http://arxiv.org/abs/2309.03671v1","updated":"2023-09-07T12:19:51Z","published":"2023-09-07T12:19:51Z","title":"Dataset Generation and Bonobo Classification from Weakly Labelled Videos","summary":" This paper presents a bonobo detection and classification pipeline built from\nthe commonly used machine learning methods. Such application is motivated by\nthe need to test bonobos in their enclosure using touch screen devices without\nhuman assistance. This work introduces a newly acquired dataset based on bonobo\nrecordings generated semi-automatically. The recordings are weakly labelled and\nfed to a macaque detector in order to spatially detect the individual present\nin the video. Handcrafted features coupled with different classification\nalgorithms and deep-learning methods using a ResNet architecture are\ninvestigated for bonobo identification. Performance is compared in terms of\nclassification accuracy on the splits of the database using different data\nseparation methods. We demonstrate the importance of data preparation and how a\nwrong data separation can lead to false good results. Finally, after a\nmeaningful separation of the data, the best classification performance is\nobtained using a fine-tuned ResNet model and reaches 75% of accuracy.\n","authors":["Pierre-Etienne Martin"],"pdf_url":"https://arxiv.org/pdf/2309.03671v1.pdf","comment":"IntelliSys 2023 paper"},{"id":"http://arxiv.org/abs/2306.07050v2","updated":"2023-09-07T12:02:47Z","published":"2023-06-12T11:55:33Z","title":"Revisiting Token Pruning for Object Detection and Instance Segmentation","summary":" Vision Transformers (ViTs) have shown impressive performance in computer\nvision, but their high computational cost, quadratic in the number of tokens,\nlimits their adoption in computation-constrained applications. However, this\nlarge number of tokens may not be necessary, as not all tokens are equally\nimportant. In this paper, we investigate token pruning to accelerate inference\nfor object detection and instance segmentation, extending prior works from\nimage classification. Through extensive experiments, we offer four insights for\ndense tasks: (i) tokens should not be completely pruned and discarded, but\nrather preserved in the feature maps for later use. (ii) reactivating\npreviously pruned tokens can further enhance model performance. (iii) a dynamic\npruning rate based on images is better than a fixed pruning rate. (iv) a\nlightweight, 2-layer MLP can effectively prune tokens, achieving accuracy\ncomparable with complex gating networks with a simpler design. 
We evaluate the\nimpact of these design choices on COCO dataset and present a method integrating\nthese insights that outperforms prior art token pruning models, significantly\nreducing performance drop from ~1.5 mAP to ~0.3 mAP for both boxes and masks.\nCompared to the dense counterpart that uses all tokens, our method achieves up\nto 34% faster inference speed for the whole network and 46% for the backbone.\n","authors":["Yifei Liu","Mathias Gehrig","Nico Messikommer","Marco Cannici","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2306.07050v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03661v1","updated":"2023-09-07T11:58:34Z","published":"2023-09-07T11:58:34Z","title":"Prompt-based Context- and Domain-aware Pretraining for Vision and\n Language Navigation","summary":" With strong representation capabilities, pretrained vision-language models\nare widely used in vision and language navigation (VLN). However, most of them\nare trained on web-crawled general-purpose datasets, which incurs a\nconsiderable domain gap when used for VLN tasks. Another challenge for VLN is\nhow the agent understands the contextual relations between actions on a\ntrajectory and performs cross-modal alignment sequentially. In this paper, we\npropose a novel Prompt-bAsed coNtext- and Domain-Aware (PANDA) pretraining\nframework to address these problems. It performs prompting in two stages. In\nthe domain-aware stage, we apply a low-cost prompt tuning paradigm to learn\nsoft visual prompts from an in-domain dataset for equipping the pretrained\nmodels with object-level and scene-level cross-modal alignment in VLN tasks.\nFurthermore, in the context-aware stage, we design a set of hard context\nprompts to capture the sequence-level semantics and instill both out-of-context\nand contextual knowledge in the instruction into cross-modal representations.\nThey enable further tuning of the pretrained models via contrastive learning.\nExperimental results on both R2R and REVERIE show the superiority of PANDA\ncompared to previous state-of-the-art methods.\n","authors":["Ting Liu","Wansen Wu","Yue Hu","Youkai Wang","Kai Xu","Quanjun Yin"],"pdf_url":"https://arxiv.org/pdf/2309.03661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03659v1","updated":"2023-09-07T11:56:23Z","published":"2023-09-07T11:56:23Z","title":"Towards Comparable Knowledge Distillation in Semantic Image Segmentation","summary":" Knowledge Distillation (KD) is one proposed solution to large model sizes and\nslow inference speed in semantic segmentation. In our research we identify 25\nproposed distillation loss terms from 14 publications in the last 4 years.\nUnfortunately, a comparison of terms based on published results is often\nimpossible, because of differences in training configurations. A good\nillustration of this problem is the comparison of two publications from 2022.\nUsing the same models and dataset, Structural and Statistical Texture\nDistillation (SSTKD) reports an increase of student mIoU of 4.54 and a final\nperformance of 29.19, while Adaptive Perspective Distillation (APD) only\nimproves student performance by 2.06 percentage points, but achieves a final\nperformance of 39.25. The reason for such extreme differences is often a\nsuboptimal choice of hyperparameters and a resulting underperformance of the\nstudent model used as reference point. 
In our work, we reveal problems of\ninsufficient hyperparameter tuning by showing that distillation improvements of\ntwo widely accepted frameworks, SKD and IFVD, vanish when hyperparameters are\noptimized sufficiently. To improve comparability of future research in the\nfield, we establish a solid baseline for three datasets and two student models\nand provide extensive information on hyperparameter tuning. We find that only\ntwo out of eight techniques can compete with our simple baseline on the ADE20K\ndataset.\n","authors":["Onno Niemann","Christopher Vox","Thorben Werner"],"pdf_url":"https://arxiv.org/pdf/2309.03659v1.pdf","comment":"Accepted by the ECML PKDD 2023 workshop track: Simplification,\n Compression, Efficiency, and Frugality for Artificial Intelligence (SCEFA).\n This preprint has not undergone peer review or any post-submission\n improvements or corrections"},{"id":"http://arxiv.org/abs/2309.01627v2","updated":"2023-09-07T11:52:42Z","published":"2023-09-04T14:18:00Z","title":"Cross-Consistent Deep Unfolding Network for Adaptive All-In-One Video\n Restoration","summary":" Existing Video Restoration (VR) methods always necessitate the individual\ndeployment of models for each adverse weather to remove diverse adverse weather\ndegradations, lacking the capability for adaptive processing of degradations.\nSuch limitation amplifies the complexity and deployment costs in practical\napplications. To overcome this deficiency, in this paper, we propose a\nCross-consistent Deep Unfolding Network (CDUN) for All-In-One VR, which enables\nthe employment of a single model to remove diverse degradations for the first\ntime. Specifically, the proposed CDUN accomplishes a novel iterative\noptimization framework, capable of restoring frames corrupted by corresponding\ndegradations according to the degradation features given in advance. To empower\nthe framework for eliminating diverse degradations, we devise a Sequence-wise\nAdaptive Degradation Estimator (SADE) to estimate degradation features for the\ninput corrupted video. By orchestrating these two cascading procedures, CDUN\nachieves adaptive processing for diverse degradation. In addition, we introduce\na window-based inter-frame fusion strategy to utilize information from more\nadjacent frames. This strategy involves the progressive stacking of temporal\nwindows in multiple iterations, effectively enlarging the temporal receptive\nfield and enabling each frame's restoration to leverage information from\ndistant frames. Extensive experiments demonstrate that the proposed method\nachieves state-of-the-art performance in All-In-One VR.\n","authors":["Yuanshuo Cheng","Mingwen Shao","Yecong Wan","Lixu Zhang","Wangmeng Zuo","Deyu Meng"],"pdf_url":"https://arxiv.org/pdf/2309.01627v2.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2309.03652v1","updated":"2023-09-07T11:46:59Z","published":"2023-09-07T11:46:59Z","title":"Anatomy-informed Data Augmentation for Enhanced Prostate Cancer\n Detection","summary":" Data augmentation (DA) is a key factor in medical image analysis, such as in\nprostate cancer (PCa) detection on magnetic resonance images. 
State-of-the-art\ncomputer-aided diagnosis systems still rely on simplistic spatial\ntransformations to preserve the pathological label post transformation.\nHowever, such augmentations do not substantially increase the organ as well as\ntumor shape variability in the training set, limiting the model's ability to\ngeneralize to unseen cases with more diverse localized soft-tissue\ndeformations. We propose a new anatomy-informed transformation that leverages\ninformation from adjacent organs to simulate typical physiological deformations\nof the prostate and generates unique lesion shapes without altering their\nlabel. Due to its lightweight computational requirements, it can be easily\nintegrated into common DA frameworks. We demonstrate the effectiveness of our\naugmentation on a dataset of 774 biopsy-confirmed examinations, by evaluating a\nstate-of-the-art method for PCa detection with different augmentation settings.\n","authors":["Balint Kovacs","Nils Netzer","Michael Baumgartner","Carolin Eith","Dimitrios Bounias","Clara Meinzer","Paul F. Jaeger","Kevin S. Zhang","Ralf Floca","Adrian Schrader","Fabian Isensee","Regula Gnirs","Magdalena Goertz","Viktoria Schuetz","Albrecht Stenzinger","Markus Hohenfellner","Heinz-Peter Schlemmer","Ivo Wolf","David Bonekamp","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2309.03652v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2203.03624v4","updated":"2023-09-07T11:23:51Z","published":"2022-03-05T09:06:29Z","title":"FCNet: A Convolutional Neural Network for Arbitrary-Length Exposure\n Estimation","summary":" The photographs captured by digital cameras usually suffer from over or under\nexposure problems. For image exposure enhancement, the tasks of Single-Exposure\nCorrection (SEC) and Multi-Exposure Fusion (MEF) are widely studied in the\nimage processing community. However, current SEC or MEF methods are developed\nunder different motivations and thus ignore the internal correlation between\nSEC and MEF, making it difficult to process arbitrary-length sequences with\nimproper exposures. Besides, the MEF methods usually fail at estimating the\nexposure of a sequence containing only under-exposed or over-exposed images. To\nalleviate these problems, in this paper, we develop a novel Fusion-Correction\nNetwork (FCNet) to tackle an arbitrary-length (including one) image sequence\nwith improper exposures. This is achieved by fusing and correcting an image\nsequence by Laplacian Pyramid (LP) image decomposition. In each LP level, the\nlow-frequency base component of the input image sequence is fed into a Fusion\nblock and a Correction block sequentially for consecutive exposure estimation,\nimplemented by alternative exposure fusion and correction. The\nexposure-corrected image in current LP level is upsampled and fused with the\nhigh-frequency detail components of the input image sequence in the next LP\nlevel, to output the base component for the Fusion and Correction blocks in\nnext LP level. Experiments on the benchmark dataset demonstrate that our FCNet\nis effective on arbitrary-length exposure estimation, including both SEC and\nMEF. 
The code is publicly released at https://github.com/NKUJinLiang/FCNet.\n","authors":["Jin Liang","Yuchen Yang","Anran Zhang","Jun Xu","Hui Li","Xiantong Zhen"],"pdf_url":"https://arxiv.org/pdf/2203.03624v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03641v1","updated":"2023-09-07T11:21:10Z","published":"2023-09-07T11:21:10Z","title":"Spiking Structured State Space Model for Monaural Speech Enhancement","summary":" Speech enhancement seeks to extract clean speech from noisy signals.\nTraditional deep learning methods face two challenges: efficiently using\ninformation in long speech sequences and high computational costs. To address\nthese, we introduce the Spiking Structured State Space Model (Spiking-S4). This\napproach merges the energy efficiency of Spiking Neural Networks (SNN) with the\nlong-range sequence modeling capabilities of Structured State Space Models\n(S4), offering a compelling solution. Evaluation on the DNS Challenge and\nVoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial\nNeural Network (ANN) methods but with fewer computational resources, as\nevidenced by reduced parameters and Floating Point Operations (FLOPs).\n","authors":["Yu Du","Xu Liu","Yansong Chua"],"pdf_url":"https://arxiv.org/pdf/2309.03641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03640v1","updated":"2023-09-07T11:14:02Z","published":"2023-09-07T11:14:02Z","title":"Context-Aware 3D Object Localization from Single Calibrated Images: A\n Study of Basketballs","summary":" Accurately localizing objects in three dimensions (3D) is crucial for various\ncomputer vision applications, such as robotics, autonomous driving, and\naugmented reality. This task finds another important application in sports\nanalytics and, in this work, we present a novel method for 3D basketball\nlocalization from a single calibrated image. Our approach predicts the object's\nheight in pixels in image space by estimating its projection onto the ground\nplane within the image, leveraging the image itself and the object's location\nas inputs. The 3D coordinates of the ball are then reconstructed by exploiting\nthe known projection matrix. Extensive experiments on the public DeepSport\ndataset, which provides ground truth annotations for 3D ball location alongside\ncamera calibration information for each image, demonstrate the effectiveness of\nour method, offering substantial accuracy improvements compared to recent work.\nOur work opens up new possibilities for enhanced ball tracking and\nunderstanding, advancing computer vision in diverse domains. The source code of\nthis work is made publicly available at\n\\url{https://github.com/gabriel-vanzandycke/deepsport}.\n","authors":["Marcello Davide Caio","Gabriel Van Zandycke","Christophe De Vleeschouwer"],"pdf_url":"https://arxiv.org/pdf/2309.03640v1.pdf","comment":"5 pages, 4 figures, MMSports'23, in proceedings of the 6th\n International Workshop on Multimedia Content Analysis in Sports (MMSports\n '23), October 29, 2023, Ottawa, ON, Canada"},{"id":"http://arxiv.org/abs/2303.07898v3","updated":"2023-09-07T11:07:07Z","published":"2023-03-14T13:36:36Z","title":"ISLE: A Framework for Image Level Semantic Segmentation Ensemble","summary":" One key bottleneck of employing state-of-the-art semantic segmentation\nnetworks in the real world is the availability of training labels. Conventional\nsemantic segmentation networks require massive pixel-wise annotated labels to\nreach state-of-the-art prediction quality. 
Hence, several works focus on\nsemantic segmentation networks trained with only image-level annotations.\nHowever, when scrutinizing the results of state-of-the-art in more detail, we\nnotice that, although they are remarkably close to each other in average prediction\nquality, different approaches perform better in different classes while\nproviding low quality in others. To address this problem, we propose a novel\nframework, ISLE, which employs an ensemble of the \"pseudo-labels\" for a given\nset of different semantic segmentation techniques on a class-wise level.\nPseudo-labels are the pixel-wise predictions of the image-level semantic\nsegmentation frameworks used to train the final segmentation model. Our\npseudo-labels seamlessly combine the strong points of multiple segmentation\napproaches to reach superior prediction quality. We reach up to 2.4%\nimprovement over ISLE's individual components. An exhaustive analysis was\nperformed to demonstrate ISLE's effectiveness over state-of-the-art frameworks\nfor image-level semantic segmentation.\n","authors":["Erik Ostrowski","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2303.07898v3.pdf","comment":"Accepted for Publication at the International Symposium on Visual\n Computing (ISVC), October 2023, Lake Tahoe, NV, USA"},{"id":"http://arxiv.org/abs/2303.07853v2","updated":"2023-09-07T11:04:27Z","published":"2023-03-14T12:46:52Z","title":"ReFit: A Framework for Refinement of Weakly Supervised Semantic\n Segmentation using Object Border Fitting for Medical Images","summary":" Weakly Supervised Semantic Segmentation (WSSS) relying only on image-level\nsupervision is a promising approach to deal with the need for segmentation\nnetworks, especially for generating a large number of pixel-wise masks in a\ngiven dataset. However, most state-of-the-art image-level WSSS techniques lack\nan understanding of the geometric features embedded in the images since the\nnetwork cannot derive any object boundary information from just image-level\nlabels. We define a boundary here as the line separating an object and its\nbackground, or two different objects. To address this drawback, we are\nproposing our novel ReFit framework, which deploys state-of-the-art class\nactivation maps combined with various post-processing techniques in order to\nachieve fine-grained higher-accuracy segmentation masks. To achieve this, we\ninvestigate a state-of-the-art unsupervised segmentation network that can be\nused to construct a boundary map, which enables ReFit to predict object\nlocations with sharper boundaries. By applying our method to WSSS predictions,\nwe achieved up to 10% improvement over the current state-of-the-art WSSS\nmethods for medical imaging. 
The framework is open-source, to ensure that our\nresults are reproducible, and accessible online at\nhttps://github.com/bharathprabakaran/ReFit.\n","authors":["Bharath Srinivas Prabakaran","Erik Ostrowski","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2303.07853v2.pdf","comment":"Accepted for Publication at the International Symposium on Visual\n Computing (ISVC), October 2023, Lake Tahoe, NV, USA"},{"id":"http://arxiv.org/abs/2306.12760v2","updated":"2023-09-07T10:30:10Z","published":"2023-06-22T09:34:55Z","title":"Blended-NeRF: Zero-Shot Object Generation and Blending in Existing\n Neural Radiance Fields","summary":" Editing a local region or a specific object in a 3D scene represented by a\nNeRF or consistently blending a new realistic object into the scene is\nchallenging, mainly due to the implicit nature of the scene representation. We\npresent Blended-NeRF, a robust and flexible framework for editing a specific\nregion of interest in an existing NeRF scene, based on text prompts, along with\na 3D ROI box. Our method leverages a pretrained language-image model to steer\nthe synthesis towards a user-provided text prompt, along with a 3D MLP model\ninitialized on an existing NeRF scene to generate the object and blend it into\na specified region in the original scene. We allow local editing by localizing\na 3D ROI box in the input scene, and blend the content synthesized inside the\nROI with the existing scene using a novel volumetric blending technique. To\nobtain natural looking and view-consistent results, we leverage existing and\nnew geometric priors and 3D augmentations for improving the visual fidelity of\nthe final result. We test our framework both qualitatively and quantitatively\non a variety of real 3D scenes and text prompts, demonstrating realistic\nmulti-view consistent results with much flexibility and diversity compared to\nthe baselines. Finally, we show the applicability of our framework for several\n3D editing applications, including adding new objects to a scene,\nremoving/replacing/altering existing objects, and texture conversion.\n","authors":["Ori Gordon","Omri Avrahami","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2306.12760v2.pdf","comment":"16 pages, 14 figures. Project page:\n https://www.vision.huji.ac.il/blended-nerf/"},{"id":"http://arxiv.org/abs/2307.05766v4","updated":"2023-09-07T10:00:08Z","published":"2023-07-11T19:47:05Z","title":"Rad-ReStruct: A Novel VQA Benchmark and Method for Structured Radiology\n Reporting","summary":" Radiology reporting is a crucial part of the communication between\nradiologists and other medical professionals, but it can be time-consuming and\nerror-prone. One approach to alleviate this is structured reporting, which\nsaves time and enables a more accurate evaluation than free-text reports.\nHowever, there is limited research on automating structured reporting, and no\npublic benchmark is available for evaluating and comparing different methods.\nTo close this gap, we introduce Rad-ReStruct, a new benchmark dataset that\nprovides fine-grained, hierarchically ordered annotations in the form of\nstructured reports for X-Ray images. We model the structured reporting task as\nhierarchical visual question answering (VQA) and propose hi-VQA, a novel method\nthat considers prior context in the form of previously asked questions and\nanswers for populating a structured radiology report. 
Our experiments show that\nhi-VQA achieves competitive performance to the state-of-the-art on the medical\nVQA benchmark VQARad while performing best among methods without\ndomain-specific vision-language pretraining and provides a strong baseline on\nRad-ReStruct. Our work represents a significant step towards the automated\npopulation of structured radiology reports and provides a valuable first\nbenchmark for future research in this area. Our dataset and code is available\nat https://github.com/ChantalMP/Rad-ReStruct.\n","authors":["Chantal Pellegrini","Matthias Keicher","Ege Özsoy","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2307.05766v4.pdf","comment":"accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.03599v1","updated":"2023-09-07T09:50:48Z","published":"2023-09-07T09:50:48Z","title":"Chasing Consistency in Text-to-3D Generation from a Single Image","summary":" Text-to-3D generation from a single-view image is a popular but challenging\ntask in 3D vision. Although numerous methods have been proposed, existing works\nstill suffer from the inconsistency issues, including 1) semantic\ninconsistency, 2) geometric inconsistency, and 3) saturation inconsistency,\nresulting in distorted, overfitted, and over-saturated generations. In light of\nthe above issues, we present Consist3D, a three-stage framework Chasing for\nsemantic-, geometric-, and saturation-Consistent Text-to-3D generation from a\nsingle image, in which the first two stages aim to learn parameterized\nconsistency tokens, and the last stage is for optimization. Specifically, the\nsemantic encoding stage learns a token independent of views and estimations,\npromoting semantic consistency and robustness. Meanwhile, the geometric\nencoding stage learns another token with comprehensive geometry and\nreconstruction constraints under novel-view estimations, reducing overfitting\nand encouraging geometric consistency. Finally, the optimization stage benefits\nfrom the semantic and geometric tokens, allowing a low classifier-free guidance\nscale and therefore preventing oversaturation. Experimental results demonstrate\nthat Consist3D produces more consistent, faithful, and photo-realistic 3D\nassets compared to previous state-of-the-art methods. Furthermore, Consist3D\nalso allows background and object editing through text prompts.\n","authors":["Yichen Ouyang","Wenhao Chai","Jiayi Ye","Dapeng Tao","Yibing Zhan","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03599v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.03598v1","updated":"2023-09-07T09:50:45Z","published":"2023-09-07T09:50:45Z","title":"Enhancing Sample Utilization through Sample Adaptive Augmentation in\n Semi-Supervised Learning","summary":" In semi-supervised learning, unlabeled samples can be utilized through\naugmentation and consistency regularization. However, we observed certain\nsamples, even undergoing strong augmentation, are still correctly classified\nwith high confidence, resulting in a loss close to zero. It indicates that\nthese samples have been already learned well and do not provide any additional\noptimization benefits to the model. We refer to these samples as ``naive\nsamples\". Unfortunately, existing SSL models overlook the characteristics of\nnaive samples, and they just apply the same learning strategy to all samples.\nTo further optimize the SSL model, we emphasize the importance of giving\nattention to naive samples and augmenting them in a more diverse manner. 
Sample\nadaptive augmentation (SAA) is proposed for this stated purpose and consists of\ntwo modules: 1) sample selection module; 2) sample augmentation module.\nSpecifically, the sample selection module picks out {naive samples} based on\nhistorical training information at each epoch, then the naive samples will be\naugmented in a more diverse manner in the sample augmentation module. Thanks to\nthe extreme ease of implementation of the above modules, SAA is advantageous\nfor being simple and lightweight. We add SAA on top of FixMatch and FlexMatch\nrespectively, and experiments demonstrate SAA can significantly improve the\nmodels. For example, SAA helped improve the accuracy of FixMatch from 92.50% to\n94.76% and that of FlexMatch from 95.01% to 95.31% on CIFAR-10 with 40 labels.\n","authors":["Guan Gui","Zhen Zhao","Lei Qi","Luping Zhou","Lei Wang","Yinghuan Shi"],"pdf_url":"https://arxiv.org/pdf/2309.03598v1.pdf","comment":"Accepted as International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2307.02321v2","updated":"2023-09-07T09:36:16Z","published":"2023-07-05T14:22:31Z","title":"MSViT: Dynamic Mixed-Scale Tokenization for Vision Transformers","summary":" The input tokens to Vision Transformers carry little semantic meaning as they\nare defined as regular equal-sized patches of the input image, regardless of\nits content. However, processing uniform background areas of an image should\nnot necessitate as much compute as dense, cluttered areas. To address this\nissue, we propose a dynamic mixed-scale tokenization scheme for ViT, MSViT. Our\nmethod introduces a conditional gating mechanism that selects the optimal token\nscale for every image region, such that the number of tokens is dynamically\ndetermined per input. In addition, to enhance the conditional behavior of the\ngate during training, we introduce a novel generalization of the batch-shaping\nloss. We show that our gating module is able to learn meaningful semantics\ndespite operating locally at the coarse patch-level. The proposed gating module\nis lightweight, agnostic to the choice of transformer backbone, and trained\nwithin a few epochs with little training overhead. Furthermore, in contrast to\ntoken pruning, MSViT does not lose information about the input, thus can be\nreadily applied for dense tasks. We validate MSViT on the tasks of\nclassification and segmentation where it leads to improved accuracy-complexity\ntrade-off.\n","authors":["Jakob Drachmann Havtorn","Amelie Royer","Tijmen Blankevoort","Babak Ehteshami Bejnordi"],"pdf_url":"https://arxiv.org/pdf/2307.02321v2.pdf","comment":"ICCV Workshops 2023; Code for the Generalized Batch-Shaping loss is\n available at https://github.com/Qualcomm-AI-research/batchshaping"},{"id":"http://arxiv.org/abs/2309.03590v1","updated":"2023-09-07T09:31:27Z","published":"2023-09-07T09:31:27Z","title":"Spatial encoding of BOLD fMRI time series for categorizing static images\n across visual datasets: A pilot study on human vision","summary":" Functional MRI (fMRI) is widely used to examine brain functionality by\ndetecting alteration in oxygenated blood flow that arises with brain activity.\nIn this study, complexity specific image categorization across different visual\ndatasets is performed using fMRI time series (TS) to understand differences in\nneuronal activities related to vision. 
Publicly available BOLD5000 dataset is\nused for this purpose, containing fMRI scans while viewing 5254 images of\ndiverse categories, drawn from three standard computer vision datasets: COCO,\nImageNet and SUN. To understand vision, it is important to study how brain\nfunctions while looking at different images. To achieve this, spatial encoding\nof fMRI BOLD TS has been performed that uses classical Gramian Angular Field\n(GAF) and Markov Transition Field (MTF) to obtain 2D BOLD TS, representing\nimages of COCO, Imagenet and SUN. For classification, individual GAF and MTF\nfeatures are fed into regular CNN. Subsequently, parallel CNN model is employed\nthat uses combined 2D features for classifying images across COCO, Imagenet and\nSUN. The result of 2D CNN models is also compared with 1D LSTM and Bi-LSTM that\nutilizes raw fMRI BOLD signal for classification. It is seen that parallel CNN\nmodel outperforms other network models with an improvement of 7% for\nmulti-class classification. Clinical relevance- The obtained result of this\nanalysis establishes a baseline in studying how differently human brain\nfunctions while looking at images of diverse complexities.\n","authors":["Vamshi K. Kancharala","Debanjali Bhattacharya","Neelam Sinha"],"pdf_url":"https://arxiv.org/pdf/2309.03590v1.pdf","comment":"This paper is accepted for publication in IEEE Region 10 Technical\n conference, TENCON 2023, to be held in Chiang Mai, Thailand from 31 October -\n 3 November, 2023"},{"id":"http://arxiv.org/abs/2203.09957v4","updated":"2023-09-07T09:21:42Z","published":"2022-03-18T13:49:25Z","title":"Enhancement of Novel View Synthesis Using Omnidirectional Image\n Completion","summary":" In this study, we present a method for synthesizing novel views from a single\n360-degree RGB-D image based on the neural radiance field (NeRF) . Prior\nstudies relied on the neighborhood interpolation capability of multi-layer\nperceptrons to complete missing regions caused by occlusion and zooming, which\nleads to artifacts. In the method proposed in this study, the input image is\nreprojected to 360-degree RGB images at other camera positions, the missing\nregions of the reprojected images are completed by a 2D image generative model,\nand the completed images are utilized to train the NeRF. Because multiple\ncompleted images contain inconsistencies in 3D, we introduce a method to learn\nthe NeRF model using a subset of completed images that cover the target scene\nwith less overlap of completed regions. The selection of such a subset of\nimages can be attributed to the maximum weight independent set problem, which\nis solved through simulated annealing. Experiments demonstrated that the\nproposed method can synthesize plausible novel views while preserving the\nfeatures of the scene for both artificial and real-world data.\n","authors":["Takayuki Hara","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2203.09957v4.pdf","comment":"20 pages, 19 figures"},{"id":"http://arxiv.org/abs/2309.03576v1","updated":"2023-09-07T09:12:02Z","published":"2023-09-07T09:12:02Z","title":"DropPos: Pre-Training Vision Transformers by Reconstructing Dropped\n Positions","summary":" As it is empirically observed that Vision Transformers (ViTs) are quite\ninsensitive to the order of input tokens, the need for an appropriate\nself-supervised pretext task that enhances the location awareness of ViTs is\nbecoming evident. To address this, we present DropPos, a novel pretext task\ndesigned to reconstruct Dropped Positions. 
The formulation of DropPos is\nsimple: we first drop a large random subset of positional embeddings and then\nthe model classifies the actual position for each non-overlapping patch among\nall possible positions solely based on their visual appearance. To avoid\ntrivial solutions, we increase the difficulty of this task by keeping only a\nsubset of patches visible. Additionally, considering there may be different\npatches with similar visual appearances, we propose position smoothing and\nattentive reconstruction strategies to relax this classification problem, since\nit is not necessary to reconstruct their exact positions in these cases.\nEmpirical evaluations of DropPos show strong capabilities. DropPos outperforms\nsupervised pre-training and achieves competitive results compared with\nstate-of-the-art self-supervised alternatives on a wide range of downstream\nbenchmarks. This suggests that explicitly encouraging spatial reasoning\nabilities, as DropPos does, indeed contributes to the improved location\nawareness of ViTs. The code is publicly available at\nhttps://github.com/Haochen-Wang409/DropPos.\n","authors":["Haochen Wang","Junsong Fan","Yuxi Wang","Kaiyou Song","Tong Wang","Zhaoxiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03575v1","updated":"2023-09-07T09:11:49Z","published":"2023-09-07T09:11:49Z","title":"Toward High Quality Facial Representation Learning","summary":" Face analysis tasks have a wide range of applications, but the universal\nfacial representation has only been explored in a few works. In this paper, we\nexplore high-performance pre-training methods to boost the face analysis tasks\nsuch as face alignment and face parsing. We propose a self-supervised\npre-training framework, called \\textbf{\\it Mask Contrastive Face (MCF)}, with\nmask image modeling and a contrastive strategy specially adjusted for face\ndomain tasks. To improve the facial representation quality, we use the feature map\nof a pre-trained visual backbone as a supervision item and use a partially\npre-trained decoder for mask image modeling. To handle the face identity during\nthe pre-training stage, we further use random masks to build contrastive\nlearning pairs. We conduct the pre-training on the LAION-FACE-cropped dataset,\na variant of LAION-FACE 20M, which contains more than 20 million face images\nfrom Internet websites. For efficient pre-training, we explore our framework's\npre-training performance on a small part of LAION-FACE-cropped and verify the\nsuperiority with different pre-training settings. Our model pre-trained with\nthe full pre-training dataset outperforms the state-of-the-art methods on\nmultiple downstream tasks. Our model achieves 0.932 NME$_{diag}$ for AFLW-19\nface alignment and 93.96 F1 score for LaPa face parsing. Code is available at\nhttps://github.com/nomewang/MCF.\n","authors":["Yue Wang","Jinlong Peng","Jiangning Zhang","Ran Yi","Liang Liu","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03575v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.09479v3","updated":"2023-09-07T09:08:01Z","published":"2023-04-19T08:03:20Z","title":"DiFaReli: Diffusion Face Relighting","summary":" We present a novel approach to single-view face relighting in the wild.\nHandling non-diffuse effects, such as global illumination or cast shadows, has\nlong been a challenge in face relighting. 
Prior work often assumes Lambertian\nsurfaces, simplified lighting models or involves estimating 3D shape, albedo,\nor a shadow map. This estimation, however, is error-prone and requires many\ntraining examples with lighting ground truth to generalize well. Our work\nbypasses the need for accurate estimation of intrinsic components and can be\ntrained solely on 2D images without any light stage data, multi-view images, or\nlighting ground truth. Our key idea is to leverage a conditional diffusion\nimplicit model (DDIM) for decoding a disentangled light encoding along with\nother encodings related to 3D shape and facial identity inferred from\noff-the-shelf estimators. We also propose a novel conditioning technique that\neases the modeling of the complex interaction between light and geometry by\nusing a rendered shading reference to spatially modulate the DDIM. We achieve\nstate-of-the-art performance on standard benchmark Multi-PIE and can\nphotorealistically relight in-the-wild images. Please visit our page:\nhttps://diffusion-face-relighting.github.io\n","authors":["Puntawat Ponglertnapakorn","Nontawat Tritrong","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2304.09479v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10829v3","updated":"2023-09-07T09:02:58Z","published":"2023-07-10T12:18:18Z","title":"Exact Diffusion Inversion via Bi-directional Integration Approximation","summary":" Recently, various methods have been proposed to address the inconsistency\nissue of DDIM inversion to enable image editing, such as EDICT [36] and\nNull-text inversion [22]. However, the above methods introduce considerable\ncomputational overhead. In this paper, we propose a new technique, named\n\\emph{bi-directional integration approximation} (BDIA), to perform exact\ndiffusion inversion with neglible computational overhead. Suppose we would like\nto estimate the next diffusion state $\\boldsymbol{z}_{i-1}$ at timestep $t_i$\nwith the historical information $(i,\\boldsymbol{z}_i)$ and\n$(i+1,\\boldsymbol{z}_{i+1})$. We first obtain the estimated Gaussian noise\n$\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i)$, and then apply the DDIM\nupdate procedure twice for approximating the ODE integration over the next\ntime-slot $[t_i, t_{i-1}]$ in the forward manner and the previous time-slot\n$[t_i, t_{t+1}]$ in the backward manner. The DDIM step for the previous\ntime-slot is used to refine the integration approximation made earlier when\ncomputing $\\boldsymbol{z}_i$. A nice property of BDIA-DDIM is that the update\nexpression for $\\boldsymbol{z}_{i-1}$ is a linear combination of\n$(\\boldsymbol{z}_{i+1}, \\boldsymbol{z}_i,\n\\hat{\\boldsymbol{\\epsilon}}(\\boldsymbol{z}_i,i))$. This allows for exact\nbackward computation of $\\boldsymbol{z}_{i+1}$ given $(\\boldsymbol{z}_i,\n\\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. It is\ndemonstrated with experiments that (round-trip) BDIA-DDIM is particularly\neffective for image editing. Our experiments further show that BDIA-DDIM\nproduces markedly better image sampling qualities than DDIM for text-to-image\ngeneration.\n BDIA can also be applied to improve the performance of other ODE solvers in\naddition to DDIM. In our work, it is found that applying BDIA to the EDM\nsampling procedure produces new SOTA performance over CIFAR10.\n","authors":["Guoqiang Zhang","J. P. Lewis","W. 
Bastiaan Kleijn"],"pdf_url":"https://arxiv.org/pdf/2307.10829v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.11328"},{"id":"http://arxiv.org/abs/2308.08197v3","updated":"2023-09-07T09:01:44Z","published":"2023-08-16T07:57:35Z","title":"Self-Reference Deep Adaptive Curve Estimation for Low-Light Image\n Enhancement","summary":" In this paper, we propose a 2-stage low-light image enhancement method called\nSelf-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage,\nwe present an intuitive, lightweight, fast, and unsupervised luminance\nenhancement algorithm. The algorithm is based on a novel low-light enhancement\ncurve that can be used to locally boost image brightness. We also propose a new\nloss function with a simplified physical model designed to preserve natural\nimages' color, structure, and fidelity. We use a vanilla CNN to map each pixel\nthrough deep Adaptive Adjustment Curves (AAC) while preserving the local image\nstructure. Secondly, we introduce the corresponding denoising scheme to remove\nthe latent noise in the darkness. We approximately model the noise in the dark\nand deploy a Denoising-Net to estimate and remove the noise after the first\nstage. Exhaustive qualitative and quantitative analysis shows that our method\noutperforms existing state-of-the-art algorithms on multiple real-world\ndatasets.\n","authors":["Jianyu Wen","Chenhao Wu","Tong Zhang","Yixuan Yu","Piotr Swierczynski"],"pdf_url":"https://arxiv.org/pdf/2308.08197v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03569v1","updated":"2023-09-07T08:58:41Z","published":"2023-09-07T08:58:41Z","title":"Sparse Federated Training of Object Detection in the Internet of\n Vehicles","summary":" As an essential component part of the Intelligent Transportation System\n(ITS), the Internet of Vehicles (IoV) plays a vital role in alleviating traffic\nissues. Object detection is one of the key technologies in the IoV, which has\nbeen widely used to provide traffic management services by analyzing timely and\nsensitive vehicle-related information. However, the current object detection\nmethods are mostly based on centralized deep training, that is, the sensitive\ndata obtained by edge devices need to be uploaded to the server, which raises\nprivacy concerns. To mitigate such privacy leakage, we first propose a\nfederated learning-based framework, where well-trained local models are shared\nin the central server. However, since edge devices usually have limited\ncomputing power, plus a strict requirement of low latency in IoVs, we further\npropose a sparse training process on edge devices, which can effectively\nlighten the model, and ensure its training efficiency on edge devices, thereby\nreducing communication overheads. In addition, due to the diverse computing\ncapabilities and dynamic environment, different sparsity rates are applied to\nedge devices. To further guarantee the performance, we propose, FedWeg, an\nimproved aggregation scheme based on FedAvg, which is designed by the inverse\nratio of sparsity rates. 
Experiments on the real-life dataset using YOLO show\nthat the proposed scheme can achieve the required object detection rate while\nsaving considerable communication costs.\n","authors":["Luping Rao","Chuan Ma","Ming Ding","Yuwen Qian","Lu Zhou","Zhe Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.15201v3","updated":"2023-09-07T08:54:54Z","published":"2022-03-29T02:43:40Z","title":"Light Field Depth Estimation via Stitched Epipolar Plane Images","summary":" Depth estimation is a fundamental problem in light field processing.\nEpipolar-plane image (EPI)-based methods often encounter challenges such as low\naccuracy in slope computation due to discretization errors and limited angular\nresolution. Besides, existing methods perform well in most regions but struggle\nto produce sharp edges in occluded regions and resolve ambiguities in\ntexture-less regions. To address these issues, we propose the concept of\nstitched-EPI (SEPI) to enhance slope computation. SEPI achieves this by\nshifting and concatenating lines from different EPIs that correspond to the\nsame 3D point. Moreover, we introduce the half-SEPI algorithm, which focuses\nexclusively on the non-occluded portion of lines to handle occlusion.\nAdditionally, we present a depth propagation strategy aimed at improving depth\nestimation in texture-less regions. This strategy involves determining the\ndepth of such regions by progressing from the edges towards the interior,\nprioritizing accurate regions over coarse regions. Through extensive\nexperimental evaluations and ablation studies, we validate the effectiveness of\nour proposed method. The results demonstrate its superior ability to generate\nmore accurate and robust depth maps across all regions compared to\nstate-of-the-art methods. The source code will be publicly available at\nhttps://github.com/PingZhou-LF/Light-Field-Depth-Estimation-Based-on-Stitched-EPIs.\n","authors":["Ping Zhou","Langqing Shi","Xiaoyang Liu","Jing Jin","Yuting Zhang","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2203.15201v3.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2309.03558v1","updated":"2023-09-07T08:41:47Z","published":"2023-09-07T08:41:47Z","title":"Region Generation and Assessment Network for Occluded Person\n Re-Identification","summary":" Person Re-identification (ReID) plays a more and more crucial role in recent\nyears with a wide range of applications. Existing ReID methods are suffering\nfrom the challenges of misalignment and occlusions, which degrade the\nperformance dramatically. Most methods tackle such challenges by utilizing\nexternal tools to locate body parts or exploiting matching strategies.\nNevertheless, the inevitable domain gap between the datasets utilized for\nexternal tools and the ReID datasets and the complicated matching process make\nthese methods unreliable and sensitive to noises. In this paper, we propose a\nRegion Generation and Assessment Network (RGANet) to effectively and\nefficiently detect the human body regions and highlight the important regions.\nIn the proposed RGANet, we first devise a Region Generation Module (RGM) which\nutilizes the pre-trained CLIP to locate the human body regions using semantic\nprototypes extracted from text descriptions. Learnable prompt is designed to\neliminate domain gap between CLIP datasets and ReID datasets. 
Then, to measure\nthe importance of each generated region, we introduce a Region Assessment\nModule (RAM) that assigns confidence scores to different regions and reduces\nthe negative impact of the occlusion regions by lower scores. The RAM consists\nof a discrimination-aware indicator and an invariance-aware indicator, where\nthe former indicates the capability to distinguish between different identities\nand the latter represents consistency among the images of the same class of\nhuman body regions. Extensive experimental results for six widely-used\nbenchmarks including three tasks (occluded, partial, and holistic) demonstrate\nthe superiority of RGANet against state-of-the-art methods.\n","authors":["Shuting He","Weihua Chen","Kai Wang","Hao Luo","Fan Wang","Wei Jiang","Henghui Ding"],"pdf_url":"https://arxiv.org/pdf/2309.03558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00526v4","updated":"2023-09-07T08:40:16Z","published":"2023-06-01T10:28:12Z","title":"Layout and Task Aware Instruction Prompt for Zero-shot Document Image\n Question Answering","summary":" Layout-aware pre-trained models have achieved significant progress on document\nimage question answering. They introduce extra learnable modules into existing\nlanguage models to capture layout information within document images from text\nbounding box coordinates obtained by OCR tools. However, extra modules\nnecessitate pre-training on extensive document images. This prevents these\nmethods from directly utilizing off-the-shelf instruction-tuning language\nfoundation models, which have recently shown promising potential in zero-shot\nlearning. Instead, in this paper, we find that instruction-tuning language\nmodels like Claude and ChatGPT can understand layout by spaces and line breaks.\nBased on this observation, we propose the LAyout and Task aware Instruction\nPrompt (LATIN-Prompt), which consists of layout-aware document content and\ntask-aware instruction. Specifically, the former uses appropriate spaces and\nline breaks to recover the layout information among text segments obtained by\nOCR tools, and the latter ensures that generated answers adhere to formatting\nrequirements. Moreover, we propose the LAyout and Task aware Instruction Tuning\n(LATIN-Tuning) to improve the performance of small instruction-tuning models\nlike Alpaca. Experimental results show that LATIN-Prompt enables zero-shot\nperformance of Claude and ChatGPT to be comparable to the fine-tuning\nperformance of SOTAs on document image question answering, and LATIN-Tuning\nenhances the zero-shot performance of Alpaca significantly. For example,\nLATIN-Prompt improves the performance of Claude and ChatGPT on DocVQA by 263%\nand 20% respectively. LATIN-Tuning improves the performance of Alpaca on DocVQA\nby 87.7%. Quantitative and qualitative analyses demonstrate the effectiveness\nof LATIN-Prompt and LATIN-Tuning. We provide the code in supplementary and will\nrelease it to facilitate future research.\n","authors":["Wenjin Wang","Yunhao Li","Yixin Ou","Yin Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.00526v4.pdf","comment":"Add the LATIN-Tuning for Alpaca. 
Code is available at\n https://github.com/WenjinW/LATIN-Prompt"},{"id":"http://arxiv.org/abs/2303.00601v2","updated":"2023-09-07T08:28:39Z","published":"2023-03-01T15:48:27Z","title":"Multimodal Industrial Anomaly Detection via Hybrid Fusion","summary":" 2D-based Industrial Anomaly Detection has been widely discussed, however,\nmultimodal industrial anomaly detection based on 3D point clouds and RGB images\nstill has many untouched fields. Existing multimodal industrial anomaly\ndetection methods directly concatenate the multimodal features, which leads to\na strong disturbance between features and harms the detection performance. In\nthis paper, we propose Multi-3D-Memory (M3DM), a novel multimodal anomaly\ndetection method with hybrid fusion scheme: firstly, we design an unsupervised\nfeature fusion with patch-wise contrastive learning to encourage the\ninteraction of different modal features; secondly, we use a decision layer\nfusion with multiple memory banks to avoid loss of information and additional\nnovelty classifiers to make the final decision. We further propose a point\nfeature alignment operation to better align the point cloud and RGB features.\nExtensive experiments show that our multimodal industrial anomaly detection\nmodel outperforms the state-of-the-art (SOTA) methods on both detection and\nsegmentation precision on MVTec-3D AD dataset. Code is available at\nhttps://github.com/nomewang/M3DM.\n","authors":["Yue Wang","Jinlong Peng","Jiangning Zhang","Ran Yi","Yabiao Wang","Chengjie Wang"],"pdf_url":"https://arxiv.org/pdf/2303.00601v2.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2207.03367v4","updated":"2023-09-07T08:19:44Z","published":"2022-07-07T15:16:36Z","title":"Joint Super-Resolution and Inverse Tone-Mapping: A Feature Decomposition\n Aggregation Network and A New Benchmark","summary":" Joint Super-Resolution and Inverse Tone-Mapping (joint SR-ITM) aims to\nincrease the resolution and dynamic range of low-resolution and standard\ndynamic range images. Recent networks mainly resort to image decomposition\ntechniques with complex multi-branch architectures. However, the fixed\ndecomposition techniques would largely restricts their power on versatile\nimages. To exploit the potential power of decomposition mechanism, in this\npaper, we generalize it from the image domain to the broader feature domain. To\nthis end, we propose a lightweight Feature Decomposition Aggregation Network\n(FDAN). In particular, we design a Feature Decomposition Block (FDB) to achieve\nlearnable separation of detail and base feature maps, and develop a\nHierarchical Feature Decomposition Group by cascading FDBs for powerful\nmulti-level feature decomposition. Moreover, to better evaluate the comparison\nmethods, we collect a large-scale dataset for joint SR-ITM, i.e., SRITM-4K,\nwhich provides versatile scenarios for robust model training and evaluation.\nExperimental results on two benchmark datasets demonstrate that our FDAN is\nefficient and outperforms state-of-the-art methods on joint SR-ITM. 
The code of\nour FDAN and the SRITM-4K dataset are available at\nhttps://github.com/CS-GangXu/FDAN.\n","authors":["Gang Xu","Yu-chen Yang","Liang Wang","Xian-Tong Zhen","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2207.03367v4.pdf","comment":"update the authors info and the article template"},{"id":"http://arxiv.org/abs/2309.03550v1","updated":"2023-09-07T08:14:46Z","published":"2023-09-07T08:14:46Z","title":"Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance\n Fields using Geometry-Guided Text-to-Image Diffusion Model","summary":" Recent advances in diffusion models such as ControlNet have enabled\ngeometrically controllable, high-fidelity text-to-image generation. However,\nnone of them addresses the question of adding such controllability to\ntext-to-3D generation. In response, we propose Text2Control3D, a controllable\ntext-to-3D avatar generation method whose facial expression is controllable\ngiven a monocular video casually captured with hand-held camera. Our main\nstrategy is to construct the 3D avatar in Neural Radiance Fields (NeRF)\noptimized with a set of controlled viewpoint-aware images that we generate from\nControlNet, whose condition input is the depth map extracted from the input\nvideo. When generating the viewpoint-aware images, we utilize cross-reference\nattention to inject well-controlled, referential facial expression and\nappearance via cross attention. We also conduct low-pass filtering of Gaussian\nlatent of the diffusion model in order to ameliorate the viewpoint-agnostic\ntexture problem we observed from our empirical analysis, where the\nviewpoint-aware images contain identical textures on identical pixel positions\nthat are incomprehensible in 3D. Finally, to train NeRF with the images that\nare viewpoint-aware yet are not strictly consistent in geometry, our approach\nconsiders per-image geometric variation as a view of deformation from a shared\n3D canonical space. Consequently, we construct the 3D avatar in a canonical\nspace of deformable NeRF by learning a set of per-image deformation via\ndeformation field table. We demonstrate the empirical results and discuss the\neffectiveness of our method.\n","authors":["Sungwon Hwang","Junha Hyung","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2309.03550v1.pdf","comment":"Project page: https://text2control3d.github.io/"},{"id":"http://arxiv.org/abs/2305.01905v2","updated":"2023-09-07T08:13:58Z","published":"2023-05-03T05:39:12Z","title":"Localization using Multi-Focal Spatial Attention for Masked Face\n Recognition","summary":" Since the beginning of world-wide COVID-19 pandemic, facial masks have been\nrecommended to limit the spread of the disease. However, these masks hide\ncertain facial attributes. Hence, it has become difficult for existing face\nrecognition systems to perform identity verification on masked faces. In this\ncontext, it is necessary to develop masked Face Recognition (MFR) for\ncontactless biometric recognition systems. Thus, in this paper, we propose\nComplementary Attention Learning and Multi-Focal Spatial Attention that\nprecisely removes masked region by training complementary spatial attention to\nfocus on two distinct regions: masked regions and backgrounds. In our method,\nstandard spatial attention and networks focus on unmasked regions, and extract\nmask-invariant features while minimizing the loss of the conventional Face\nRecognition (FR) performance. For conventional FR, we evaluate the performance\non the IJB-C, Age-DB, CALFW, and CPLFW datasets. 
We evaluate the MFR\nperformance on the ICCV2021-MFR/Insightface track, and demonstrate the improved\nperformance on the both MFR and FR datasets. Additionally, we empirically\nverify that spatial attention of proposed method is more precisely activated in\nunmasked regions.\n","authors":["Yooshin Cho","Hanbyel Cho","Hyeong Gwon Hong","Jaesung Ahn","Dongmin Cho","JungWoo Chang","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2305.01905v2.pdf","comment":"Accepted at FG 2023 - InterID Workshop"},{"id":"http://arxiv.org/abs/2309.03549v1","updated":"2023-09-07T08:12:58Z","published":"2023-09-07T08:12:58Z","title":"Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation","summary":" Inspired by the remarkable success of Latent Diffusion Models (LDMs) for\nimage synthesis, we study LDM for text-to-video generation, which is a\nformidable challenge due to the computational and memory constraints during\nboth model training and inference. A single LDM is usually only capable of\ngenerating a very limited number of video frames. Some existing works focus on\nseparate prediction models for generating more video frames, which suffer from\nadditional training cost and frame-level jittering, however. In this paper, we\npropose a framework called \"Reuse and Diffuse\" dubbed $\\textit{VidRD}$ to\nproduce more frames following the frames already generated by an LDM.\nConditioned on an initial video clip with a small number of frames, additional\nframes are iteratively generated by reusing the original latent features and\nfollowing the previous diffusion process. Besides, for the autoencoder used for\ntranslation between pixel space and latent space, we inject temporal layers\ninto its decoder and fine-tune these layers for higher temporal consistency. We\nalso propose a set of strategies for composing video-text data that involve\ndiverse content from multiple existing datasets including video datasets for\naction recognition and image-text datasets. Extensive experiments show that our\nmethod achieves good results in both quantitative and qualitative evaluations.\nOur project page is available\n$\\href{https://anonymous0x233.github.io/ReuseAndDiffuse/}{here}$.\n","authors":["Jiaxi Gu","Shicong Wang","Haoyu Zhao","Tianyi Lu","Xing Zhang","Zuxuan Wu","Songcen Xu","Wei Zhang","Yu-Gang Jiang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2309.03549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03548v1","updated":"2023-09-07T08:11:47Z","published":"2023-09-07T08:11:47Z","title":"Trash to Treasure: Low-Light Object Detection via\n Decomposition-and-Aggregation","summary":" Object detection in low-light scenarios has attracted much attention in the\npast few years. A mainstream and representative scheme introduces enhancers as\nthe pre-processing for regular detectors. However, because of the disparity in\ntask objectives between the enhancer and detector, this paradigm cannot shine\nat its best ability. In this work, we try to arouse the potential of enhancer +\ndetector. Different from existing works, we extend the illumination-based\nenhancers (our newly designed or existing) as a scene decomposition module,\nwhose removed illumination is exploited as the auxiliary in the detector for\nextracting detection-friendly features. A semantic aggregation module is\nfurther established for integrating multi-scale scene-related semantic\ninformation in the context space. 
Actually, our built scheme successfully\ntransforms the \"trash\" (i.e., the ignored illumination in the detector) into\nthe \"treasure\" for the detector. Plenty of experiments are conducted to reveal\nour superiority against other state-of-the-art methods. The code will be public\nif it is accepted.\n","authors":["Xiaohan Cui","Long Ma","Tengyu Ma","Jinyuan Liu","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00398v2","updated":"2023-09-07T08:11:01Z","published":"2023-09-01T11:14:43Z","title":"VideoGen: A Reference-Guided Latent Diffusion Approach for High\n Definition Text-to-Video Generation","summary":" In this paper, we present VideoGen, a text-to-video generation approach,\nwhich can generate a high-definition video with high frame fidelity and strong\ntemporal consistency using reference-guided latent diffusion. We leverage an\noff-the-shelf text-to-image generation model, e.g., Stable Diffusion, to\ngenerate an image with high content quality from the text prompt, as a\nreference image to guide video generation. Then, we introduce an efficient\ncascaded latent diffusion module conditioned on both the reference image and\nthe text prompt, for generating latent video representations, followed by a\nflow-based temporal upsampling step to improve the temporal resolution.\nFinally, we map latent video representations into a high-definition video\nthrough an enhanced video decoder. During training, we use the first frame of a\nground-truth video as the reference image for training the cascaded latent\ndiffusion module. The main characterises of our approach include: the reference\nimage generated by the text-to-image model improves the visual fidelity; using\nit as the condition makes the diffusion model focus more on learning the video\ndynamics; and the video decoder is trained over unlabeled video data, thus\nbenefiting from high-quality easily-available videos. VideoGen sets a new\nstate-of-the-art in text-to-video generation in terms of both qualitative and\nquantitative evaluation. See \\url{https://videogen.github.io/VideoGen/} for\nmore samples.\n","authors":["Xin Li","Wenqing Chu","Ye Wu","Weihang Yuan","Fanglong Liu","Qi Zhang","Fu Li","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00398v2.pdf","comment":"8pages, 8figures, project page: https://videogen.github.io/VideoGen/"},{"id":"http://arxiv.org/abs/2309.03542v1","updated":"2023-09-07T08:01:07Z","published":"2023-09-07T08:01:07Z","title":"Zero-Shot Scene Graph Generation via Triplet Calibration and Reduction","summary":" Scene Graph Generation (SGG) plays a pivotal role in downstream\nvision-language tasks. Existing SGG methods typically suffer from poor\ncompositional generalizations on unseen triplets. They are generally trained on\nincompletely annotated scene graphs that contain dominant triplets and tend to\nbias toward these seen triplets during inference. To address this issue, we\npropose a Triplet Calibration and Reduction (T-CAR) framework in this paper. In\nour framework, a triplet calibration loss is first presented to regularize the\nrepresentations of diverse triplets and to simultaneously excavate the unseen\ntriplets in incompletely annotated training scene graphs. Moreover, the unseen\nspace of scene graphs is usually several times larger than the seen space since\nit contains a huge number of unrealistic compositions. 
Thus, we propose an\nunseen space reduction loss to shift the attention of excavation to reasonable\nunseen compositions to facilitate the model training. Finally, we propose a\ncontextual encoder to improve the compositional generalizations of unseen\ntriplets by explicitly modeling the relative spatial relations between subjects\nand objects. Extensive experiments show that our approach achieves consistent\nimprovements for zero-shot SGG over state-of-the-art methods. The code is\navailable at https://github.com/jkli1998/T-CAR.\n","authors":["Jiankai Li","Yunhong Wang","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2309.03542v1.pdf","comment":"Accept in TOMM 2023"},{"id":"http://arxiv.org/abs/2309.03539v1","updated":"2023-09-07T07:52:57Z","published":"2023-09-07T07:52:57Z","title":"YOLO series target detection algorithms for underwater environments","summary":" You Only Look Once (YOLO) algorithm is a representative target detection\nalgorithm emerging in 2016, which is known for its balance of computing speed\nand accuracy, and now plays an important role in various fields of human\nproduction and life. However, there are still many limitations in the\napplication of YOLO algorithm in underwater environments due to problems such\nas dim light and turbid water. With limited land area resources, the ocean must\nhave great potential for future human development. In this paper, starting from\nthe actual needs of marine engineering applications, taking underwater\nstructural health monitoring (SHM) and underwater biological detection as\nexamples, we propose improved methods for the application of underwater YOLO\nalgorithms, and point out the problems that still exist.\n","authors":["Chenjie Zhang","Pengcheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2309.03539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03535v1","updated":"2023-09-07T07:46:46Z","published":"2023-09-07T07:46:46Z","title":"Feature Enhancer Segmentation Network (FES-Net) for Vessel Segmentation","summary":" Diseases such as diabetic retinopathy and age-related macular degeneration\npose a significant risk to vision, highlighting the importance of precise\nsegmentation of retinal vessels for the tracking and diagnosis of progression.\nHowever, existing vessel segmentation methods that heavily rely on\nencoder-decoder structures struggle to capture contextual information about\nretinal vessel configurations, leading to challenges in reconciling semantic\ndisparities between encoder and decoder features. To address this, we propose a\nnovel feature enhancement segmentation network (FES-Net) that achieves accurate\npixel-wise segmentation without requiring additional image enhancement steps.\nFES-Net directly processes the input image and utilizes four prompt\nconvolutional blocks (PCBs) during downsampling, complemented by a shallow\nupsampling approach to generate a binary mask for each class. We evaluate the\nperformance of FES-Net on four publicly available state-of-the-art datasets:\nDRIVE, STARE, CHASE, and HRF. The evaluation results clearly demonstrate the\nsuperior performance of FES-Net compared to other competitive approaches\ndocumented in the existing literature.\n","authors":["Tariq M. 
Khan","Muhammad Arsalan","Shahzaib Iqbal","Imran Razzak","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2309.03535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02335v2","updated":"2023-09-07T07:46:16Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v2.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2306.15955v3","updated":"2023-09-07T07:34:21Z","published":"2023-06-28T06:37:03Z","title":"Understanding Prompt Tuning for V-L Models Through the Lens of Neural\n Collapse","summary":" Large-scale vision-language (V-L) models have demonstrated remarkable\ngeneralization capabilities for downstream tasks through prompt tuning.\nHowever, the mechanisms behind the learned text representations are unknown,\nlimiting further generalization gains, especially under class imbalance\nscenarios. Recent advances in the neural collapse (NC) phenomenon of\nvision-only models suggest that the optimal representation structure is the\nsimplex ETF, which paves the way to study representations in V-L models. In\nthis paper, we make the first attempt to use NC for examining the\nrepresentations in V-L models via prompt tuning. It is found that NC optimality\nof text-to-image representations shows a positive correlation with downstream\ngeneralizability, which is more severe under class imbalance settings. To\nimprove the representations, we propose Neural-collapse-anchored Prompt Tuning\n(NPT), a novel method that learns prompts with text and image representations\nthat satisfy the same simplex ETF. NPT incorporates two regularization terms:\nlanguage-modality collapse and multi-modality isomorphism; and it is compatible\nwith other prompt tuning methods. 
Extensive experiments show that NPT can\nconsistently help to improve existing prompt tuning techniques across 11\ndatasets for both balanced and imbalanced settings.\n","authors":["Didi Zhu","Zexi Li","Min Zhang","Junkun Yuan","Yunfeng Shao","Jiashuo Liu","Kun Kuang","Yinchuan Li","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2306.15955v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03531v1","updated":"2023-09-07T07:26:27Z","published":"2023-09-07T07:26:27Z","title":"A Robust Negative Learning Approach to Partial Domain Adaptation Using\n Source Prototypes","summary":" This work proposes a robust Partial Domain Adaptation (PDA) framework that\nmitigates the negative transfer problem by incorporating a robust\ntarget-supervision strategy. It leverages ensemble learning and includes\ndiverse, complementary label feedback, alleviating the effect of incorrect\nfeedback and promoting pseudo-label refinement. Rather than relying exclusively\non first-order moments for distribution alignment, our approach offers explicit\nobjectives to optimize intra-class compactness and inter-class separation with\nthe inferred source prototypes and highly-confident target samples in a\ndomain-invariant fashion. Notably, we ensure source data privacy by eliminating\nthe need to access the source data during the adaptation phase through a priori\ninference of source prototypes. We conducted a series of comprehensive\nexperiments, including an ablation analysis, covering a range of partial domain\nadaptation tasks. Comprehensive evaluations on benchmark datasets corroborate\nour framework's enhanced robustness and generalization, demonstrating its\nsuperiority over existing state-of-the-art PDA approaches.\n","authors":["Sandipan Choudhuri","Suli Adeniye","Arunabha Sen"],"pdf_url":"https://arxiv.org/pdf/2309.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03530v1","updated":"2023-09-07T07:23:55Z","published":"2023-09-07T07:23:55Z","title":"Efficient Single Object Detection on Image Patches with Early Exit\n Enhanced High-Precision CNNs","summary":" This paper proposes a novel approach for detecting objects using mobile\nrobots in the context of the RoboCup Standard Platform League, with a primary\nfocus on detecting the ball. The challenge lies in detecting a dynamic object\nin varying lighting conditions and blurred images caused by fast movements. To\naddress this challenge, the paper presents a convolutional neural network\narchitecture designed specifically for computationally constrained robotic\nplatforms. The proposed CNN is trained to achieve high precision classification\nof single objects in image patches and to determine their precise spatial\npositions. The paper further integrates Early Exits into the existing\nhigh-precision CNN architecture to reduce the computational cost of easily\nrejectable cases in the background class. The training process involves a\ncomposite loss function based on confidence and positional losses with dynamic\nweighting and data augmentation. The proposed approach achieves a precision of\n100% on the validation dataset and a recall of almost 87%, while maintaining an\nexecution time of around 170 $\\mu$s per hypotheses. By combining the proposed\napproach with an Early Exit, a runtime optimization of more than 28%, on\naverage, can be achieved compared to the original CNN. 
Overall, this paper\nprovides an efficient solution for an enhanced detection of objects, especially\nthe ball, in computationally constrained robotic platforms.\n","authors":["Arne Moos"],"pdf_url":"https://arxiv.org/pdf/2309.03530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03509v1","updated":"2023-09-07T06:45:43Z","published":"2023-09-07T06:45:43Z","title":"BroadCAM: Outcome-agnostic Class Activation Mapping for Small-scale\n Weakly Supervised Applications","summary":" Class activation mapping~(CAM), a visualization technique for interpreting\ndeep learning models, is now commonly used for weakly supervised semantic\nsegmentation~(WSSS) and object localization~(WSOL). It is the weighted\naggregation of the feature maps by activating the high class-relevance ones.\nCurrent CAM methods achieve it relying on the training outcomes, such as\npredicted scores~(forward information), gradients~(backward information), etc.\nHowever, when with small-scale data, unstable training may lead to less\neffective model outcomes and generate unreliable weights, finally resulting in\nincorrect activation and noisy CAM seeds. In this paper, we propose an\noutcome-agnostic CAM approach, called BroadCAM, for small-scale weakly\nsupervised applications. Since broad learning system (BLS) is independent to\nthe model learning, BroadCAM can avoid the weights being affected by the\nunreliable model outcomes when with small-scale data. By evaluating BroadCAM on\nVOC2012 (natural images) and BCSS-WSSS (medical images) for WSSS and\nOpenImages30k for WSOL, BroadCAM demonstrates superior performance than\nexisting CAM methods with small-scale data (less than 5\\%) in different CNN\narchitectures. It also achieves SOTA performance with large-scale training\ndata. Extensive qualitative comparisons are conducted to demonstrate how\nBroadCAM activates the high class-relevance feature maps and generates reliable\nCAMs when with small-scale training data.\n","authors":["Jiatai Lin","Guoqiang Han","Xuemiao Xu","Changhong Liang","Tien-Tsin Wong","C. L. Philip Chen","Zaiyi Liu","Chu Han"],"pdf_url":"https://arxiv.org/pdf/2309.03509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03508v1","updated":"2023-09-07T06:41:15Z","published":"2023-09-07T06:41:15Z","title":"Dynamic Frame Interpolation in Wavelet Domain","summary":" Video frame interpolation is an important low-level vision task, which can\nincrease frame rate for more fluent visual experience. Existing methods have\nachieved great success by employing advanced motion models and synthesis\nnetworks. However, the spatial redundancy when synthesizing the target frame\nhas not been fully explored, that can result in lots of inefficient\ncomputation. On the other hand, the computation compression degree in frame\ninterpolation is highly dependent on both texture distribution and scene\nmotion, which demands to understand the spatial-temporal information of each\ninput frame pair for a better compression degree selection. In this work, we\npropose a novel two-stage frame interpolation framework termed WaveletVFI to\naddress above problems. It first estimates intermediate optical flow with a\nlightweight motion perception network, and then a wavelet synthesis network\nuses flow aligned context features to predict multi-scale wavelet coefficients\nwith sparse convolution for efficient target frame reconstruction, where the\nsparse valid masks that control computation in each scale are determined by a\ncrucial threshold ratio. 
Instead of setting a fixed value like previous\nmethods, we find that embedding a classifier in the motion perception network\nto learn a dynamic threshold for each sample can achieve more computation\nreduction with almost no loss of accuracy. On the common high resolution and\nanimation frame interpolation benchmarks, proposed WaveletVFI can reduce\ncomputation up to 40% while maintaining similar accuracy, making it perform\nmore efficiently against other state-of-the-arts. Code is available at\nhttps://github.com/ltkong218/WaveletVFI.\n","authors":["Lingtong Kong","Boyuan Jiang","Donghao Luo","Wenqing Chu","Ying Tai","Chengjie Wang","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2309.03508v1.pdf","comment":"Accepted by IEEE TIP"},{"id":"http://arxiv.org/abs/2305.09241v4","updated":"2023-09-07T06:34:37Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v4.pdf","comment":"Accepted in MM 2023"},{"id":"http://arxiv.org/abs/2309.03506v1","updated":"2023-09-07T06:33:30Z","published":"2023-09-07T06:33:30Z","title":"Towards Robust Natural-Looking Mammography Lesion Synthesis on\n Ipsilateral Dual-Views Breast Cancer Analysis","summary":" In recent years, many mammographic image analysis methods have been\nintroduced for improving cancer classification tasks. Two major issues of\nmammogram classification tasks are leveraging multi-view mammographic\ninformation and class-imbalance handling. In the first problem, many multi-view\nmethods have been released for concatenating features of two or more views for\nthe training and inference stage. Having said that, most multi-view existing\nmethods are not explainable in the meaning of feature fusion, and treat many\nviews equally for diagnosing. 
Our work aims to propose a simple but novel\nmethod for enhancing examined view (main view) by leveraging low-level feature\ninformation from the auxiliary view (ipsilateral view) before learning the\nhigh-level feature that contains the cancerous features. For the second issue,\nwe also propose a simple but novel malignant mammogram synthesis framework for\nupsampling minor class samples. Our easy-to-implement and no-training framework\nhas eliminated the current limitation of the CutMix algorithm which is\nunreliable synthesized images with random pasted patches, hard-contour\nproblems, and domain shift problems. Our results on VinDr-Mammo and CMMD\ndatasets show the effectiveness of our two new frameworks for both multi-view\ntraining and synthesizing mammographic images, outperforming the previous\nconventional methods in our experimental settings.\n","authors":["Thanh-Huy Nguyen","Quang Hien Kha","Thai Ngoc Toan Truong","Ba Thinh Lam","Ba Hung Ngo","Quang Vinh Dinh","Nguyen Quoc Khanh Le"],"pdf_url":"https://arxiv.org/pdf/2309.03506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03504v1","updated":"2023-09-07T06:27:39Z","published":"2023-09-07T06:27:39Z","title":"Stroke-based Neural Painting and Stylization with Dynamically Predicted\n Painting Region","summary":" Stroke-based rendering aims to recreate an image with a set of strokes. Most\nexisting methods render complex images using an uniform-block-dividing\nstrategy, which leads to boundary inconsistency artifacts. To solve the\nproblem, we propose Compositional Neural Painter, a novel stroke-based\nrendering framework which dynamically predicts the next painting region based\non the current canvas, instead of dividing the image plane uniformly into\npainting regions. We start from an empty canvas and divide the painting process\ninto several steps. At each step, a compositor network trained with a phasic RL\nstrategy first predicts the next painting region, then a painter network\ntrained with a WGAN discriminator predicts stroke parameters, and a stroke\nrenderer paints the strokes onto the painting region of the current canvas.\nMoreover, we extend our method to stroke-based style transfer with a novel\ndifferentiable distance transform loss, which helps preserve the structure of\nthe input image during stroke-based stylization. Extensive experiments show our\nmodel outperforms the existing models in both stroke-based neural painting and\nstroke-based stylization. Code is available at\nhttps://github.com/sjtuplayer/Compositional_Neural_Painter\n","authors":["Teng Hu","Ran Yi","Haokun Zhu","Liang Liu","Jinlong Peng","Yabiao Wang","Chengjie Wang","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2309.03504v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.03499v1","updated":"2023-09-07T06:17:31Z","published":"2023-09-07T06:17:31Z","title":"Instance Segmentation of Dislocations in TEM Images","summary":" Quantitative Transmission Electron Microscopy (TEM) during in-situ straining\nexperiment is able to reveal the motion of dislocations -- linear defects in\nthe crystal lattice of metals. In the domain of materials science, the\nknowledge about the location and movement of dislocations is important for\ncreating novel materials with superior properties. A long-standing problem,\nhowever, is to identify the position and extract the shape of dislocations,\nwhich would ultimately help to create a digital twin of such materials. 
In this\nwork, we quantitatively compare state-of-the-art instance segmentation methods,\nincluding Mask R-CNN and YOLOv8. The dislocation masks as the results of the\ninstance segmentation are converted to mathematical lines, enabling\nquantitative analysis of dislocation length and geometry -- important\ninformation for the domain scientist, which we then propose to include as a\nnovel length-aware quality metric for estimating the network performance. Our\nsegmentation pipeline shows a high accuracy suitable for all domain-specific,\nfurther post-processing. Additionally, our physics-based metric turns out to\nperform much more consistently than typically used pixel-wise metrics.\n","authors":["Karina Ruzaeva","Kishan Govind","Marc Legros","Stefan Sandfeld"],"pdf_url":"https://arxiv.org/pdf/2309.03499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03494v1","updated":"2023-09-07T06:09:12Z","published":"2023-09-07T06:09:12Z","title":"Evaluating Deep Learning-based Melanoma Classification using\n Immunohistochemistry and Routine Histology: A Three Center Study","summary":" Pathologists routinely use immunohistochemical (IHC)-stained tissue slides\nagainst MelanA in addition to hematoxylin and eosin (H&E)-stained slides to\nimprove their accuracy in diagnosing melanomas. The use of diagnostic Deep\nLearning (DL)-based support systems for automated examination of tissue\nmorphology and cellular composition has been well studied in standard\nH&E-stained tissue slides. In contrast, there are few studies that analyze IHC\nslides using DL. Therefore, we investigated the separate and joint performance\nof ResNets trained on MelanA and corresponding H&E-stained slides. The MelanA\nclassifier achieved an area under receiver operating characteristics curve\n(AUROC) of 0.82 and 0.74 on out of distribution (OOD)-datasets, similar to the\nH&E-based benchmark classification of 0.81 and 0.75, respectively. A combined\nclassifier using MelanA and H&E achieved AUROCs of 0.85 and 0.81 on the OOD\ndatasets. DL MelanA-based assistance systems show the same performance as the\nbenchmark H&E classification and may be improved by multi stain classification\nto assist pathologists in their clinical routine.\n","authors":["Christoph Wies","Lucas Schneide","Sarah Haggenmueller","Tabea-Clara Bucher","Sarah Hobelsberger","Markus V. Heppt","Gerardo Ferrara","Eva I. Krieghoff-Henning","Titus J. Brinker"],"pdf_url":"https://arxiv.org/pdf/2309.03494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.01708v2","updated":"2023-09-07T06:07:17Z","published":"2022-08-02T19:38:48Z","title":"Autonomous Agriculture Robot for Smart Farming","summary":" This project aims to develop and demonstrate a ground robot with intelligence\ncapable of conducting semi-autonomous farm operations for different low-heights\nvegetable crops referred as Agriculture Application Robot(AAR). AAR is a\nlightweight, solar-electric powered robot that uses intelligent perception for\nconducting detection and classification of plants and their characteristics.\nThe system also has a robotic arm for the autonomous weed cutting process. The\nrobot can deliver fertilizer spraying, insecticide, herbicide, and other fluids\nto the targets such as crops, weeds, and other pests. Besides, it provides\ninformation for future research into higher-level tasks such as yield\nestimation, crop, and soil health monitoring. 
We present the design of robot\nand the associated experiments which show the promising results in real world\nenvironments.\n","authors":["Vinay Ummadi","Aravind Gundlapalle","Althaf Shaik","Shaik Mohammad Rafi B"],"pdf_url":"https://arxiv.org/pdf/2208.01708v2.pdf","comment":"Due to author interest conflicts"},{"id":"http://arxiv.org/abs/2309.03493v1","updated":"2023-09-07T06:05:28Z","published":"2023-09-07T06:05:28Z","title":"SAM3D: Segment Anything Model in Volumetric Medical Images","summary":" Image segmentation is a critical task in medical image analysis, providing\nvaluable information that helps to make an accurate diagnosis. In recent years,\ndeep learning-based automatic image segmentation methods have achieved\noutstanding results in medical images. In this paper, inspired by the Segment\nAnything Model (SAM), a foundation model that has received much attention for\nits impressive accuracy and powerful generalization ability in 2D still image\nsegmentation, we propose a SAM3D that targets at 3D volumetric medical images\nand utilizes the pre-trained features from the SAM encoder to capture\nmeaningful representations of input images. Different from other existing\nSAM-based volumetric segmentation methods that perform the segmentation by\ndividing the volume into a set of 2D slices, our model takes the whole 3D\nvolume image as input and processes it simply and effectively that avoids\ntraining a significant number of parameters. Extensive experiments are\nconducted on multiple medical image datasets to demonstrate that our network\nattains competitive results compared with other state-of-the-art methods in 3D\nmedical segmentation tasks while being significantly efficient in terms of\nparameters.\n","authors":["Nhat-Tan Bui","Dinh-Hieu Hoang","Minh-Triet Tran","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2309.03493v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.05893v2","updated":"2023-09-07T05:59:53Z","published":"2022-03-11T12:52:44Z","title":"DRTAM: Dual Rank-1 Tensor Attention Module","summary":" Recently, attention mechanisms have been extensively investigated in computer\nvision, but few of them show excellent performance on both large and mobile\nnetworks. This paper proposes Dual Rank-1 Tensor Attention Module (DRTAM), a\nnovel residual-attention-learning-guided attention module for feed-forward\nconvolutional neural networks. Given a 3D feature tensor map, DRTAM firstly\ngenerates three 2D feature descriptors along three axes. Then, using three\ndescriptors, DRTAM sequentially infers two rank-1 tensor attention maps, the\ninitial attention map and the complement attention map, combines and multiplied\nthem to the input feature map for adaptive feature refinement(see Fig.1(c)). To\ngenerate two attention maps, DRTAM introduces rank-1 tensor attention module\n(RTAM) and residual descriptors extraction module (RDEM): RTAM divides each 2D\nfeature descriptors into several chunks, and generate three factor vectors of a\nrank-1 tensor attention map by employing strip pooling on each chunk so that\nlocal and long-range contextual information can be captured along three\ndimension respectively; RDEM generates three 2D feature descriptors of the\nresidual feature to produce the complement attention map, using three factor\nvectors of the initial attention map and three descriptors of the input\nfeature. 
Extensive experimental results on ImageNet-1K, MS COCO and PASCAL VOC\ndemonstrate that DRTAM achieves competitive performance on both large and\nmobile networks compare with other state-of-the-art attention modules.\n","authors":["Hanxing Chi","Baihong Lin","Jun Hu","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2203.05893v2.pdf","comment":"There exists some problems on the experiments. Besides, we find that\n the sturcture of DRTAM can be optimized"},{"id":"http://arxiv.org/abs/2304.07803v2","updated":"2023-09-07T05:51:15Z","published":"2023-04-16T15:14:17Z","title":"EGformer: Equirectangular Geometry-biased Transformer for 360 Depth\n Estimation","summary":" Estimating the depths of equirectangular (i.e., 360) images (EIs) is\nchallenging given the distorted 180 x 360 field-of-view, which is hard to be\naddressed via convolutional neural network (CNN). Although a transformer with\nglobal attention achieves significant improvements over CNN for EI depth\nestimation task, it is computationally inefficient, which raises the need for\ntransformer with local attention. However, to apply local attention\nsuccessfully for EIs, a specific strategy, which addresses distorted\nequirectangular geometry and limited receptive field simultaneously, is\nrequired. Prior works have only cared either of them, resulting in\nunsatisfactory depths occasionally. In this paper, we propose an\nequirectangular geometry-biased transformer termed EGformer. While limiting the\ncomputational cost and the number of network parameters, EGformer enables the\nextraction of the equirectangular geometry-aware local attention with a large\nreceptive field. To achieve this, we actively utilize the equirectangular\ngeometry as the bias for the local attention instead of struggling to reduce\nthe distortion of EIs. As compared to the most recent EI depth estimation\nstudies, the proposed approach yields the best depth outcomes overall with the\nlowest computational cost and the fewest parameters, demonstrating the\neffectiveness of the proposed methods.\n","authors":["Ilwi Yun","Chanyong Shin","Hyunku Lee","Hyuk-Jae Lee","Chae Eun Rhee"],"pdf_url":"https://arxiv.org/pdf/2304.07803v2.pdf","comment":"12 pages, Accepted to ICCV23, Camera ready version"},{"id":"http://arxiv.org/abs/2307.14971v2","updated":"2023-09-07T05:44:37Z","published":"2023-07-27T16:07:03Z","title":"Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models","summary":" With the overwhelming trend of mask image modeling led by MAE, generative\npre-training has shown a remarkable potential to boost the performance of\nfundamental models in 2D vision. However, in 3D vision, the over-reliance on\nTransformer-based backbones and the unordered nature of point clouds have\nrestricted the further development of generative pre-training. In this paper,\nwe propose a novel 3D-to-2D generative pre-training method that is adaptable to\nany point cloud model. We propose to generate view images from different\ninstructed poses via the cross-attention mechanism as the pre-training scheme.\nGenerating view images has more precise supervision than its point cloud\ncounterpart, thus assisting 3D backbones to have a finer comprehension of the\ngeometrical structure and stereoscopic relations of the point cloud.\nExperimental results have proved the superiority of our proposed 3D-to-2D\ngenerative pre-training over previous pre-training methods. 
Our method is also\neffective in boosting the performance of architecture-oriented approaches,\nachieving state-of-the-art performance when fine-tuning on ScanObjectNN\nclassification and ShapeNetPart segmentation tasks. Code is available at\nhttps://github.com/wangzy22/TAP.\n","authors":["Ziyi Wang","Xumin Yu","Yongming Rao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2307.14971v2.pdf","comment":"Accepted to ICCV 2023, project page: https://tap.ivg-research.xyz"},{"id":"http://arxiv.org/abs/2309.03483v1","updated":"2023-09-07T05:13:52Z","published":"2023-09-07T05:13:52Z","title":"DetermiNet: A Large-Scale Diagnostic Dataset for Complex\n Visually-Grounded Referencing using Determiners","summary":" State-of-the-art visual grounding models can achieve high detection accuracy,\nbut they are not designed to distinguish between all objects versus only\ncertain objects of interest. In natural language, in order to specify a\nparticular object or set of objects of interest, humans use determiners such as\n\"my\", \"either\" and \"those\". Determiners, as an important word class, are a type\nof schema in natural language about the reference or quantity of the noun.\nExisting grounded referencing datasets place much less emphasis on determiners,\ncompared to other word classes such as nouns, verbs and adjectives. This makes\nit difficult to develop models that understand the full variety and complexity\nof object referencing. Thus, we have developed and released the DetermiNet\ndataset , which comprises 250,000 synthetically generated images and captions\nbased on 25 determiners. The task is to predict bounding boxes to identify\nobjects of interest, constrained by the semantics of the given determiner. We\nfind that current state-of-the-art visual grounding models do not perform well\non the dataset, highlighting the limitations of existing models on reference\nand quantification tasks.\n","authors":["Clarence Lee","M Ganesh Kumar","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2309.03483v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.03477v1","updated":"2023-09-07T04:44:38Z","published":"2023-09-07T04:44:38Z","title":"TSI-Net: A Timing Sequence Image Segmentation Network for Intracranial\n Artery Segmentation in Digital Subtraction Angiography","summary":" Cerebrovascular disease is one of the major diseases facing the world today.\nAutomatic segmentation of intracranial artery (IA) in digital subtraction\nangiography (DSA) sequences is an important step in the diagnosis of vascular\nrelated diseases and in guiding neurointerventional procedures. While, a single\nimage can only show part of the IA within the contrast medium according to the\nimaging principle of DSA technology. Therefore, 2D DSA segmentation methods are\nunable to capture the complete IA information and treatment of cerebrovascular\ndiseases. We propose A timing sequence image segmentation network with U-shape,\ncalled TSI-Net, which incorporates a bi-directional ConvGRU module (BCM) in the\nencoder. The network incorporates a bi-directional ConvGRU module (BCM) in the\nencoder, which can input variable-length DSA sequences, retain past and future\ninformation, segment them into 2D images. In addition, we introduce a sensitive\ndetail branch (SDB) at the end for supervising fine vessels. Experimented on\nthe DSA sequence dataset DIAS, the method performs significantly better than\nstate-of-the-art networks in recent years. 
In particular, it achieves a Sen\nevaluation metric of 0.797, which is a 3% improvement compared to other\nmethods.\n","authors":["Lemeng Wang","Wentao Liu","Weijin Xu","Haoyuan Li","Huihua Yang","Feng Gao"],"pdf_url":"https://arxiv.org/pdf/2309.03477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00586v2","updated":"2023-09-07T04:29:51Z","published":"2023-07-02T15:05:15Z","title":"ClipSitu: Effectively Leveraging CLIP for Conditional Predictions in\n Situation Recognition","summary":" Situation Recognition is the task of generating a structured summary of what\nis happening in an image using an activity verb and the semantic roles played\nby actors and objects. In this task, the same activity verb can describe a\ndiverse set of situations as well as the same actor or object category can play\na diverse set of semantic roles depending on the situation depicted in the\nimage. Hence a situation recognition model needs to understand the context of\nthe image and the visual-linguistic meaning of semantic roles. Therefore, we\nleverage the CLIP foundational model that has learned the context of images via\nlanguage descriptions. We show that deeper-and-wider multi-layer perceptron\n(MLP) blocks obtain noteworthy results for the situation recognition task by\nusing CLIP image and text embedding features and it even outperforms the\nstate-of-the-art CoFormer, a Transformer-based model, thanks to the external\nimplicit visual-linguistic knowledge encapsulated by CLIP and the expressive\npower of modern MLP block designs. Motivated by this, we design a\ncross-attention-based Transformer using CLIP visual tokens that model the\nrelation between textual roles and visual entities. Our cross-attention-based\nTransformer known as ClipSitu XTF outperforms existing state-of-the-art by a\nlarge margin of 14.1\\% on semantic role labelling (value) for top-1 accuracy\nusing imSitu dataset. {Similarly, our ClipSitu XTF obtains state-of-the-art\nsituation localization performance.} We will make the code publicly available.\n","authors":["Debaditya Roy","Dhruv Verma","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2307.00586v2.pdf","comment":"State-of-the-art results on Grounded Situation Recognition"},{"id":"http://arxiv.org/abs/2309.03473v1","updated":"2023-09-07T04:22:02Z","published":"2023-09-07T04:22:02Z","title":"Temporal Collection and Distribution for Referring Video Object\n Segmentation","summary":" Referring video object segmentation aims to segment a referent throughout a\nvideo sequence according to a natural language expression. It requires aligning\nthe natural language expression with the objects' motions and their dynamic\nassociations at the global video level but segmenting objects at the frame\nlevel. To achieve this goal, we propose to simultaneously maintain a global\nreferent token and a sequence of object queries, where the former is\nresponsible for capturing video-level referent according to the language\nexpression, while the latter serves to better locate and segment objects with\neach frame. Furthermore, to explicitly capture object motions and\nspatial-temporal cross-modal reasoning over objects, we propose a novel\ntemporal collection-distribution mechanism for interacting between the global\nreferent token and object queries. Specifically, the temporal collection\nmechanism collects global information for the referent token from object\nqueries to the temporal motions to the language expression. 
In turn, the\ntemporal distribution first distributes the referent token to the referent\nsequence across all frames and then performs efficient cross-frame reasoning\nbetween the referent sequence and object queries in every frame. Experimental\nresults show that our method outperforms state-of-the-art methods on all\nbenchmarks consistently and significantly.\n","authors":["Jiajin Tang","Ge Zheng","Sibei Yang"],"pdf_url":"https://arxiv.org/pdf/2309.03473v1.pdf","comment":"Accepted by ICCV 2023; Project page:\n https://toneyaya.github.io/tempcd/"},{"id":"http://arxiv.org/abs/2309.03472v1","updated":"2023-09-07T04:10:30Z","published":"2023-09-07T04:10:30Z","title":"Perceptual Quality Assessment of 360$^\\circ$ Images Based on Generative\n Scanpath Representation","summary":" Despite substantial efforts dedicated to the design of heuristic models for\nomnidirectional (i.e., 360$^\\circ$) image quality assessment (OIQA), a\nconspicuous gap remains due to the lack of consideration for the diversity of\nviewing behaviors that leads to the varying perceptual quality of 360$^\\circ$\nimages. Two critical aspects underline this oversight: the neglect of viewing\nconditions that significantly sway user gaze patterns and the overreliance on a\nsingle viewport sequence from the 360$^\\circ$ image for quality inference. To\naddress these issues, we introduce a unique generative scanpath representation\n(GSR) for effective quality inference of 360$^\\circ$ images, which aggregates\nvaried perceptual experiences of multi-hypothesis users under a predefined\nviewing condition. More specifically, given a viewing condition characterized\nby the starting point of viewing and exploration time, a set of scanpaths\nconsisting of dynamic visual fixations can be produced using an apt scanpath\ngenerator. Following this vein, we use the scanpaths to convert the 360$^\\circ$\nimage into the unique GSR, which provides a global overview of gazed-focused\ncontents derived from scanpaths. As such, the quality inference of the\n360$^\\circ$ image is swiftly transformed to that of GSR. We then propose an\nefficient OIQA computational framework by learning the quality maps of GSR.\nComprehensive experimental results validate that the predictions of the\nproposed framework are highly consistent with human perception in the\nspatiotemporal domain, especially in the challenging context of locally\ndistorted 360$^\\circ$ images under varied viewing conditions. The code will be\nreleased at https://github.com/xiangjieSui/GSR\n","authors":["Xiangjie Sui","Hanwei Zhu","Xuelin Liu","Yuming Fang","Shiqi Wang","Zhou Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03472v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.02719v2","updated":"2023-09-07T03:36:37Z","published":"2023-09-06T05:08:51Z","title":"DMKD: Improving Feature-based Knowledge Distillation for Object\n Detection Via Dual Masking Augmentation","summary":" Recent mainstream masked distillation methods function by reconstructing\nselectively masked areas of a student network from the feature map of its\nteacher counterpart. In these methods, the masked regions need to be properly\nselected, such that reconstructed features encode sufficient discrimination and\nrepresentation capability like the teacher feature. However, previous masked\ndistillation methods only focus on spatial masking, making the resulting masked\nareas biased towards spatial importance without encoding informative channel\nclues. 
In this study, we devise a Dual Masked Knowledge Distillation (DMKD)\nframework which can capture both spatially important and channel-wise\ninformative clues for comprehensive masked feature reconstruction. More\nspecifically, we employ dual attention mechanism for guiding the respective\nmasking branches, leading to reconstructed feature encoding dual significance.\nFurthermore, fusing the reconstructed features is achieved by self-adjustable\nweighting strategy for effective feature distillation. Our experiments on\nobject detection task demonstrate that the student networks achieve performance\ngains of 4.1% and 4.3% with the help of our method when RetinaNet and Cascade\nMask R-CNN are respectively used as the teacher networks, while outperforming\nthe other state-of-the-art distillation methods.\n","authors":["Guang Yang","Yin Tang","Zhijian Wu","Jun Li","Jianhua Xu","Xili Wan"],"pdf_url":"https://arxiv.org/pdf/2309.02719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03469v1","updated":"2023-09-07T03:34:51Z","published":"2023-09-07T03:34:51Z","title":"Fast FixMatch: Faster Semi-Supervised Learning with Curriculum Batch\n Size","summary":" Advances in Semi-Supervised Learning (SSL) have almost entirely closed the\ngap between SSL and Supervised Learning at a fraction of the number of labels.\nHowever, recent performance improvements have often come \\textit{at the cost of\nsignificantly increased training computation}. To address this, we propose\nCurriculum Batch Size (CBS), \\textit{an unlabeled batch size curriculum which\nexploits the natural training dynamics of deep neural networks.} A small\nunlabeled batch size is used in the beginning of training and is gradually\nincreased to the end of training. A fixed curriculum is used regardless of\ndataset, model or number of epochs, and reduced training computations is\ndemonstrated on all settings. We apply CBS, strong labeled augmentation,\nCurriculum Pseudo Labeling (CPL) \\citep{FlexMatch} to FixMatch \\citep{FixMatch}\nand term the new SSL algorithm Fast FixMatch. We perform an ablation study to\nshow that strong labeled augmentation and/or CPL do not significantly reduce\ntraining computations, but, in synergy with CBS, they achieve optimal\nperformance. Fast FixMatch also achieves substantially higher data utilization\ncompared to previous state-of-the-art. Fast FixMatch achieves between\n$2.1\\times$ - $3.4\\times$ reduced training computations on CIFAR-10 with all\nbut 40, 250 and 4000 labels removed, compared to vanilla FixMatch, while\nattaining the same cited state-of-the-art error rate \\citep{FixMatch}. Similar\nresults are achieved for CIFAR-100, SVHN and STL-10. Finally, Fast MixMatch\nachieves between $2.6\\times$ - $3.3\\times$ reduced training computations in\nfederated SSL tasks and online/streaming learning SSL tasks, which further\ndemonstrate the generializbility of Fast MixMatch to different scenarios and\ntasks.\n","authors":["John Chen","Chen Dun","Anastasios Kyrillidis"],"pdf_url":"https://arxiv.org/pdf/2309.03469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03468v1","updated":"2023-09-07T03:33:49Z","published":"2023-09-07T03:33:49Z","title":"Cross-Image Context Matters for Bongard Problems","summary":" Current machine learning methods struggle to solve Bongard problems, which\nare a type of IQ test that requires deriving an abstract \"concept\" from a set\nof positive and negative \"support\" images, and then classifying whether or not\na new query image depicts the key concept. 
On Bongard-HOI, a benchmark for\nnatural-image Bongard problems, existing methods have only reached 66% accuracy\n(where chance is 50%). Low accuracy is often attributed to neural nets' lack of\nability to find human-like symbolic rules. In this work, we point out that many\nexisting methods are forfeiting accuracy due to a much simpler problem: they do\nnot incorporate information contained in the support set as a whole, and rely\ninstead on information extracted from individual supports. This is a critical\nissue, because unlike in few-shot learning tasks concerning object\nclassification, the \"key concept\" in a typical Bongard problem can only be\ndistinguished using multiple positives and multiple negatives. We explore a\nvariety of simple methods to take this cross-image context into account, and\ndemonstrate substantial gains over prior methods, leading to new\nstate-of-the-art performance on Bongard-LOGO (75.3%) and Bongard-HOI (72.45%)\nand strong performance on the original Bongard problem set (60.84%).\n","authors":["Nikhil Raghuraman","Adam W. Harley","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2309.03468v1.pdf","comment":"Main paper: 7 pages, Appendix: 10 pages, 30 figures. Code:\n https://github.com/nraghuraman/bongard-context"},{"id":"http://arxiv.org/abs/2309.02855v2","updated":"2023-09-07T03:25:02Z","published":"2023-09-06T09:31:37Z","title":"Bandwidth-efficient Inference for Neural Image Compression","summary":" With neural networks growing deeper and feature maps growing larger, limited\ncommunication bandwidth with external memory (or DRAM) and power constraints\nbecome a bottleneck in implementing network inference on mobile and edge\ndevices. In this paper, we propose an end-to-end differentiable bandwidth\nefficient neural inference method with the activation compressed by neural data\ncompression method. Specifically, we propose a transform-quantization-entropy\ncoding pipeline for activation compression with symmetric exponential Golomb\ncoding and a data-dependent Gaussian entropy model for arithmetic coding.\nOptimized with existing model quantization methods, low-level task of image\ncompression can achieve up to 19x bandwidth reduction with 6.21x energy saving.\n","authors":["Shanzhi Yin","Tongda Xu","Yongsheng Liang","Yuanyuan Wang","Yanghao Li","Yan Wang","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2309.02855v2.pdf","comment":"9 pages, 6 figures, submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.03467v1","updated":"2023-09-07T03:22:59Z","published":"2023-09-07T03:22:59Z","title":"Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree\n Image Generation","summary":" A 360-degree (omni-directional) image provides an all-encompassing spherical\nview of a scene. Recently, there has been an increasing interest in\nsynthesising 360-degree images from conventional narrow field of view (NFoV)\nimages captured by digital cameras and smartphones, for providing immersive\nexperiences in various scenarios such as virtual reality. Yet, existing methods\ntypically fall short in synthesizing intricate visual details or ensure the\ngenerated images align consistently with user-provided prompts. In this study,\nautoregressive omni-aware generative network (AOG-Net) is proposed for\n360-degree image generation by out-painting an incomplete 360-degree image\nprogressively with NFoV and text guidances joinly or individually. 
This\nautoregressive scheme not only allows for deriving finer-grained and\ntext-consistent patterns by dynamically generating and adjusting the process\nbut also offers users greater flexibility to edit their conditions throughout\nthe generation process. A global-local conditioning mechanism is devised to\ncomprehensively formulate the outpainting guidance in each autoregressive step.\nText guidances, omni-visual cues, NFoV inputs and omni-geometry are encoded and\nfurther formulated with cross-attention based transformers into a global stream\nand a local stream into a conditioned generative backbone model. As AOG-Net is\ncompatible to leverage large-scale models for the conditional encoder and the\ngenerative prior, it enables the generation to use extensive open-vocabulary\ntext guidances. Comprehensive experiments on two commonly used 360-degree image\ndatasets for both indoor and outdoor settings demonstrate the state-of-the-art\nperformance of our proposed method. Our code will be made publicly available.\n","authors":["Zhuqiang Lu","Kun Hu","Chaoyue Wang","Lei Bai","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03467v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.14469v2","updated":"2023-09-07T02:54:20Z","published":"2023-08-28T10:15:57Z","title":"Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and\n Personalized Stylization","summary":" Realistic image super-resolution (Real-ISR) aims to reproduce perceptually\nrealistic image details from a low-quality input. The commonly used adversarial\ntraining based Real-ISR methods often introduce unnatural visual artifacts and\nfail to generate realistic textures for natural scene images. The recently\ndeveloped generative stable diffusion models provide a potential solution to\nReal-ISR with pre-learned strong image priors. However, the existing methods\nalong this line either fail to keep faithful pixel-wise image structures or\nresort to extra skipped connections to reproduce details, which requires\nadditional training in image space and limits their extension to other related\ntasks in latent space such as image stylization. In this work, we propose a\npixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well\nas personalized stylization. In specific, a pixel-aware cross attention module\nis introduced to enable diffusion models perceiving image local structures in\npixel-wise level, while a degradation removal module is used to extract\ndegradation insensitive features to guide the diffusion process together with\nimage high level information. By simply replacing the base diffusion model with\na personalized one, our method can generate diverse stylized images without the\nneed to collect pairwise training data. PASD can be easily integrated into\nexisting diffusion models such as Stable Diffusion. Experiments on Real-ISR and\npersonalized stylization demonstrate the effectiveness of our proposed\napproach. The source code and models can be found at\n\\url{https://github.com/yangxy/PASD}.\n","authors":["Tao Yang","Peiran Ren","Xuansong Xie","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.14469v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00938v2","updated":"2023-09-07T02:30:16Z","published":"2023-09-02T13:32:14Z","title":"Exploring the Robustness of Human Parsers Towards Common Corruptions","summary":" Human parsing aims to segment each pixel of the human image with fine-grained\nsemantic categories. 
However, current human parsers trained with clean data are\neasily confused by numerous image corruptions such as blur and noise. To\nimprove the robustness of human parsers, in this paper, we construct three\ncorruption robustness benchmarks, termed LIP-C, ATR-C, and\nPascal-Person-Part-C, to assist us in evaluating the risk tolerance of human\nparsing models. Inspired by the data augmentation strategy, we propose a novel\nheterogeneous augmentation-enhanced mechanism to bolster robustness under\ncommonly corrupted conditions. Specifically, two types of data augmentations\nfrom different views, i.e., image-aware augmentation and model-aware\nimage-to-image transformation, are integrated in a sequential manner for\nadapting to unforeseen image corruptions. The image-aware augmentation can\nenrich the diversity of training images with the help of common image\noperations. The model-aware augmentation strategy improves the diversity\nof input data by considering the model's randomness. The proposed method is\nmodel-agnostic, and it can be applied in a plug-and-play manner to arbitrary state-of-the-art human\nparsing frameworks. The experimental results show that the proposed method\ndemonstrates good universality and can improve the robustness of the human\nparsing models and even the semantic segmentation models when facing various\ncommon image corruptions. Meanwhile, it can still obtain comparable\nperformance on clean data.\n","authors":["Sanyi Zhang","Xiaochun Cao","Rui Wang","Guo-Jun Qi","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.00938v2.pdf","comment":"Accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2309.03453v1","updated":"2023-09-07T02:28:04Z","published":"2023-09-07T02:28:04Z","title":"SyncDreamer: Generating Multiview-consistent Images from a Single-view\n Image","summary":" In this paper, we present a novel diffusion model called SyncDreamer that generates\nmultiview-consistent images from a single-view image. Using pretrained\nlarge-scale 2D diffusion models, recent work Zero123 demonstrates the ability\nto generate plausible novel views from a single-view image of an object.\nHowever, maintaining consistency in geometry and colors for the generated\nimages remains a challenge. To address this issue, we propose a synchronized\nmultiview diffusion model that models the joint probability distribution of\nmultiview images, enabling the generation of multiview-consistent images in a\nsingle reverse process. SyncDreamer synchronizes the intermediate states of all\nthe generated images at every step of the reverse process through a 3D-aware\nfeature attention mechanism that correlates the corresponding features across\ndifferent views. Experiments show that SyncDreamer generates images with high\nconsistency across different views, thus making it well-suited for various 3D\ngeneration tasks such as novel-view-synthesis, text-to-3D, and image-to-3D.\n","authors":["Yuan Liu","Cheng Lin","Zijiao Zeng","Xiaoxiao Long","Lingjie Liu","Taku Komura","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03453v1.pdf","comment":"Project page: https://liuyuan-pal.github.io/SyncDreamer/"},{"id":"http://arxiv.org/abs/2309.03452v1","updated":"2023-09-07T02:26:55Z","published":"2023-09-07T02:26:55Z","title":"Multi-Modality Guidance Network For Missing Modality Inference","summary":" Multimodal models have gained significant success in recent years. Standard\nmultimodal approaches often assume unchanged modalities from training stage to\ninference stage. 
In practice, however, many scenarios fail to satisfy such\nassumptions with missing modalities during inference, leading to limitations on\nwhere multimodal models can be applied. While existing methods mitigate the\nproblem through reconstructing the missing modalities, it increases unnecessary\ncomputational cost, which could be just as critical, especially for large,\ndeployed systems. To solve the problem from both sides, we propose a novel\nguidance network that promotes knowledge sharing during training, taking\nadvantage of the multimodal representations to train better single-modality\nmodels for inference. Real-life experiment in violence detection shows that our\nproposed framework trains single-modality models that significantly outperform\nits traditionally trained counterparts while maintaining the same inference\ncost.\n","authors":["Zhuokai Zhao","Harish Palani","Tianyi Liu","Lena Evans","Ruth Toner"],"pdf_url":"https://arxiv.org/pdf/2309.03452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01409v2","updated":"2023-09-07T02:10:29Z","published":"2023-09-04T07:40:30Z","title":"Implicit Neural Image Stitching With Enhanced and Blended Feature\n Reconstruction","summary":" Existing frameworks for image stitching often provide visually reasonable\nstitchings. However, they suffer from blurry artifacts and disparities in\nillumination, depth level, etc. Although the recent learning-based stitchings\nrelax such disparities, the required methods impose sacrifice of image\nqualities failing to capture high-frequency details for stitched images. To\naddress the problem, we propose a novel approach, implicit Neural Image\nStitching (NIS) that extends arbitrary-scale super-resolution. Our method\nestimates Fourier coefficients of images for quality-enhancing warps. Then, the\nsuggested model blends color mismatches and misalignment in the latent space\nand decodes the features into RGB values of stitched images. Our experiments\nshow that our approach achieves improvement in resolving the low-definition\nimaging of the previous deep image stitching with favorable accelerated\nimage-enhancing methods. Our source code is available at\nhttps://github.com/minshu-kim/NIS.\n","authors":["Minsu Kim","Jaewon Lee","Byeonghun Lee","Sunghoon Im","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01406v2","updated":"2023-09-07T02:01:56Z","published":"2023-09-04T07:26:42Z","title":"Learning Residual Elastic Warps for Image Stitching under Dirichlet\n Boundary Condition","summary":" Trendy suggestions for learning-based elastic warps enable the deep image\nstitchings to align images exposed to large parallax errors. Despite the\nremarkable alignments, the methods struggle with occasional holes or\ndiscontinuity between overlapping and non-overlapping regions of a target image\nas the applied training strategy mostly focuses on overlap region alignment. As\na result, they require additional modules such as seam finder and image\ninpainting for hiding discontinuity and filling holes, respectively. In this\nwork, we suggest Recurrent Elastic Warps (REwarp) that address the problem with\nDirichlet boundary condition and boost performances by residual learning for\nrecurrent misalign correction. Specifically, REwarp predicts a homography and a\nThin-plate Spline (TPS) under the boundary constraint for discontinuity and\nhole-free image stitching. 
Our experiments show the favorable alignments and the\ncompetitive computational costs of REwarp compared to the existing stitching\nmethods. Our source code is available at https://github.com/minshu-kim/REwarp.\n","authors":["Minsu Kim","Yongjun Lee","Woo Kyoung Han","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03445v1","updated":"2023-09-07T01:58:06Z","published":"2023-09-07T01:58:06Z","title":"Underwater Image Enhancement by Transformer-based Diffusion Model with\n Non-uniform Sampling for Skip Strategy","summary":" In this paper, we present an approach to image enhancement with a diffusion\nmodel in underwater scenes. Our method adapts conditional denoising diffusion\nprobabilistic models to generate the corresponding enhanced images by using the\nunderwater images and the Gaussian noise as the inputs. Additionally, in order\nto improve the efficiency of the reverse process in the diffusion model, we\nadopt two different ways. We firstly propose a lightweight transformer-based\ndenoising network, which can effectively reduce the forward time of the network\nper iteration. On the other hand, we introduce a skip sampling strategy to\nreduce the number of iterations. Besides, based on the skip sampling strategy,\nwe propose two different non-uniform sampling methods for the sequence of the\ntime step, namely piecewise sampling and searching with the evolutionary\nalgorithm. Both of them are effective and can further improve performance with\nthe same number of steps compared to the previous uniform sampling. In the end, we\nconduct a comparative evaluation on the widely used underwater enhancement\ndatasets between the recent state-of-the-art methods and the proposed approach.\nThe experimental results show that our approach can achieve both competitive\nperformance and high efficiency. Our code is available at\n\\href{https://github.com/piggy2009/DM_underwater}{\\color{blue}{https://github.com/piggy2009/DM\_underwater}}.\n","authors":["Yi Tang","Takafumi Iwaguchi","Hiroshi Kawasaki"],"pdf_url":"https://arxiv.org/pdf/2309.03445v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14051v2","updated":"2023-09-07T01:47:22Z","published":"2023-02-27T18:59:55Z","title":"Internet Explorer: Targeted Representation Learning on the Open Web","summary":" Modern vision models typically rely on fine-tuning general-purpose models\npre-trained on large, static datasets. These general-purpose models only\ncapture the knowledge within their pre-training datasets, which are tiny,\nout-of-date snapshots of the Internet -- where billions of images are uploaded\neach day. We suggest an alternate approach: rather than hoping our static\ndatasets transfer to our desired tasks after large-scale pre-training, we\npropose dynamically utilizing the Internet to quickly train a small-scale model\nthat does extremely well on the task at hand. Our approach, called Internet\nExplorer, explores the web in a self-supervised manner to progressively find\nrelevant examples that improve performance on a desired target dataset. It\ncycles between searching for images on the Internet with text queries,\nself-supervised training on downloaded images, determining which images were\nuseful, and prioritizing what to search for next. We evaluate Internet Explorer\nacross several datasets and show that it outperforms or matches CLIP oracle\nperformance by using just a single GPU desktop to actively query the Internet\nfor 30--40 hours. 
Results, visualizations, and videos at\nhttps://internet-explorer-ssl.github.io/\n","authors":["Alexander C. Li","Ellis Brown","Alexei A. Efros","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2302.14051v2.pdf","comment":"In ICML 2023. Website at https://internet-explorer-ssl.github.io/"},{"id":"http://arxiv.org/abs/2309.03440v1","updated":"2023-09-07T01:46:17Z","published":"2023-09-07T01:46:17Z","title":"Punctate White Matter Lesion Segmentation in Preterm Infants Powered by\n Counterfactually Generative Learning","summary":" Accurate segmentation of punctate white matter lesions (PWMLs) are\nfundamental for the timely diagnosis and treatment of related developmental\ndisorders. Automated PWMLs segmentation from infant brain MR images is\nchallenging, considering that the lesions are typically small and low-contrast,\nand the number of lesions may dramatically change across subjects. Existing\nlearning-based methods directly apply general network architectures to this\nchallenging task, which may fail to capture detailed positional information of\nPWMLs, potentially leading to severe under-segmentations. In this paper, we\npropose to leverage the idea of counterfactual reasoning coupled with the\nauxiliary task of brain tissue segmentation to learn fine-grained positional\nand morphological representations of PWMLs for accurate localization and\nsegmentation. A simple and easy-to-implement deep-learning framework (i.e.,\nDeepPWML) is accordingly designed. It combines the lesion counterfactual map\nwith the tissue probability map to train a lightweight PWML segmentation\nnetwork, demonstrating state-of-the-art performance on a real-clinical dataset\nof infant T1w MR images. The code is available at\n\\href{https://github.com/ladderlab-xjtu/DeepPWML}{https://github.com/ladderlab-xjtu/DeepPWML}.\n","authors":["Zehua Ren","Yongheng Sun","Miaomiao Wang","Yuying Feng","Xianjun Li","Chao Jin","Jian Yang","Chunfeng Lian","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03440v1.pdf","comment":"10 pages, 3 figures, Medical Image Computing and Computer Assisted\n Intervention(MICCAI)"},{"id":"http://arxiv.org/abs/2309.02556v2","updated":"2023-09-07T01:29:40Z","published":"2023-09-05T19:45:27Z","title":"Domain Adaptation for Efficiently Fine-tuning Vision Transformer with\n Encrypted Images","summary":" In recent years, deep neural networks (DNNs) trained with transformed data\nhave been applied to various applications such as privacy-preserving learning,\naccess control, and adversarial defenses. However, the use of transformed data\ndecreases the performance of models. Accordingly, in this paper, we propose a\nnovel method for fine-tuning models with transformed images under the use of\nthe vision transformer (ViT). The proposed domain adaptation method does not\ncause the accuracy degradation of models, and it is carried out on the basis of\nthe embedding structure of ViT. 
In experiments, we confirmed that the proposed\nmethod prevents accuracy degradation even when using encrypted images with the\nCIFAR-10 and CIFAR-100 datasets.\n","authors":["Teru Nagamori","Sayaka Shiota","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2309.02556v2.pdf","comment":"Accepted by APSIPA 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.03773v1","updated":"2023-09-07T15:24:18Z","published":"2023-09-07T15:24:18Z","title":"Extending Transductive Knowledge Graph Embedding Models for Inductive\n Logical Relational Inference","summary":" Many downstream inference tasks for knowledge graphs, such as relation\nprediction, have been handled successfully by knowledge graph embedding\ntechniques in the transductive setting. To address the inductive setting\nwherein new entities are introduced into the knowledge graph at inference time,\nmore recent work opts for models which learn implicit representations of the\nknowledge graph through a complex function of a network's subgraph structure,\noften parametrized by graph neural network architectures. These come at the\ncost of increased parametrization, reduced interpretability and limited\ngeneralization to other downstream inference tasks. In this work, we bridge the\ngap between traditional transductive knowledge graph embedding approaches and\nmore recent inductive relation prediction models by introducing a generalized\nform of harmonic extension which leverages representations learned through\ntransductive embedding methods to infer representations of new entities\nintroduced at inference time as in the inductive setting. This harmonic\nextension technique provides the best such approximation, can be implemented\nvia an efficient iterative scheme, and can be employed to answer a family of\nconjunctive logical queries over the knowledge graph, further expanding the\ncapabilities of transductive embedding methods. In experiments on a number of\nlarge-scale knowledge graph embedding benchmarks, we find that this approach\nfor extending the functionality of transductive knowledge graph embedding\nmodels to perform knowledge graph completion and answer logical queries in the\ninductive setting is competitive with--and in some scenarios\noutperforms--several state-of-the-art models derived explicitly for such\ninductive tasks.\n","authors":["Thomas Gebhart","John Cobb"],"pdf_url":"https://arxiv.org/pdf/2309.03773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13186v2","updated":"2023-09-07T12:30:30Z","published":"2023-06-22T20:03:09Z","title":"A Decade of Scholarly Research on Open Knowledge Graphs","summary":" The proliferation of open knowledge graphs has led to a surge in scholarly\nresearch on the topic over the past decade. This paper presents a bibliometric\nanalysis of the scholarly literature on open knowledge graphs published between\n2013 and 2023. The study aims to identify the trends, patterns, and impact of\nresearch in this field, as well as the key topics and research questions that\nhave emerged. The work uses bibliometric techniques to analyze a sample of 4445\nscholarly articles retrieved from Scopus. The findings reveal an\never-increasing number of publications on open knowledge graphs published every\nyear, particularly in developed countries (+50 per year). These outputs are\npublished in highly-referred scholarly journals and conferences. 
The study\nidentifies three main research themes: (1) knowledge graph construction and\nenrichment, (2) evaluation and reuse, and (3) fusion of knowledge graphs into\nNLP systems. Within these themes, the study identifies specific tasks that have\nreceived considerable attention, including entity linking, knowledge graph\nembedding, and graph neural networks.\n","authors":["Houcemeddine Turki","Abraham Toluwase Owodunni","Mohamed Ali Hadj Taieb","René Fabrice Bile","Mohamed Ben Aouicha"],"pdf_url":"https://arxiv.org/pdf/2306.13186v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03645v1","updated":"2023-09-07T11:24:47Z","published":"2023-09-07T11:24:47Z","title":"VideolandGPT: A User Study on a Conversational Recommender System","summary":" This paper investigates how large language models (LLMs) can enhance\nrecommender systems, with a specific focus on Conversational Recommender\nSystems that leverage user preferences and personalised candidate selections\nfrom existing ranking models. We introduce VideolandGPT, a recommender system\nfor a Video-on-Demand (VOD) platform, Videoland, which uses ChatGPT to select\nfrom a predetermined set of contents, considering the additional context\nindicated by users' interactions with a chat interface. We evaluate ranking\nmetrics, user experience, and fairness of recommendations, comparing a\npersonalised and a non-personalised version of the system, in a between-subject\nuser study. Our results indicate that the personalised version outperforms the\nnon-personalised in terms of accuracy and general user satisfaction, while both\nversions increase the visibility of items which are not in the top of the\nrecommendation lists. However, both versions present inconsistent behavior in\nterms of fairness, as the system may generate recommendations which are not\navailable on Videoland.\n","authors":["Mateo Gutierrez Granada","Dina Zilbershtein","Daan Odijk","Francesco Barile"],"pdf_url":"https://arxiv.org/pdf/2309.03645v1.pdf","comment":"Preprint for KARS2023 (5th Knowledge-aware and Conversational\n Recommender Systems Workshop at RecSys2023)"},{"id":"http://arxiv.org/abs/2308.13754v2","updated":"2023-09-07T11:22:59Z","published":"2023-08-26T03:48:10Z","title":"ZC3: Zero-Shot Cross-Language Code Clone Detection","summary":" Developers introduce code clones to improve programming productivity. Many\nexisting studies have achieved impressive performance in monolingual code clone\ndetection. However, during software development, more and more developers write\nsemantically equivalent programs with different languages to support different\nplatforms and help developers translate projects from one language to another.\nConsidering that collecting cross-language parallel data, especially for\nlow-resource languages, is expensive and time-consuming, how designing an\neffective cross-language model that does not rely on any parallel data is a\nsignificant problem. In this paper, we propose a novel method named ZC3 for\nZero-shot Cross-language Code Clone detection. ZC3 designs the contrastive\nsnippet prediction to form an isomorphic representation space among different\nprogramming languages. Based on this, ZC3 exploits domain-aware learning and\ncycle consistency learning to further constrain the model to generate\nrepresentations that are aligned among different languages meanwhile are\ndiacritical for different types of clones. To evaluate our approach, we conduct\nextensive experiments on four representative cross-language clone detection\ndatasets. 
Experimental results show that ZC3 outperforms the state-of-the-art\nbaselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP score, respectively.\nWe further investigate the representational distribution of different languages\nand discuss the effectiveness of our method.\n","authors":["Jia Li","Chongyang Tao","Zhi Jin","Fang Liu","Jia Li","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.13754v2.pdf","comment":"Accepted by the 38th IEEE/ACM International Conference on Automated\n Software Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2309.03613v1","updated":"2023-09-07T10:13:09Z","published":"2023-09-07T10:13:09Z","title":"Evaluating ChatGPT as a Recommender System: A Rigorous Approach","summary":" Recent popularity surrounds large AI language models due to their impressive\nnatural language capabilities. They contribute significantly to\nlanguage-related tasks, including prompt-based learning, making them valuable\nfor various specific tasks. This approach unlocks their full potential,\nenhancing precision and generalization. Research communities are actively\nexploring their applications, with ChatGPT receiving recognition. Despite\nextensive research on large language models, their potential in recommendation\nscenarios still needs to be explored. This study aims to fill this gap by\ninvestigating ChatGPT's capabilities as a zero-shot recommender system. Our\ngoals include evaluating its ability to use user preferences for\nrecommendations, reordering existing recommendation lists, leveraging\ninformation from similar users, and handling cold-start situations. We assess\nChatGPT's performance through comprehensive experiments using three datasets\n(MovieLens Small, Last.FM, and Facebook Book). We compare ChatGPT's performance\nagainst standard recommendation algorithms and other large language models,\nsuch as GPT-3.5 and PaLM-2. To measure recommendation effectiveness, we employ\nwidely-used evaluation metrics like Mean Average Precision (MAP), Recall,\nPrecision, F1, normalized Discounted Cumulative Gain (nDCG), Item Coverage,\nExpected Popularity Complement (EPC), Average Coverage of Long Tail (ACLT),\nAverage Recommendation Popularity (ARP), and Popularity-based Ranking-based\nEqual Opportunity (PopREO). Through thoroughly exploring ChatGPT's abilities in\nrecommender systems, our study aims to contribute to the growing body of\nresearch on the versatility and potential applications of large language\nmodels. Our experiment code is available on the GitHub repository:\nhttps://github.com/sisinflab/Recommender-ChatGPT\n","authors":["Dario Di Palma","Giovanni Maria Biancofiore","Vito Walter Anelli","Fedelucio Narducci","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2309.03613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02335v2","updated":"2023-09-07T07:46:16Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. 
Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v2.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2309.03518v1","updated":"2023-09-07T06:58:34Z","published":"2023-09-07T06:58:34Z","title":"Learning Compact Compositional Embeddings via Regularized Pruning for\n Recommendation","summary":" Latent factor models are the dominant backbones of contemporary recommender\nsystems (RSs) given their performance advantages, where a unique vector\nembedding with a fixed dimensionality (e.g., 128) is required to represent each\nentity (commonly a user/item). Due to the large number of users and items on\ne-commerce sites, the embedding table is arguably the least memory-efficient\ncomponent of RSs. For any lightweight recommender that aims to efficiently\nscale with the growing size of users/items or to remain applicable in\nresource-constrained settings, existing solutions either reduce the number of\nembeddings needed via hashing, or sparsify the full embedding table to switch\noff selected embedding dimensions. However, as hash collision arises or\nembeddings become overly sparse, especially when adapting to a tighter memory\nbudget, those lightweight recommenders inevitably have to compromise their\naccuracy. To this end, we propose a novel compact embedding framework for RSs,\nnamely Compositional Embedding with Regularized Pruning (CERP). Specifically,\nCERP represents each entity by combining a pair of embeddings from two\nindependent, substantially smaller meta-embedding tables, which are then\njointly pruned via a learnable element-wise threshold. In addition, we\ninnovatively design a regularized pruning mechanism in CERP, such that the two\nsparsified meta-embedding tables are encouraged to encode information that is\nmutually complementary. Given the compatibility with agnostic latent factor\nmodels, we pair CERP with two popular recommendation models for extensive\nexperiments, where results on two real-world datasets under different memory\nbudgets demonstrate its superiority against state-of-the-art baselines. 
The\ncodebase of CERP is available in https://github.com/xurong-liang/CERP.\n","authors":["Xurong Liang","Tong Chen","Quoc Viet Hung Nguyen","Jianxin Li","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2309.03518v1.pdf","comment":"Accepted by ICDM '23"},{"id":"http://arxiv.org/abs/2309.03512v1","updated":"2023-09-07T06:46:53Z","published":"2023-09-07T06:46:53Z","title":"Behind Recommender Systems: the Geography of the ACM RecSys Community","summary":" The amount and dissemination rate of media content accessible online is\nnowadays overwhelming. Recommender Systems filter this information into\nmanageable streams or feeds, adapted to our personal needs or preferences. It\nis of utter importance that algorithms employed to filter information do not\ndistort or cut out important elements from our perspectives of the world. Under\nthis principle, it is essential to involve diverse views and teams from the\nearliest stages of their design and development. This has been highlighted, for\ninstance, in recent European Union regulations such as the Digital Services\nAct, via the requirement of risk monitoring, including the risk of\ndiscrimination, and the AI Act, through the requirement to involve people with\ndiverse backgrounds in the development of AI systems. We look into the\ngeographic diversity of the recommender systems research community,\nspecifically by analyzing the affiliation countries of the authors who\ncontributed to the ACM Conference on Recommender Systems (RecSys) during the\nlast 15 years. This study has been carried out in the framework of the\nDiversity in AI - DivinAI project, whose main objective is the long-term\nmonitoring of diversity in AI forums through a set of indexes.\n","authors":["Lorenzo Porcaro","João Vinagre","Pedro Frau","Isabelle Hupont","Emilia Gómez"],"pdf_url":"https://arxiv.org/pdf/2309.03512v1.pdf","comment":"Presented at the 6th FAccTRec Workshop: Responsible Recommendation\n (FAccTRec '23), September 18, 2023, Singapore"},{"id":"http://arxiv.org/abs/2309.03169v2","updated":"2023-09-07T03:13:39Z","published":"2023-09-06T17:09:43Z","title":"Impression-Informed Multi-Behavior Recommender System: A Hierarchical\n Graph Attention Approach","summary":" While recommender systems have significantly benefited from implicit\nfeedback, they have often missed the nuances of multi-behavior interactions\nbetween users and items. Historically, these systems either amalgamated all\nbehaviors, such as \\textit{impression} (formerly \\textit{view}),\n\\textit{add-to-cart}, and \\textit{buy}, under a singular 'interaction' label,\nor prioritized only the target behavior, often the \\textit{buy} action,\ndiscarding valuable auxiliary signals. Although recent advancements tried\naddressing this simplification, they primarily gravitated towards optimizing\nthe target behavior alone, battling with data scarcity. Additionally, they\ntended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these\ngaps, we introduce the \\textbf{H}ierarchical \\textbf{M}ulti-behavior\n\\textbf{G}raph Attention \\textbf{N}etwork (HMGN). This pioneering framework\nleverages attention mechanisms to discern information from both inter and\nintra-behaviors while employing a multi-task Hierarchical Bayesian Personalized\nRanking (HBPR) for optimization. 
Recognizing the need for scalability, our\napproach integrates a specialized multi-behavior sub-graph sampling technique.\nMoreover, the adaptability of HMGN allows for the seamless inclusion of\nknowledge metadata and time-series data. Empirical results attest to our\nmodel's prowess, registering a notable performance boost of up to 64\\% in\nNDCG@100 metrics over conventional graph neural network methods.\n","authors":["Dong Li","Divya Bhargavi","Vidya Sagar Ravipati"],"pdf_url":"https://arxiv.org/pdf/2309.03169v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.03905v1","updated":"2023-09-07T17:59:45Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v1.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2309.03893v1","updated":"2023-09-07T17:55:01Z","published":"2023-09-07T17:55:01Z","title":"DiffusionEngine: Diffusion Model is Scalable Data Engine for Object\n Detection","summary":" Data is the cornerstone of deep learning. This paper reveals that the\nrecently developed Diffusion Model is a scalable data engine for object\ndetection. Existing methods for scaling up detection-oriented data often\nrequire manual collection or generative models to obtain target images,\nfollowed by data augmentation and labeling to produce training pairs, which are\ncostly, complex, or lacking diversity. To address these issues, we\npresentDiffusionEngine (DE), a data scaling-up engine that provides\nhigh-quality detection-oriented training pairs in a single stage. 
DE consists\nof a pre-trained diffusion model and an effective Detection-Adapter,\ncontributing to generating scalable, diverse and generalizable detection data\nin a plug-and-play manner. Detection-Adapter is learned to align the implicit\nsemantic and location knowledge in off-the-shelf diffusion models with\ndetection-aware signals to make better bounding-box predictions. Additionally,\nwe contribute two datasets, i.e., COCO-DE and VOC-DE, to scale up existing\ndetection benchmarks for facilitating follow-up research. Extensive experiments\ndemonstrate that data scaling-up via DE can achieve significant improvements in\ndiverse scenarios, such as various detection algorithms, self-supervised\npre-training, data-sparse, label-scarce, cross-domain, and semi-supervised\nlearning. For example, when using DE with a DINO-based adapter to scale up\ndata, mAP is improved by 3.1% on COCO, 7.6% on VOC, and 11.5% on Clipart.\n","authors":["Manlin Zhang","Jie Wu","Yuxi Ren","Ming Li","Jie Qin","Xuefeng Xiao","Wei Liu","Rui Wang","Min Zheng","Andy J. Ma"],"pdf_url":"https://arxiv.org/pdf/2309.03893v1.pdf","comment":"Code and Models are publicly available. Project Page:\n https://mettyz.github.io/DiffusionEngine"},{"id":"http://arxiv.org/abs/2309.03891v1","updated":"2023-09-07T17:53:20Z","published":"2023-09-07T17:53:20Z","title":"ArtiGrasp: Physically Plausible Synthesis of Bi-Manual Dexterous\n Grasping and Articulation","summary":" We present ArtiGrasp, a novel method to synthesize bi-manual hand-object\ninteractions that include grasping and articulation. This task is challenging\ndue to the diversity of the global wrist motions and the precise finger control\nthat are necessary to articulate objects. ArtiGrasp leverages reinforcement\nlearning and physics simulations to train a policy that controls the global and\nlocal hand pose. Our framework unifies grasping and articulation within a\nsingle policy guided by a single hand pose reference. Moreover, to facilitate\nthe training of the precise finger control required for articulation, we\npresent a learning curriculum with increasing difficulty. It starts with\nsingle-hand manipulation of stationary objects and continues with multi-agent\ntraining including both hands and non-stationary objects. To evaluate our\nmethod, we introduce Dynamic Object Grasping and Articulation, a task that\ninvolves bringing an object into a target articulated pose. This task requires\ngrasping, relocation, and articulation. We show our method's efficacy towards\nthis task. We further demonstrate that our method can generate motions with\nnoisy hand-object pose estimates from an off-the-shelf image-based regressor.\n","authors":["Hui Zhang","Sammy Christen","Zicong Fan","Luocheng Zheng","Jemin Hwangbo","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2309.03891v1.pdf","comment":"Project page: https://eth-ait.github.io/artigrasp/"},{"id":"http://arxiv.org/abs/2308.16898v2","updated":"2023-09-07T17:50:52Z","published":"2023-08-31T17:57:50Z","title":"Transformers as Support Vector Machines","summary":" Since its inception in \"Attention Is All You Need\", transformer architecture\nhas led to revolutionary advancements in NLP. The attention layer within the\ntransformer admits a sequence of input tokens $X$ and makes them interact\nthrough pairwise similarities computed as softmax$(XQK^\\top X^\\top)$, where\n$(K,Q)$ are the trainable key-query parameters. 
In this work, we establish a\nformal equivalence between the optimization geometry of self-attention and a\nhard-margin SVM problem that separates optimal input tokens from non-optimal\ntokens using linear constraints on the outer-products of token pairs. This\nformalism allows us to characterize the implicit bias of 1-layer transformers\noptimized with gradient descent: (1) Optimizing the attention layer with\nvanishing regularization, parameterized by $(K,Q)$, converges in direction to\nan SVM solution minimizing the nuclear norm of the combined parameter\n$W=KQ^\\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm\nobjective. We characterize this convergence, highlighting that it can occur\ntoward locally-optimal directions rather than global ones. (2) Complementing\nthis, we prove the local/global directional convergence of gradient descent\nunder suitable geometric conditions. Importantly, we show that\nover-parameterization catalyzes global convergence by ensuring the feasibility\nof the SVM problem and by guaranteeing a benign optimization landscape devoid\nof stationary points. (3) While our theory applies primarily to linear\nprediction heads, we propose a more general SVM equivalence that predicts the\nimplicit bias with nonlinear heads. Our findings are applicable to arbitrary\ndatasets and their validity is verified via experiments. We also introduce\nseveral open problems and research directions. We believe these findings\ninspire the interpretation of transformers as a hierarchy of SVMs that\nseparates and selects optimal tokens.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Christos Thrampoulidis","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2308.16898v2.pdf","comment":"minor edits and update global convergence figure"},{"id":"http://arxiv.org/abs/2309.03886v1","updated":"2023-09-07T17:47:26Z","published":"2023-09-07T17:47:26Z","title":"A Function Interpretation Benchmark for Evaluating Interpretability\n Methods","summary":" Labeling neural network submodules with human-legible descriptions is useful\nfor many downstream tasks: such descriptions can surface failures, guide\ninterventions, and perhaps even explain important model behaviors. To date,\nmost mechanistic descriptions of trained networks have involved small models,\nnarrowly delimited phenomena, and large amounts of human labor. Labeling all\nhuman-interpretable sub-computations in models of increasing size and\ncomplexity will almost certainly require tools that can generate and validate\ndescriptions automatically. Recently, techniques that use learned models\nin-the-loop for labeling have begun to gain traction, but methods for\nevaluating their efficacy are limited and ad-hoc. How should we validate and\ncompare open-ended labeling tools? This paper introduces FIND (Function\nINterpretation and Description), a benchmark suite for evaluating the building\nblocks of automated interpretability methods. FIND contains functions that\nresemble components of trained neural networks, and accompanying descriptions\nof the kind we seek to generate. The functions are procedurally constructed\nacross textual and numeric domains, and involve a range of real-world\ncomplexities, including noise, composition, approximation, and bias. We\nevaluate new and existing methods that use language models (LMs) to produce\ncode-based and language descriptions of function behavior. 
We find that an\noff-the-shelf LM augmented with only black-box access to functions can\nsometimes infer their structure, acting as a scientist by forming hypotheses,\nproposing experiments, and updating descriptions in light of new data. However,\nLM-based descriptions tend to capture global function behavior and miss local\ncorruptions. These results show that FIND will be useful for characterizing the\nperformance of more sophisticated interpretability methods before they are\napplied to real-world models.\n","authors":["Sarah Schwettmann","Tamar Rott Shaham","Joanna Materzynska","Neil Chowdhury","Shuang Li","Jacob Andreas","David Bau","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2309.03886v1.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.03883v1","updated":"2023-09-07T17:45:31Z","published":"2023-09-07T17:45:31Z","title":"DoLa: Decoding by Contrasting Layers Improves Factuality in Large\n Language Models","summary":" Despite their impressive capabilities, large language models (LLMs) are prone\nto hallucinations, i.e., generating content that deviates from facts seen\nduring pretraining. We propose a simple decoding strategy for reducing\nhallucinations with pretrained LLMs that does not require conditioning on\nretrieved external knowledge nor additional fine-tuning. Our approach obtains\nthe next-token distribution by contrasting the differences in logits obtained\nfrom projecting the later layers versus earlier layers to the vocabulary space,\nexploiting the fact that factual knowledge in an LLMs has generally been shown\nto be localized to particular transformer layers. We find that this Decoding by\nContrasting Layers (DoLa) approach is able to better surface factual knowledge\nand reduce the generation of incorrect facts. DoLa consistently improves the\ntruthfulness across multiple choices tasks and open-ended generation tasks, for\nexample improving the performance of LLaMA family models on TruthfulQA by\n12-17% absolute points, demonstrating its potential in making LLMs reliably\ngenerate truthful facts.\n","authors":["Yung-Sung Chuang","Yujia Xie","Hongyin Luo","Yoon Kim","James Glass","Pengcheng He"],"pdf_url":"https://arxiv.org/pdf/2309.03883v1.pdf","comment":"The source code is available at https://github.com/voidism/DoLa"},{"id":"http://arxiv.org/abs/2309.03879v1","updated":"2023-09-07T17:44:18Z","published":"2023-09-07T17:44:18Z","title":"Better Practices for Domain Adaptation","summary":" Distribution shifts are all too common in real-world applications of machine\nlearning. Domain adaptation (DA) aims to address this by providing various\nframeworks for adapting models to the deployment data without using labels.\nHowever, the domain shift scenario raises a second more subtle challenge: the\ndifficulty of performing hyperparameter optimisation (HPO) for these adaptation\nalgorithms without access to a labelled validation set. The unclear validation\nprotocol for DA has led to bad practices in the literature, such as performing\nHPO using the target test labels when, in real-world scenarios, they are not\navailable. This has resulted in over-optimism about DA research progress\ncompared to reality. In this paper, we analyse the state of DA when using good\nevaluation practice, by benchmarking a suite of candidate validation criteria\nand using them to assess popular adaptation algorithms. 
We show that there are\nchallenges across all three branches of domain adaptation methodology including\nUnsupervised Domain Adaptation (UDA), Source-Free Domain Adaptation (SFDA), and\nTest Time Adaptation (TTA). While the results show that realistically\nachievable performance is often worse than expected, they also show that using\nproper validation splits is beneficial, as well as showing that some previously\nunexplored validation metrics provide the best options to date. Altogether, our\nimproved practices covering data, training, validation and hyperparameter\noptimisation form a new rigorous pipeline to improve benchmarking, and hence\nresearch progress, within this important field going forward.\n","authors":["Linus Ericsson","Da Li","Timothy M. Hospedales"],"pdf_url":"https://arxiv.org/pdf/2309.03879v1.pdf","comment":"AutoML 2023 (Best paper award)"},{"id":"http://arxiv.org/abs/2111.06781v3","updated":"2023-09-07T17:42:54Z","published":"2021-11-12T15:47:10Z","title":"Q-Learning for MDPs with General Spaces: Convergence and Near Optimality\n via Quantization under Weak Continuity","summary":" Reinforcement learning algorithms often require finiteness of state and\naction spaces in Markov decision processes (MDPs) (also called controlled\nMarkov chains) and various efforts have been made in the literature towards the\napplicability of such algorithms for continuous state and action spaces. In\nthis paper, we show that under very mild regularity conditions (in particular,\ninvolving only weak continuity of the transition kernel of an MDP), Q-learning\nfor standard Borel MDPs via quantization of states and actions (called\nQuantized Q-Learning) converges to a limit, and furthermore this limit\nsatisfies an optimality equation which leads to near optimality with either\nexplicit performance bounds or which are guaranteed to be asymptotically\noptimal. Our approach builds on (i) viewing quantization as a measurement\nkernel and thus a quantized MDP as a partially observed Markov decision process\n(POMDP), (ii) utilizing near optimality and convergence results of Q-learning\nfor POMDPs, and (iii) finally, near-optimality of finite state model\napproximations for MDPs with weakly continuous kernels which we show to\ncorrespond to the fixed point of the constructed POMDP. Thus, our paper\npresents a very general convergence and approximation result for the\napplicability of Q-learning for continuous MDPs.\n","authors":["Ali Devran Kara","Naci Saldi","Serdar Yüksel"],"pdf_url":"https://arxiv.org/pdf/2111.06781v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03876v1","updated":"2023-09-07T17:41:01Z","published":"2023-09-07T17:41:01Z","title":"OpinionGPT: Modelling Explicit Biases in Instruction-Tuned LLMs","summary":" Instruction-tuned Large Language Models (LLMs) have recently showcased\nremarkable ability to generate fitting responses to natural language\ninstructions. However, an open research question concerns the inherent biases\nof trained models and their responses. For instance, if the data used to tune\nan LLM is dominantly written by persons with a specific political bias, we\nmight expect generated answers to share this bias. Current research work seeks\nto de-bias such models, or suppress potentially biased answers. With this\ndemonstration, we take a different view on biases in instruction-tuning: Rather\nthan aiming to suppress them, we aim to make them explicit and transparent. 
To\nthis end, we present OpinionGPT, a web demo in which users can ask questions\nand select all biases they wish to investigate. The demo will answer this\nquestion using a model fine-tuned on text representing each of the selected\nbiases, allowing side-by-side comparison. To train the underlying model, we\nidentified 11 different biases (political, geographic, gender, age) and derived\nan instruction-tuning corpus in which each answer was written by members of one\nof these demographics. This paper presents OpinionGPT, illustrates how we\ntrained the bias-aware model and showcases the web application (available at\nhttps://opiniongpt.informatik.hu-berlin.de).\n","authors":["Patrick Haller","Ansar Aynetdinov","Alan Akbik"],"pdf_url":"https://arxiv.org/pdf/2309.03876v1.pdf","comment":"6 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2309.03873v1","updated":"2023-09-07T17:33:30Z","published":"2023-09-07T17:33:30Z","title":"A Tutorial on the Non-Asymptotic Theory of System Identification","summary":" This tutorial serves as an introduction to recently developed non-asymptotic\nmethods in the theory of -- mainly linear -- system identification. We\nemphasize tools we deem particularly useful for a range of problems in this\ndomain, such as the covering technique, the Hanson-Wright Inequality and the\nmethod of self-normalized martingales. We then employ these tools to give\nstreamlined proofs of the performance of various least-squares based estimators\nfor identifying the parameters in autoregressive models. We conclude by\nsketching out how the ideas presented herein can be extended to certain\nnonlinear identification problems.\n","authors":["Ingvar Ziemann","Anastasios Tsiamis","Bruce Lee","Yassir Jedra","Nikolai Matni","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2309.03873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13596v3","updated":"2023-09-07T17:24:08Z","published":"2023-06-23T16:35:46Z","title":"Max-Margin Token Selection in Attention Mechanism","summary":" Attention mechanism is a central component of the transformer architecture\nwhich led to the phenomenal success of large language models. However, the\ntheoretical principles underlying the attention mechanism are poorly\nunderstood, especially its nonconvex optimization dynamics. In this work, we\nexplore the seminal softmax-attention model $f(\\boldsymbol{X})=\\langle\n\\boldsymbol{Xv}, \\texttt{softmax}(\\boldsymbol{XWp})\\rangle$, where\n$\\boldsymbol{X}$ is the token sequence and\n$(\\boldsymbol{v},\\boldsymbol{W},\\boldsymbol{p})$ are trainable parameters. We\nprove that running gradient descent on $\\boldsymbol{p}$, or equivalently\n$\\boldsymbol{W}$, converges in direction to a max-margin solution that\nseparates $\\textit{locally-optimal}$ tokens from non-optimal ones. This clearly\nformalizes attention as an optimal token selection mechanism. Remarkably, our\nresults are applicable to general data and precisely characterize\n$\\textit{optimality}$ of tokens in terms of the value embeddings\n$\\boldsymbol{Xv}$ and problem geometry. We also provide a broader\nregularization path analysis that establishes the margin maximizing nature of\nattention even for nonlinear prediction heads. When optimizing $\\boldsymbol{v}$\nand $\\boldsymbol{p}$ simultaneously with logistic loss, we identify conditions\nunder which the regularization paths directionally converge to their respective\nhard-margin SVM solutions where $\\boldsymbol{v}$ separates the input features\nbased on their labels. 
Interestingly, the SVM formulation of $\\boldsymbol{p}$\nis influenced by the support vector geometry of $\\boldsymbol{v}$. Finally, we\nverify our theoretical findings via numerical experiments and provide insights.\n","authors":["Davoud Ataee Tarzanagh","Yingcong Li","Xuechen Zhang","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2306.13596v3.pdf","comment":"minor edits and update convergence analysis figure"},{"id":"http://arxiv.org/abs/2211.15341v3","updated":"2023-09-07T17:18:52Z","published":"2022-11-24T18:47:30Z","title":"Non-inferiority of Deep Learning Acute Ischemic Stroke Segmentation on\n Non-Contrast CT Compared to Expert Neuroradiologists","summary":" To determine if a convolutional neural network (CNN) deep learning model can\naccurately segment acute ischemic changes on non-contrast CT compared to\nneuroradiologists. Non-contrast CT (NCCT) examinations from 232 acute ischemic\nstroke patients who were enrolled in the DEFUSE 3 trial were included in this\nstudy. Three experienced neuroradiologists independently segmented hypodensity\nthat reflected the ischemic core on each scan. The neuroradiologist with the\nmost experience (expert A) served as the ground truth for deep learning model\ntraining. Two additional neuroradiologists (experts B and C) segmentations were\nused for data testing. The 232 studies were randomly split into training and\ntest sets. The training set was further randomly divided into 5 folds with\ntraining and validation sets. A 3-dimensional CNN architecture was trained and\noptimized to predict the segmentations of expert A from NCCT. The performance\nof the model was assessed using a set of volume, overlap, and distance metrics\nusing non-inferiority thresholds of 20%, 3ml, and 3mm. The optimized model\ntrained on expert A was compared to test experts B and C. We used a one-sided\nWilcoxon signed-rank test to test for the non-inferiority of the model-expert\ncompared to the inter-expert agreement. The final model performance for the\nischemic core segmentation task reached a performance of 0.46+-0.09 Surface\nDice at Tolerance 5mm and 0.47+-0.13 Dice when trained on expert A. Compared to\nthe two test neuroradiologists the model-expert agreement was non-inferior to\nthe inter-expert agreement, p < 0.05. The CNN accurately delineates the\nhypodense ischemic core on NCCT in acute ischemic stroke patients with an\naccuracy comparable to neuroradiologists.\n","authors":["Sophie Ostmeier","Brian Axelrod","Benjamin F. J. Verhaaren","Soren Christensen","Abdelkader Mahammedi","Yongkai Liu","Benjamin Pulli","Li-Jia Li","Greg Zaharchuk","Jeremy J. Heit"],"pdf_url":"https://arxiv.org/pdf/2211.15341v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03851v1","updated":"2023-09-07T17:07:33Z","published":"2023-09-07T17:07:33Z","title":"CenTime: Event-Conditional Modelling of Censoring in Survival Analysis","summary":" Survival analysis is a valuable tool for estimating the time until specific\nevents, such as death or cancer recurrence, based on baseline observations.\nThis is particularly useful in healthcare to prognostically predict clinically\nimportant events based on patient data. However, existing approaches often have\nlimitations; some focus only on ranking patients by survivability, neglecting\nto estimate the actual event time, while others treat the problem as a\nclassification task, ignoring the inherent time-ordered structure of the\nevents. 
Furthermore, the effective utilization of censored samples - training\ndata points where the exact event time is unknown - is essential for improving\nthe predictive accuracy of the model. In this paper, we introduce CenTime, a\nnovel approach to survival analysis that directly estimates the time to event.\nOur method features an innovative event-conditional censoring mechanism that\nperforms robustly even when uncensored data is scarce. We demonstrate that our\napproach forms a consistent estimator for the event model parameters, even in\nthe absence of uncensored data. Furthermore, CenTime is easily integrated with\ndeep learning models with no restrictions on batch size or the number of\nuncensored samples. We compare our approach with standard survival analysis\nmethods, including the Cox proportional-hazard model and DeepHit. Our results\nindicate that CenTime offers state-of-the-art performance in predicting\ntime-to-death while maintaining comparable ranking performance. Our\nimplementation is publicly available at\nhttps://github.com/ahmedhshahin/CenTime.\n","authors":["Ahmed H. Shahin","An Zhao","Alexander C. Whitehead","Daniel C. Alexander","Joseph Jacob","David Barber"],"pdf_url":"https://arxiv.org/pdf/2309.03851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08081v2","updated":"2023-09-07T17:04:12Z","published":"2023-03-14T17:13:01Z","title":"Explanation Shift: How Did the Distribution Shift Impact the Model?","summary":" As input data distributions evolve, the predictive performance of machine\nlearning models tends to deteriorate. In practice, new input data tend to come\nwithout target labels. Then, state-of-the-art techniques model input data\ndistributions or model prediction distributions and try to understand issues\nregarding the interactions between learned models and shifting distributions.\nWe suggest a novel approach that models how explanation characteristics shift\nwhen affected by distribution shifts. We find that the modeling of explanation\nshifts can be a better indicator for detecting out-of-distribution model\nbehaviour than state-of-the-art techniques. We analyze different types of\ndistribution shifts using synthetic examples and real-world data sets. We\nprovide an algorithmic method that allows us to inspect the interaction between\ndata set features and learned models and compare them to the state-of-the-art.\nWe release our methods in an open-source Python package, as well as the code\nused to reproduce our experiments.\n","authors":["Carlos Mougan","Klaus Broelemann","David Masip","Gjergji Kasneci","Thanassis Thiropanis","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2303.08081v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.12369"},{"id":"http://arxiv.org/abs/2309.03847v1","updated":"2023-09-07T17:02:32Z","published":"2023-09-07T17:02:32Z","title":"Mixtures of Gaussians are Privately Learnable with a Polynomial Number\n of Samples","summary":" We study the problem of estimating mixtures of Gaussians under the constraint\nof differential privacy (DP). Our main result is that $\\tilde{O}(k^2 d^4\n\\log(1/\\delta) / \\alpha^2 \\varepsilon)$ samples are sufficient to estimate a\nmixture of $k$ Gaussians up to total variation distance $\\alpha$ while\nsatisfying $(\\varepsilon, \\delta)$-DP. This is the first finite sample\ncomplexity upper bound for the problem that does not make any structural\nassumptions on the GMMs.\n To solve the problem, we devise a new framework which may be useful for other\ntasks. 
On a high level, we show that if a class of distributions (such as\nGaussians) is (1) list decodable and (2) admits a \"locally small'' cover\n[BKSW19] with respect to total variation distance, then the class of its\nmixtures is privately learnable. The proof circumvents a known barrier\nindicating that, unlike Gaussians, GMMs do not admit a locally small cover\n[AAL21].\n","authors":["Mohammad Afzali","Hassan Ashtiani","Christopher Liaw"],"pdf_url":"https://arxiv.org/pdf/2309.03847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02539v2","updated":"2023-09-07T16:56:18Z","published":"2023-09-05T19:19:22Z","title":"A Generalized Bandsplit Neural Network for Cinematic Audio Source\n Separation","summary":" Cinematic audio source separation is a relatively new subtask of audio source\nseparation, with the aim of extracting the dialogue stem, the music stem, and\nthe effects stem from their mixture. In this work, we developed a model\ngeneralizing the Bandsplit RNN for any complete or overcomplete partitions of\nthe frequency axis. Psycho-acoustically motivated frequency scales were used to\ninform the band definitions which are now defined with redundancy for more\nreliable feature extraction. A loss function motivated by the signal-to-noise\nratio and the sparsity-promoting property of the 1-norm was proposed. We\nadditionally exploit the information-sharing property of a common-encoder setup\nto reduce computational complexity during both training and inference, improve\nseparation performance for hard-to-generalize classes of sounds, and allow\nflexibility during inference time with easily detachable decoders. Our best\nmodel sets the state of the art on the Divide and Remaster dataset with\nperformance above the ideal ratio mask for the dialogue stem.\n","authors":["Karn N. Watcharasupat","Chih-Wei Wu","Yiwei Ding","Iroro Orife","Aaron J. Hipple","Phillip A. Williams","Scott Kramer","Alexander Lerch","William Wolcott"],"pdf_url":"https://arxiv.org/pdf/2309.02539v2.pdf","comment":"Submitted to ICASSP-OJSP 2024"},{"id":"http://arxiv.org/abs/2309.03843v1","updated":"2023-09-07T16:55:50Z","published":"2023-09-07T16:55:50Z","title":"Gradient-Based Feature Learning under Structured Data","summary":" Recent works have demonstrated that the sample complexity of gradient-based\nlearning of single index models, i.e. functions that depend on a 1-dimensional\nprojection of the input data, is governed by their information exponent.\nHowever, these results are only concerned with isotropic data, while in\npractice the input often contains additional structure which can implicitly\nguide the algorithm. In this work, we investigate the effect of a spiked\ncovariance structure and reveal several interesting phenomena. First, we show\nthat in the anisotropic setting, the commonly used spherical gradient dynamics\nmay fail to recover the true direction, even when the spike is perfectly\naligned with the target direction. Next, we show that appropriate weight\nnormalization that is reminiscent of batch normalization can alleviate this\nissue. Further, by exploiting the alignment between the (spiked) input\ncovariance and the target, we obtain improved sample complexity compared to the\nisotropic case. 
In particular, under the spiked model with a suitably large\nspike, the sample complexity of gradient-based training can be made independent\nof the information exponent while also outperforming lower bounds for\nrotationally invariant kernel methods.\n","authors":["Alireza Mousavi-Hosseini","Denny Wu","Taiji Suzuki","Murat A. Erdogdu"],"pdf_url":"https://arxiv.org/pdf/2309.03843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03842v1","updated":"2023-09-07T16:55:33Z","published":"2023-09-07T16:55:33Z","title":"Early warning via transitions in latent stochastic dynamical systems","summary":" Early warnings for dynamical transitions in complex systems or\nhigh-dimensional observation data are essential in many real world\napplications, such as gene mutation, brain diseases, natural disasters,\nfinancial crises, and engineering reliability. To effectively extract early\nwarning signals, we develop a novel approach: the directed anisotropic\ndiffusion map that captures the latent evolutionary dynamics in low-dimensional\nmanifold. Applying the methodology to authentic electroencephalogram (EEG)\ndata, we successfully find the appropriate effective coordinates, and derive\nearly warning signals capable of detecting the tipping point during the state\ntransition. Our method bridges the latent dynamics with the original dataset.\nThe framework is validated to be accurate and effective through numerical\nexperiments, in terms of density and transition probability. It is shown that\nthe second coordinate holds meaningful information for critical transition in\nvarious evaluation metrics.\n","authors":["Lingyu Feng","Ting Gao","Wang Xiao","Jinqiao Duan"],"pdf_url":"https://arxiv.org/pdf/2309.03842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03839v1","updated":"2023-09-07T16:52:27Z","published":"2023-09-07T16:52:27Z","title":"Bootstrapping Adaptive Human-Machine Interfaces with Offline\n Reinforcement Learning","summary":" Adaptive interfaces can help users perform sequential decision-making tasks\nlike robotic teleoperation given noisy, high-dimensional command signals (e.g.,\nfrom a brain-computer interface). Recent advances in human-in-the-loop machine\nlearning enable such systems to improve by interacting with users, but tend to\nbe limited by the amount of data that they can collect from individual users in\npractice. In this paper, we propose a reinforcement learning algorithm to\naddress this by training an interface to map raw command signals to actions\nusing a combination of offline pre-training and online fine-tuning. To address\nthe challenges posed by noisy command signals and sparse rewards, we develop a\nnovel method for representing and inferring the user's long-term intent for a\ngiven trajectory. We primarily evaluate our method's ability to assist users\nwho can only communicate through noisy, high-dimensional input channels through\na user study in which 12 participants performed a simulated navigation task by\nusing their eye gaze to modulate a 128-dimensional command signal from their\nwebcam. The results show that our method enables successful goal navigation\nmore often than a baseline directional interface, by learning to denoise user\ncommands signals and provide shared autonomy assistance. We further evaluate on\na simulated Sawyer pushing task with eye gaze control, and the Lunar Lander\ngame with simulated user commands, and find that our method improves over\nbaseline interfaces in these domains as well. 
Extensive ablation experiments\nwith simulated user commands empirically motivate each component of our method.\n","authors":["Jensen Gao","Siddharth Reddy","Glen Berseth","Anca D. Dragan","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2309.03839v1.pdf","comment":"Accepted to IEEE/RSJ International Conference on Intelligent Robots\n and Systems (IROS) 2023"},{"id":"http://arxiv.org/abs/2309.03837v1","updated":"2023-09-07T16:50:40Z","published":"2023-09-07T16:50:40Z","title":"Cross-Task Attention Network: Improving Multi-Task Learning for Medical\n Imaging Applications","summary":" Multi-task learning (MTL) is a powerful approach in deep learning that\nleverages the information from multiple tasks during training to improve model\nperformance. In medical imaging, MTL has shown great potential to solve various\ntasks. However, existing MTL architectures in medical imaging are limited in\nsharing information across tasks, reducing the potential performance\nimprovements of MTL. In this study, we introduce a novel attention-based MTL\nframework to better leverage inter-task interactions for various tasks from\npixel-level to image-level predictions. Specifically, we propose a Cross-Task\nAttention Network (CTAN) which utilizes cross-task attention mechanisms to\nincorporate information by interacting across tasks. We validated CTAN on four\nmedical imaging datasets that span different domains and tasks including:\nradiation treatment planning prediction using planning CT images of two\ndifferent target cancers (Prostate, OpenKBP); pigmented skin lesion\nsegmentation and diagnosis using dermatoscopic images (HAM10000); and COVID-19\ndiagnosis and severity prediction using chest CT scans (STOIC). Our study\ndemonstrates the effectiveness of CTAN in improving the accuracy of medical\nimaging tasks. Compared to standard single-task learning (STL), CTAN\ndemonstrated a 4.67% improvement in performance and outperformed both widely\nused MTL baselines: hard parameter sharing (HPS) with an average performance\nimprovement of 3.22%; and multi-task attention network (MTAN) with a relative\ndecrease of 5.38%. These findings highlight the significance of our proposed\nMTL framework in solving medical imaging tasks and its potential to improve\ntheir accuracy across domains.\n","authors":["Sangwook Kim","Thomas G. Purdie","Chris McIntosh"],"pdf_url":"https://arxiv.org/pdf/2309.03837v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.03835v1","updated":"2023-09-07T16:49:38Z","published":"2023-09-07T16:49:38Z","title":"Learning from Demonstration via Probabilistic Diagrammatic Teaching","summary":" Learning from Demonstration (LfD) enables robots to acquire new skills by\nimitating expert demonstrations, allowing users to communicate their\ninstructions in an intuitive manner. Recent progress in LfD often relies on\nkinesthetic teaching or teleoperation as the medium for users to specify the\ndemonstrations. Kinesthetic teaching requires physical handling of the robot,\nwhile teleoperation demands proficiency with additional hardware. This paper\nintroduces an alternative paradigm for LfD called Diagrammatic Teaching.\nDiagrammatic Teaching aims to teach robots novel skills by prompting the user\nto sketch out demonstration trajectories on 2D images of the scene, which are\nthen synthesised as a generative model of motion trajectories in 3D task space.\nAdditionally, we present the Ray-tracing Probabilistic Trajectory Learning\n(RPTL) framework for Diagrammatic Teaching. 
RPTL extracts time-varying\nprobability densities from the 2D sketches, applies ray-tracing to find\ncorresponding regions in 3D Cartesian space, and fits a probabilistic model of\nmotion trajectories to these regions. New motion trajectories, which mimic\nthose sketched by the user, can then be generated from the probabilistic model.\nWe empirically validate our framework both in simulation and on real robots,\nwhich include a fixed-base manipulator and a quadruped-mounted manipulator.\n","authors":["Weiming Zhi","Tianyi Zhang","Matthew Johnson-Roberson"],"pdf_url":"https://arxiv.org/pdf/2309.03835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.04151v2","updated":"2023-09-07T16:49:26Z","published":"2022-05-09T09:56:58Z","title":"Auto-SDE: Learning effective reduced dynamics from data-driven\n stochastic dynamical systems","summary":" Multiscale stochastic dynamical systems have been widely adopted to\nscientific and engineering problems due to their capability of depicting\ncomplex phenomena in many real world applications. This work is devoted to\ninvestigating the effective reduced dynamics for a slow-fast stochastic\ndynamical system. Given observation data on a short-term period satisfying some\nunknown slow-fast stochastic system, we propose a novel algorithm including a\nneural network called Auto-SDE to learn invariant slow manifold. Our approach\ncaptures the evolutionary nature of a series of time-dependent autoencoder\nneural networks with the loss constructed from a discretized stochastic\ndifferential equation. Our algorithm is also proved to be accurate, stable and\neffective through numerical experiments under various evaluation metrics.\n","authors":["Lingyu Feng","Ting Gao","Min Dai","Jinqiao Duan"],"pdf_url":"https://arxiv.org/pdf/2205.04151v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.09096v5","updated":"2023-09-07T16:46:34Z","published":"2022-03-17T05:42:00Z","title":"DeepAD: A Robust Deep Learning Model of Alzheimer's Disease Progression\n for Real-World Clinical Applications","summary":" The ability to predict the future trajectory of a patient is a key step\ntoward the development of therapeutics for complex diseases such as Alzheimer's\ndisease (AD). However, most machine learning approaches developed for\nprediction of disease progression are either single-task or single-modality\nmodels, which can not be directly adopted to our setting involving multi-task\nlearning with high dimensional images. Moreover, most of those approaches are\ntrained on a single dataset (i.e. cohort), which can not be generalized to\nother cohorts. We propose a novel multimodal multi-task deep learning model to\npredict AD progression by analyzing longitudinal clinical and neuroimaging data\nfrom multiple cohorts. Our proposed model integrates high dimensional MRI\nfeatures from a 3D convolutional neural network with other data modalities,\nincluding clinical and demographic information, to predict the future\ntrajectory of patients. Our model employs an adversarial loss to alleviate the\nstudy-specific imaging bias, in particular the inter-study domain shifts. In\naddition, a Sharpness-Aware Minimization (SAM) optimization technique is\napplied to further improve model generalization. 
The proposed model is trained\nand tested on various datasets in order to evaluate and validate the results.\nOur results showed that 1) our model yields significant improvement over the\nbaseline models, and 2) models using extracted neuroimaging features from 3D\nconvolutional neural network outperform the same models when applied to\nMRI-derived volumetric features.\n","authors":["Somaye Hashemifar","Claudia Iriondo","Evan Casey","Mohsen Hejrati","for Alzheimer's Disease Neuroimaging Initiative"],"pdf_url":"https://arxiv.org/pdf/2203.09096v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03831v1","updated":"2023-09-07T16:45:42Z","published":"2023-09-07T16:45:42Z","title":"Uncovering Drift in Textual Data: An Unsupervised Method for Detecting\n and Mitigating Drift in Machine Learning Models","summary":" Drift in machine learning refers to the phenomenon where the statistical\nproperties of data or context, in which the model operates, change over time\nleading to a decrease in its performance. Therefore, maintaining a constant\nmonitoring process for machine learning model performance is crucial in order\nto proactively prevent any potential performance regression. However,\nsupervised drift detection methods require human annotation and consequently\nlead to a longer time to detect and mitigate the drift. In our proposed\nunsupervised drift detection method, we follow a two step process. Our first\nstep involves encoding a sample of production data as the target distribution,\nand the model training data as the reference distribution. In the second step,\nwe employ a kernel-based statistical test that utilizes the maximum mean\ndiscrepancy (MMD) distance metric to compare the reference and target\ndistributions and estimate any potential drift. Our method also identifies the\nsubset of production data that is the root cause of the drift. The models\nretrained using these identified high drift samples show improved performance\non online customer experience quality metrics.\n","authors":["Saeed Khaki","Akhouri Abhinav Aditya","Zohar Karnin","Lan Ma","Olivia Pan","Samarth Marudheri Chandrashekar"],"pdf_url":"https://arxiv.org/pdf/2309.03831v1.pdf","comment":"8 pages, Accepted in 2023 Amazon Internal Machine Learning Conference"},{"id":"http://arxiv.org/abs/2309.03827v1","updated":"2023-09-07T16:40:49Z","published":"2023-09-07T16:40:49Z","title":"ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation","summary":" High Dynamic Range (HDR) content creation has become an important topic for\nmodern media and entertainment sectors, gaming and Augmented/Virtual Reality\nindustries. Many methods have been proposed to recreate the HDR counterparts of\ninput Low Dynamic Range (LDR) images/videos given a single exposure or\nmulti-exposure LDRs. The state-of-the-art methods focus primarily on the\npreservation of the reconstruction's structural similarity and the pixel-wise\naccuracy. However, these conventional approaches do not emphasize preserving\nthe artistic intent of the images in terms of human visual perception, which is\nan essential element in media, entertainment and gaming. In this paper, we\nattempt to study and fill this gap. We propose an architecture called\nArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR\nfeatures as input. 
Experimental results show that ArtHDR-Net can achieve\nstate-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean\nopinion score index) while reaching competitive performance in terms of PSNR\nand SSIM.\n","authors":["Hrishav Bakul Barua","Ganesh Krishnasamy","KokSheik Wong","Kalin Stefanov","Abhinav Dhall"],"pdf_url":"https://arxiv.org/pdf/2309.03827v1.pdf","comment":"Accepted in Asia Pacific Signal and Information Processing\n Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan"},{"id":"http://arxiv.org/abs/2309.03825v1","updated":"2023-09-07T16:34:30Z","published":"2023-09-07T16:34:30Z","title":"Prime and Modulate Learning: Generation of forward models with signed\n back-propagation and environmental cues","summary":" Deep neural networks employing error back-propagation for learning can suffer\nfrom exploding and vanishing gradient problems. Numerous solutions have been\nproposed such as normalisation techniques or limiting activation functions to\nlinear rectifying units. In this work we follow a different approach which is\nparticularly applicable to closed-loop learning of forward models where\nback-propagation makes exclusive use of the sign of the error signal to prime\nthe learning, whilst a global relevance signal modulates the rate of learning.\nThis is inspired by the interaction between local plasticity and a global\nneuromodulation. For example, whilst driving on an empty road, one can allow\nfor slow step-wise optimisation of actions, whereas, at a busy junction, an\nerror must be corrected at once. Hence, the error is the priming signal and the\nintensity of the experience is a modulating factor in the weight change. The\nadvantages of this Prime and Modulate paradigm is twofold: it is free from\nnormalisation and it makes use of relevant cues from the environment to enrich\nthe learning. We present a mathematical derivation of the learning rule in\nz-space and demonstrate the real-time performance with a robotic platform. The\nresults show a significant improvement in the speed of convergence compared to\nthat of the conventional back-propagation.\n","authors":["Sama Daryanavard","Bernd Porr"],"pdf_url":"https://arxiv.org/pdf/2309.03825v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2209.13008v4","updated":"2023-09-07T16:34:17Z","published":"2022-09-26T20:40:02Z","title":"USE-Evaluator: Performance Metrics for Medical Image Segmentation Models\n with Uncertain, Small or Empty Reference Annotations","summary":" Performance metrics for medical image segmentation models are used to measure\nthe agreement between the reference annotation and the predicted segmentation.\nUsually, overlap metrics, such as the Dice, are used as a metric to evaluate\nthe performance of these models in order for results to be comparable. However,\nthere is a mismatch between the distributions of cases and difficulty level of\nsegmentation tasks in public data sets compared to clinical practice. Common\nmetrics fail to measure the impact of this mismatch, especially for clinical\ndata sets that include low signal pathologies, a difficult segmentation task,\nand uncertain, small, or empty reference annotations. This limitation may\nresult in ineffective research of machine learning practitioners in designing\nand optimizing models. 
Dimensions of evaluating clinical value include\nconsideration of the uncertainty of reference annotations, independence from\nreference annotation volume size, and evaluation of classification of empty\nreference annotations. We study how uncertain, small, and empty reference\nannotations influence the value of metrics for medical image segmentation on an\nin-house data set regardless of the model. We examine metrics behavior on the\npredictions of a standard deep learning framework in order to identify metrics\nwith clinical value. We compare to a public benchmark data set (BraTS 2019)\nwith a high-signal pathology and certain, larger, and no empty reference\nannotations. We may show machine learning practitioners, how uncertain, small,\nor empty reference annotations require a rethinking of the evaluation and\noptimizing procedures. The evaluation code was released to encourage further\nanalysis of this topic.\nhttps://github.com/SophieOstmeier/UncertainSmallEmpty.git\n","authors":["Sophie Ostmeier","Brian Axelrod","Jeroen Bertels","Fabian Isensee","Maarten G. Lansberg","Soren Christensen","Gregory W. Albers","Li-Jia Li","Jeremy J. Heit"],"pdf_url":"https://arxiv.org/pdf/2209.13008v4.pdf","comment":"16 pages, 10 figures, Published in Medical Image Analysis"},{"id":"http://arxiv.org/abs/2308.09199v2","updated":"2023-09-07T16:33:48Z","published":"2023-08-17T22:26:48Z","title":"Polynomial Bounds for Learning Noisy Optical Physical Unclonable\n Functions and Connections to Learning With Errors","summary":" It is shown that a class of optical physical unclonable functions (PUFs) can\nbe learned to arbitrary precision with arbitrarily high probability, even in\nthe presence of noise, given access to polynomially many challenge-response\npairs and polynomially bounded computational power, under mild assumptions\nabout the distributions of the noise and challenge vectors. This extends the\nresults of Rh\\\"uramir et al. (2013), who showed a subset of this class of PUFs\nto be learnable in polynomial time in the absence of noise, under the\nassumption that the optics of the PUF were either linear or had negligible\nnonlinear effects. We derive polynomial bounds for the required number of\nsamples and the computational complexity of a linear regression algorithm,\nbased on size parameters of the PUF, the distributions of the challenge and\nnoise vectors, and the probability and accuracy of the regression algorithm,\nwith a similar analysis to one done by Bootle et al. (2018), who demonstrated a\nlearning attack on a poorly implemented version of the Learning With Errors\nproblem.\n","authors":["Apollo Albright","Boris Gelfand","Michael Dixon"],"pdf_url":"https://arxiv.org/pdf/2308.09199v2.pdf","comment":"10 pages, 2 figures, submitted to IEEE Transactions on Information\n Forensics and Security"},{"id":"http://arxiv.org/abs/2309.03824v1","updated":"2023-09-07T16:33:42Z","published":"2023-09-07T16:33:42Z","title":"Training Acceleration of Low-Rank Decomposed Networks using Sequential\n Freezing and Rank Quantization","summary":" Low Rank Decomposition (LRD) is a model compression technique applied to the\nweight tensors of deep learning models in order to reduce the number of\ntrainable parameters and computational complexity. However, due to high number\nof new layers added to the architecture after applying LRD, it may not lead to\na high training/inference acceleration if the decomposition ranks are not small\nenough. 
The issue is that using small ranks increases the risk of significant\naccuracy drop after decomposition. In this paper, we propose two techniques for\naccelerating low rank decomposed models without requiring the use of small ranks\nfor decomposition. These methods include rank optimization and sequential\nfreezing of decomposed layers. We perform experiments on both convolutional and\ntransformer-based models. Experiments show that these techniques can improve\nthe model throughput up to 60% during training and 37% during inference when\ncombined together while preserving the accuracy close to that of the original\nmodels.\n","authors":["Habib Hajimolahoseini","Walid Ahmed","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08719v2","updated":"2023-09-07T16:27:25Z","published":"2023-06-14T19:48:30Z","title":"Off-policy Evaluation in Doubly Inhomogeneous Environments","summary":" This work aims to study off-policy evaluation (OPE) under scenarios where two\nkey reinforcement learning (RL) assumptions -- temporal stationarity and\nindividual homogeneity -- are both violated. To handle the ``double\ninhomogeneities\", we propose a class of latent factor models for the reward and\nobservation transition functions, under which we develop a general OPE\nframework that consists of both model-based and model-free approaches. To our\nknowledge, this is the first paper that develops statistically sound OPE\nmethods in offline RL with double inhomogeneities. It contributes to a deeper\nunderstanding of OPE in environments where standard RL assumptions are not\nmet, and provides several practical approaches in these settings. We establish\nthe theoretical properties of the proposed value estimators and empirically\nshow that our approach outperforms competing methods that ignore either\ntemporal nonstationarity or individual heterogeneity. Finally, we illustrate\nour method on a data set from the Medical Information Mart for Intensive Care.\n","authors":["Zeyu Bian","Chengchun Shi","Zhengling Qi","Lan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.08719v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10226v5","updated":"2023-09-07T16:16:10Z","published":"2023-04-20T11:40:21Z","title":"Domain Generalization for Mammographic Image Analysis with Contrastive\n Learning","summary":" The deep learning technique has been shown to effectively address\nseveral image analysis tasks in the computer-aided diagnosis scheme for\nmammography. The training of an efficacious deep learning model requires large\ndata with diverse styles and qualities. The diversity of data often comes from\nthe use of scanners from various vendors. But, in practice, it is impractical to\ncollect a sufficient amount of diverse data for training. To this end, a novel\ncontrastive learning scheme is developed to equip the deep learning models with better\nstyle generalization capability. Specifically, the multi-style and multi-view\nunsupervised self-learning scheme is carried out to seek robust feature\nembedding against style diversity as a pretrained model. Afterward, the\npretrained network is further fine-tuned to the downstream tasks, e.g., mass\ndetection, matching, BI-RADS rating, and breast density classification. The\nproposed method has been evaluated extensively and rigorously with mammograms\nfrom various vendor style domains and several public datasets. 
The experimental\nresults suggest that the proposed domain generalization method can effectively\nimprove performance of four mammographic image tasks on the data from both seen\nand unseen domains, and outperform many state-of-the-art (SOTA) generalization\nmethods.\n","authors":["Zheren Li","Zhiming Cui","Lichi Zhang","Sheng Wang","Chenjin Lei","Xi Ouyang","Dongdong Chen","Xiangyu Zhao","Yajia Gu","Zaiyi Liu","Chunling Liu","Dinggang Shen","Jie-Zhi Cheng"],"pdf_url":"https://arxiv.org/pdf/2304.10226v5.pdf","comment":"arXiv admin note: text overlap with arXiv:2111.10827"},{"id":"http://arxiv.org/abs/2309.03818v1","updated":"2023-09-07T16:14:00Z","published":"2023-09-07T16:14:00Z","title":"Empirical Risk Minimization for Losses without Variance","summary":" This paper considers an empirical risk minimization problem under\nheavy-tailed settings, where data does not have finite variance, but only has\n$p$-th moment with $p \\in (1,2)$. Instead of using estimation procedure based\non truncated observed data, we choose the optimizer by minimizing the risk\nvalue. Those risk values can be robustly estimated via using the remarkable\nCatoni's method (Catoni, 2012). Thanks to the structure of Catoni-type\ninfluence functions, we are able to establish excess risk upper bounds via\nusing generalized generic chaining methods. Moreover, we take computational\nissues into consideration. We especially theoretically investigate two types of\noptimization methods, robust gradient descent algorithm and empirical\nrisk-based methods. With an extensive numerical study, we find that the\noptimizer based on empirical risks via Catoni-style estimation indeed shows\nbetter performance than other baselines. It indicates that estimation directly\nbased on truncated data may lead to unsatisfactory results.\n","authors":["Guanhua Fang","Ping Li","Gennady Samorodnitsky"],"pdf_url":"https://arxiv.org/pdf/2309.03818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.08901v3","updated":"2023-09-07T16:12:17Z","published":"2022-09-19T10:19:06Z","title":"Global Optimization for Cardinality-constrained Minimum Sum-of-Squares\n Clustering via Semidefinite Programming","summary":" The minimum sum-of-squares clustering (MSSC), or k-means type clustering, has\nbeen recently extended to exploit prior knowledge on the cardinality of each\ncluster. Such knowledge is used to increase performance as well as solution\nquality. In this paper, we propose a global optimization approach based on the\nbranch-and-cut technique to solve the cardinality-constrained MSSC. For the\nlower bound routine, we use the semidefinite programming (SDP) relaxation\nrecently proposed by Rujeerapaiboon et al. [SIAM J. Optim. 29(2), 1211-1239,\n(2019)]. However, this relaxation can be used in a branch-and-cut method only\nfor small-size instances. Therefore, we derive a new SDP relaxation that scales\nbetter with the instance size and the number of clusters. In both cases, we\nstrengthen the bound by adding polyhedral cuts. Benefiting from a tailored\nbranching strategy which enforces pairwise constraints, we reduce the\ncomplexity of the problems arising in the children nodes. For the upper bound,\ninstead, we present a local search procedure that exploits the solution of the\nSDP relaxation solved at each node. Computational results show that the\nproposed algorithm globally solves, for the first time, real-world instances of\nsize 10 times larger than those solved by state-of-the-art exact methods.\n","authors":["Veronica Piccialli","Antonio M. 
Sudoso"],"pdf_url":"https://arxiv.org/pdf/2209.08901v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03812v1","updated":"2023-09-07T16:09:06Z","published":"2023-09-07T16:09:06Z","title":"AnthroNet: Conditional Generation of Humans via Anthropometrics","summary":" We present a novel human body model formulated by an extensive set of\nanthropocentric measurements, which is capable of generating a wide range of\nhuman body shapes and poses. The proposed model enables direct modeling of\nspecific human identities through a deep generative architecture, which can\nproduce humans in any arbitrary pose. It is the first of its kind to have been\ntrained end-to-end using only synthetically generated data, which not only\nprovides highly accurate human mesh representations but also allows for precise\nanthropometry of the body. Moreover, using a highly diverse animation library,\nwe articulated our synthetic humans' body and hands to maximize the diversity\nof the learnable priors for model training. Our model was trained on a dataset\nof $100k$ procedurally-generated posed human meshes and their corresponding\nanthropometric measurements. Our synthetic data generator can be used to\ngenerate millions of unique human identities and poses for non-commercial\nacademic research purposes.\n","authors":["Francesco Picetti","Shrinath Deshpande","Jonathan Leban","Soroosh Shahtalebi","Jay Patel","Peifeng Jing","Chunpu Wang","Charles Metze III","Cameron Sun","Cera Laidlaw","James Warren","Kathy Huynh","River Page","Jonathan Hogins","Adam Crespi","Sujoy Ganguly","Salehe Erfanian Ebadi"],"pdf_url":"https://arxiv.org/pdf/2309.03812v1.pdf","comment":"AnthroNet's Unity data generator source code is available at:\n https://unity-technologies.github.io/AnthroNet/"},{"id":"http://arxiv.org/abs/2309.03808v1","updated":"2023-09-07T16:01:47Z","published":"2023-09-07T16:01:47Z","title":"Improved theoretical guarantee for rank aggregation via spectral method","summary":" Given pairwise comparisons between multiple items, how to rank them so that\nthe ranking matches the observations? This problem, known as rank aggregation,\nhas found many applications in sports, recommendation systems, and other web\napplications. As it is generally NP-hard to find a global ranking that\nminimizes the mismatch (known as the Kemeny optimization), we focus on the\nErd\\\"os-R\\'enyi outliers (ERO) model for this ranking problem. Here, each\npairwise comparison is a corrupted copy of the true score difference. We\ninvestigate spectral ranking algorithms that are based on unnormalized and\nnormalized data matrices. The key is to understand their performance in\nrecovering the underlying scores of each item from the observed data. This\nreduces to deriving an entry-wise perturbation error bound between the top\neigenvectors of the unnormalized/normalized data matrix and its population\ncounterpart. By using the leave-one-out technique, we provide a sharper\n$\\ell_{\\infty}$-norm perturbation bound of the eigenvectors and also derive an\nerror bound on the maximum displacement for each item, with only $\\Omega(n\\log\nn)$ samples. 
Our theoretical analysis improves upon the state-of-the-art\nresults in terms of sample complexity, and our numerical experiments confirm\nthese theoretical findings.\n","authors":["Ziliang Samuel Zhong","Shuyang Ling"],"pdf_url":"https://arxiv.org/pdf/2309.03808v1.pdf","comment":"29 pages, 6 figures"},{"id":"http://arxiv.org/abs/2212.01448v2","updated":"2023-09-07T16:01:15Z","published":"2022-12-02T21:16:39Z","title":"PGFed: Personalize Each Client's Global Objective for Federated Learning","summary":" Personalized federated learning has received an upsurge of attention due to\nthe mediocre performance of conventional federated learning (FL) over\nheterogeneous data. Unlike conventional FL which trains a single global\nconsensus model, personalized FL allows different models for different clients.\nHowever, existing personalized FL algorithms only implicitly transfer the\ncollaborative knowledge across the federation by embedding the knowledge into\nthe aggregated model or regularization. We observed that this implicit\nknowledge transfer fails to maximize the potential of each client's empirical\nrisk toward other clients. Based on our observation, in this work, we propose\nPersonalized Global Federated Learning (PGFed), a novel personalized FL\nframework that enables each client to personalize its own global objective by\nexplicitly and adaptively aggregating the empirical risks of itself and other\nclients. To avoid massive (O(N^2)) communication overhead and potential privacy\nleakage while achieving this, each client's risk is estimated through a\nfirst-order approximation for other clients' adaptive risk aggregation. On top\nof PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently\nutilize clients' empirical risks. Our extensive experiments on four datasets\nunder different federated settings show consistent improvements of PGFed over\nprevious state-of-the-art methods. The code is publicly available at\nhttps://github.com/ljaiverson/pgfed.\n","authors":["Jun Luo","Matias Mendieta","Chen Chen","Shandong Wu"],"pdf_url":"https://arxiv.org/pdf/2212.01448v2.pdf","comment":"ICCV 2023 oral"},{"id":"http://arxiv.org/abs/2308.03944v2","updated":"2023-09-07T15:59:20Z","published":"2023-08-07T23:19:34Z","title":"GraPhSyM: Graph Physical Synthesis Model","summary":" In this work, we introduce GraPhSyM, a Graph Attention Network (GATv2) model\nfor fast and accurate estimation of post-physical synthesis circuit delay and\narea metrics from pre-physical synthesis circuit netlists. Once trained,\nGraPhSyM provides accurate visibility of final design metrics to early EDA\nstages, such as logic synthesis, without running the slow physical synthesis\nflow, enabling global co-optimization across stages. Additionally, the swift\nand precise feedback provided by GraPhSyM is instrumental for\nmachine-learning-based EDA optimization frameworks. Given a gate-level netlist\nof a circuit represented as a graph, GraPhSyM utilizes graph structure,\nconnectivity, and electrical property features to predict the impact of\nphysical synthesis transformations such as buffer insertion and gate sizing.\nWhen trained on a dataset of 6000 prefix adder designs synthesized at an\naggressive delay target, GraPhSyM can accurately predict the post-synthesis\ndelay (98.3%) and area (96.1%) metrics of unseen adders with a fast 0.22s\ninference time. 
Furthermore, we illustrate the compositionality of GraPhSyM by\nemploying the model trained on a fixed delay target to accurately anticipate\npost-synthesis metrics at a variety of unseen delay targets. Lastly, we report\npromising generalization capabilities of the GraPhSyM model when it is\nevaluated on circuits different from the adders it was exclusively trained on.\nThe results show the potential for GraPhSyM to serve as a powerful tool for\nadvanced optimization techniques and as an oracle for EDA machine learning\nframeworks.\n","authors":["Ahmed Agiza","Rajarshi Roy","Teodor Dumitru Ene","Saad Godil","Sherief Reda","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2308.03944v2.pdf","comment":"Accepted at Proceedings of the 42nd International Conference on\n Computer-Aided Design (ICCAD), 2023"},{"id":"http://arxiv.org/abs/2309.03800v1","updated":"2023-09-07T15:52:48Z","published":"2023-09-07T15:52:48Z","title":"Pareto Frontiers in Neural Feature Learning: Data, Compute, Width, and\n Luck","summary":" This work investigates the nuanced algorithm design choices for deep learning\nin the presence of computational-statistical gaps. We begin by considering\noffline sparse parity learning, a supervised classification problem which\nadmits a statistical query lower bound for gradient-based training of a\nmultilayer perceptron. This lower bound can be interpreted as a multi-resource\ntradeoff frontier: successful learning can only occur if one is sufficiently\nrich (large model), knowledgeable (large dataset), patient (many training\niterations), or lucky (many random guesses). We show, theoretically and\nexperimentally, that sparse initialization and increasing network width yield\nsignificant improvements in sample efficiency in this setting. Here, width\nplays the role of parallel search: it amplifies the probability of finding\n\"lottery ticket\" neurons, which learn sparse features more sample-efficiently.\nFinally, we show that the synthetic sparse parity task can be useful as a proxy\nfor real problems requiring axis-aligned feature learning. We demonstrate\nimproved sample efficiency on tabular classification benchmarks by using wide,\nsparsely-initialized MLP models; these networks sometimes outperform tuned\nrandom forests.\n","authors":["Benjamin L. Edelman","Surbhi Goel","Sham Kakade","Eran Malach","Cyril Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03800v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1907.04483v2","updated":"2023-09-07T15:51:56Z","published":"2019-07-08T00:20:25Z","title":"Copula Representations and Error Surface Projections for the Exclusive\n Or Problem","summary":" The exclusive or (xor) function is one of the simplest examples that\nillustrate why nonlinear feedforward networks are superior to linear regression\nfor machine learning applications. We review the xor representation and\napproximation problems and discuss their solutions in terms of probabilistic\nlogic and associative copula functions. After briefly reviewing the\nspecification of feedforward networks, we compare the dynamics of learned error\nsurfaces with different activation functions such as RELU and tanh through a\nset of colorful three-dimensional charts. The copula representations extend xor\nfrom Boolean to real values, thereby providing a convenient way to demonstrate\nthe concept of cross-validation on in-sample and out-sample data sets. Our\napproach is pedagogical and is meant to be a machine learning prolegomenon.\n","authors":["Roy S. 
Freedman"],"pdf_url":"https://arxiv.org/pdf/1907.04483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03797v1","updated":"2023-09-07T15:50:48Z","published":"2023-09-07T15:50:48Z","title":"Conformal Autoregressive Generation: Beam Search with Coverage\n Guarantees","summary":" We introduce two new extensions to the beam search algorithm based on\nconformal predictions (CP) to produce sets of sequences with theoretical\ncoverage guarantees. The first method is very simple and proposes\ndynamically-sized subsets of beam search results but, unlike typical CP\nprocedures, has an upper bound on the achievable guarantee depending on a\npost-hoc calibration measure. Our second algorithm introduces the conformal set\nprediction procedure as part of the decoding process, producing a variable beam\nwidth which adapts to the current uncertainty. While more complex, this\nprocedure can achieve coverage guarantees selected a priori. We provide\nmarginal coverage bounds for each method, and evaluate them empirically on a\nselection of tasks drawing from natural language processing and chemistry.\n","authors":["Nicolas Deutschmann","Marvin Alberts","María Rodríguez Martínez"],"pdf_url":"https://arxiv.org/pdf/2309.03797v1.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2106.02613v4","updated":"2023-09-07T15:50:30Z","published":"2021-06-04T17:21:07Z","title":"Bridging the Gap Between Target Networks and Functional Regularization","summary":" Bootstrapping is behind much of the successes of deep Reinforcement Learning.\nHowever, learning the value function via bootstrapping often leads to unstable\ntraining due to fast-changing target values. Target Networks are employed to\nstabilize training by using an additional set of lagging parameters to estimate\nthe target values. Despite the popularity of Target Networks, their effect on\nthe optimization is still misunderstood. In this work, we show that they act as\nan implicit regularizer which can be beneficial in some cases, but also have\ndisadvantages such as being inflexible and can result in instabilities, even\nwhen vanilla TD(0) converges. To overcome these issues, we propose an explicit\nFunctional Regularization alternative that is flexible and a convex regularizer\nin function space and we theoretically study its convergence. We conduct an\nexperimental study across a range of environments, discount factors, and\noff-policiness data collections to investigate the effectiveness of the\nregularization induced by Target Networks and Functional Regularization in\nterms of performance, accuracy, and stability. Our findings emphasize that\nFunctional Regularization can be used as a drop-in replacement for Target\nNetworks and result in performance improvement. Furthermore, adjusting both the\nregularization weight and the network update period in Functional\nRegularization can result in further performance improvements compared to\nsolely adjusting the network update period as typically done with Target\nNetworks. 
Our approach also enhances the ability of networks to recover\naccurate $Q$-values.\n","authors":["Alexandre Piché","Valentin Thomas","Rafael Pardinas","Joseph Marino","Gian Maria Marconi","Christopher Pal","Mohammad Emtiyaz Khan"],"pdf_url":"https://arxiv.org/pdf/2106.02613v4.pdf","comment":"The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2309.03791v1","updated":"2023-09-07T15:41:45Z","published":"2023-09-07T15:41:45Z","title":"Adversarially Robust Deep Learning with Optimal-Transport-Regularized\n Divergences","summary":" We introduce the $ARMOR_D$ methods as novel approaches to enhancing the\nadversarial robustness of deep learning models. These methods are based on a\nnew class of optimal-transport-regularized divergences, constructed via an\ninfimal convolution between an information divergence and an optimal-transport\n(OT) cost. We use these as tools to enhance adversarial robustness by\nmaximizing the expected loss over a neighborhood of distributions, a technique\nknown as distributionally robust optimization. Viewed as a tool for\nconstructing adversarial samples, our method allows samples to be both\ntransported, according to the OT cost, and re-weighted, according to the\ninformation divergence. We demonstrate the effectiveness of our method on\nmalware detection and image recognition applications and find that, to our\nknowledge, it outperforms existing methods at enhancing the robustness against\nadversarial attacks. $ARMOR_D$ yields the robustified accuracy of $98.29\\%$\nagainst $FGSM$ and $98.18\\%$ against $PGD^{40}$ on the MNIST dataset, reducing\nthe error rate by more than $19.7\\%$ and $37.2\\%$ respectively compared to\nprior methods. Similarly, in malware detection, a discrete (binary) data\ndomain, $ARMOR_D$ improves the robustified accuracy under $rFGSM^{50}$ attack\ncompared to the previous best-performing adversarial training methods by\n$37.0\\%$ while lowering false negative and false positive rates by $51.1\\%$ and\n$57.53\\%$, respectively.\n","authors":["Jeremiah Birrell","Mohammadreza Ebrahimi"],"pdf_url":"https://arxiv.org/pdf/2309.03791v1.pdf","comment":"30 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.03779v1","updated":"2023-09-07T15:28:03Z","published":"2023-09-07T15:28:03Z","title":"CPU frequency scheduling of real-time applications on embedded devices\n with temporal encoding-based deep reinforcement learning","summary":" Small devices are frequently used in IoT and smart-city applications to\nperform periodic dedicated tasks with soft deadlines. This work focuses on\ndeveloping methods to derive efficient power-management methods for periodic\ntasks on small devices. We first study the limitations of the existing Linux\nbuilt-in methods used in small devices. We illustrate three typical\nworkload/system patterns that are challenging to manage with Linux's built-in\nsolutions. We develop a reinforcement-learning-based technique with temporal\nencoding to derive an effective DVFS governor even in the presence of the\nthree system patterns. The derived governor uses only one performance counter,\nthe same as the built-in Linux mechanism, and does not require an explicit task\nmodel for the workload. We implemented a prototype system on the Nvidia Jetson\nNano Board and experimented with it with six applications, including two\nself-designed and four benchmark applications. 
Under different deadline\nconstraints, our approach can quickly derive a DVFS governor that can adapt to\nperformance requirements and outperform the built-in Linux approach in energy\nsaving. On Mibench workloads, with performance slack ranging from 0.04 s to 0.4\ns, the proposed method can save 3% - 11% more energy compared to Ondemand.\nAudioReg and FaceReg applications tested have 5%- 14% energy-saving\nimprovement. We have open-sourced the implementation of our in-kernel quantized\nneural network engine. The codebase can be found at:\nhttps://github.com/coladog/tinyagent.\n","authors":["Ti Zhou","Man Lin"],"pdf_url":"https://arxiv.org/pdf/2309.03779v1.pdf","comment":"Accepted to Journal of Systems Architecture"},{"id":"http://arxiv.org/abs/2309.03774v1","updated":"2023-09-07T15:25:47Z","published":"2023-09-07T15:25:47Z","title":"Deep Learning Safety Concerns in Automated Driving Perception","summary":" Recent advances in the field of deep learning and impressive performance of\ndeep neural networks (DNNs) for perception have resulted in an increased demand\nfor their use in automated driving (AD) systems. The safety of such systems is\nof utmost importance and thus requires to consider the unique properties of\nDNNs.\n In order to achieve safety of AD systems with DNN-based perception components\nin a systematic and comprehensive approach, so-called safety concerns have been\nintroduced as a suitable structuring element. On the one hand, the concept of\nsafety concerns is -- by design -- well aligned to existing standards relevant\nfor safety of AD systems such as ISO 21448 (SOTIF). On the other hand, it has\nalready inspired several academic publications and upcoming standards on AI\nsafety such as ISO PAS 8800.\n While the concept of safety concerns has been previously introduced, this\npaper extends and refines it, leveraging feedback from various domain and\nsafety experts in the field. In particular, this paper introduces an additional\ncategorization for a better understanding as well as enabling cross-functional\nteams to jointly address the concerns.\n","authors":["Stephanie Abrecht","Alexander Hirsch","Shervin Raafatnia","Matthias Woehrle"],"pdf_url":"https://arxiv.org/pdf/2309.03774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02976v2","updated":"2023-09-07T15:23:29Z","published":"2023-09-06T13:20:31Z","title":"Natural and Robust Walking using Reinforcement Learning without\n Demonstrations in High-Dimensional Musculoskeletal Models","summary":" Humans excel at robust bipedal walking in complex natural environments. In\neach step, they adequately tune the interaction of biomechanical muscle\ndynamics and neuronal signals to be robust against uncertainties in ground\nconditions. However, it is still not fully understood how the nervous system\nresolves the musculoskeletal redundancy to solve the multi-objective control\nproblem considering stability, robustness, and energy efficiency. In computer\nsimulations, energy minimization has been shown to be a successful optimization\ntarget, reproducing natural walking with trajectory optimization or\nreflex-based control methods. However, these methods focus on particular\nmotions at a time and the resulting controllers are limited when compensating\nfor perturbations. In robotics, reinforcement learning~(RL) methods recently\nachieved highly stable (and efficient) locomotion on quadruped systems, but the\ngeneration of human-like walking with bipedal biomechanical models has required\nextensive use of expert data sets. 
This strong reliance on demonstrations often\nresults in brittle policies and limits the application to new behaviors,\nespecially considering the potential variety of movements for high-dimensional\nmusculoskeletal models in 3D. Achieving natural locomotion with RL without\nsacrificing its incredible robustness might pave the way for a novel approach\nto studying human walking in complex natural environments. Videos:\nhttps://sites.google.com/view/naturalwalkingrl\n","authors":["Pierre Schumacher","Thomas Geijtenbeek","Vittorio Caggiano","Vikash Kumar","Syn Schmitt","Georg Martius","Daniel F. B. Haeufle"],"pdf_url":"https://arxiv.org/pdf/2309.02976v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03770v1","updated":"2023-09-07T15:17:10Z","published":"2023-09-07T15:17:10Z","title":"Neural lasso: a unifying approach of lasso and neural networks","summary":" In recent years, there is a growing interest in combining techniques\nattributed to the areas of Statistics and Machine Learning in order to obtain\nthe benefits of both approaches. In this article, the statistical technique\nlasso for variable selection is represented through a neural network. It is\nobserved that, although both the statistical approach and its neural version\nhave the same objective function, they differ due to their optimization. In\nparticular, the neural version is usually optimized in one-step using a single\nvalidation set, while the statistical counterpart uses a two-step optimization\nbased on cross-validation. The more elaborated optimization of the statistical\nmethod results in more accurate parameter estimation, especially when the\ntraining set is small. For this reason, a modification of the standard approach\nfor training neural networks, that mimics the statistical framework, is\nproposed. During the development of the above modification, a new optimization\nalgorithm for identifying the significant variables emerged. Experimental\nresults, using synthetic and real data sets, show that this new optimization\nalgorithm achieves better performance than any of the three previous\noptimization approaches.\n","authors":["David Delgado","Ernesto Curbelo","Danae Carreras"],"pdf_url":"https://arxiv.org/pdf/2309.03770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03759v1","updated":"2023-09-07T15:00:58Z","published":"2023-09-07T15:00:58Z","title":"M(otion)-mode Based Prediction of Ejection Fraction using\n Echocardiograms","summary":" Early detection of cardiac dysfunction through routine screening is vital for\ndiagnosing cardiovascular diseases. An important metric of cardiac function is\nthe left ventricular ejection fraction (EF), where lower EF is associated with\ncardiomyopathy. Echocardiography is a popular diagnostic tool in cardiology,\nwith ultrasound being a low-cost, real-time, and non-ionizing technology.\nHowever, human assessment of echocardiograms for calculating EF is\ntime-consuming and expertise-demanding, raising the need for an automated\napproach. In this work, we propose using the M(otion)-mode of echocardiograms\nfor estimating the EF and classifying cardiomyopathy. We generate multiple\nartificial M-mode images from a single echocardiogram and combine them using\noff-the-shelf model architectures. Additionally, we extend contrastive learning\n(CL) to cardiac imaging to learn meaningful representations from exploiting\nstructures in unlabeled data allowing the model to achieve high accuracy, even\nwith limited annotations. 
Our experiments show that the supervised setting\nconverges with only ten modes and is comparable to the baseline method while\nbypassing its cumbersome training process and being computationally much more\nefficient. Furthermore, CL using M-mode images is helpful for limited data\nscenarios, such as having labels for only 200 patients, which is common in\nmedical applications.\n","authors":["Ece Ozkan","Thomas M. Sutter","Yurong Hu","Sebastian Balzer","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2309.03759v1.pdf","comment":"Accepted at GCPR 2023"},{"id":"http://arxiv.org/abs/2309.03755v1","updated":"2023-09-07T14:51:42Z","published":"2023-09-07T14:51:42Z","title":"TSGBench: Time Series Generation Benchmark","summary":" Synthetic Time Series Generation (TSG) is crucial in a range of applications,\nincluding data augmentation, anomaly detection, and privacy preservation.\nAlthough significant strides have been made in this field, existing methods\nexhibit three key limitations: (1) They often benchmark against similar model\ntypes, constraining a holistic view of performance capabilities. (2) The use of\nspecialized synthetic and private datasets introduces biases and hampers\ngeneralizability. (3) Ambiguous evaluation measures, often tied to custom\nnetworks or downstream tasks, hinder consistent and fair comparison.\n To overcome these limitations, we introduce \\textsf{TSGBench}, the inaugural\nTSG Benchmark, designed for a unified and comprehensive assessment of TSG\nmethods. It comprises three modules: (1) a curated collection of publicly\navailable, real-world datasets tailored for TSG, together with a standardized\npreprocessing pipeline; (2) a comprehensive evaluation measures suite including\nvanilla measures, new distance-based assessments, and visualization tools; (3)\na pioneering generalization test rooted in Domain Adaptation (DA), compatible\nwith all methods. We have conducted extensive experiments across ten real-world\ndatasets from diverse domains, utilizing ten advanced TSG methods and twelve\nevaluation measures, all gauged through \\textsf{TSGBench}. The results\nhighlight its remarkable efficacy and consistency. More importantly,\n\\textsf{TSGBench} delivers a statistical breakdown of method rankings,\nilluminating performance variations across different datasets and measures, and\noffering nuanced insights into the effectiveness of each method.\n","authors":["Yihao Ang","Qiang Huang","Yifan Bao","Anthony K. H. Tung","Zhiyong Huang"],"pdf_url":"https://arxiv.org/pdf/2309.03755v1.pdf","comment":"14 pages, 8 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2309.03754v1","updated":"2023-09-07T14:50:31Z","published":"2023-09-07T14:50:31Z","title":"Convergence Analysis of Decentralized ASGD","summary":" Over the last decades, Stochastic Gradient Descent (SGD) has been intensively\nstudied by the Machine Learning community. Despite its versatility and\nexcellent performance, the optimization of large models via SGD still is a\ntime-consuming task. To reduce training time, it is common to distribute the\ntraining process across multiple devices. Recently, it has been shown that the\nconvergence of asynchronous SGD (ASGD) will always be faster than mini-batch\nSGD. 
However, despite these improvements in the theoretical bounds, most ASGD\nconvergence-rate proofs still rely on a centralized parameter server, which is\nprone to become a bottleneck when scaling out the gradient computations across\nmany distributed processes.\n In this paper, we present a novel convergence-rate analysis for decentralized\nand asynchronous SGD (DASGD) which does not require partial synchronization\namong nodes nor restrictive network topologies. Specifically, we provide a\nbound of $\\mathcal{O}(\\sigma\\epsilon^{-2}) +\n\\mathcal{O}(QS_{avg}\\epsilon^{-3/2}) + \\mathcal{O}(S_{avg}\\epsilon^{-1})$ for\nthe convergence rate of DASGD, where $S_{avg}$ is the average staleness between\nmodels, $Q$ is a constant that bounds the norm of the gradients, and $\\epsilon$\nis a (small) error that is allowed within the bound. Furthermore, when\ngradients are not bounded, we prove the convergence rate of DASGD to be\n$\\mathcal{O}(\\sigma\\epsilon^{-2}) +\n\\mathcal{O}(\\sqrt{\\hat{S}_{avg}\\hat{S}_{max}}\\epsilon^{-1})$, with\n$\\hat{S}_{max}$ and $\\hat{S}_{avg}$ representing a loose version of the average\nand maximum staleness, respectively. Our convergence proof holds for a fixed\nstepsize and any non-convex, homogeneous, and L-smooth objective function. We\nanticipate that our results will be of high relevance for the adoption of DASGD\nby a broad community of researchers and developers.\n","authors":["Mauro DL Tosi","Martin Theobald"],"pdf_url":"https://arxiv.org/pdf/2309.03754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03751v1","updated":"2023-09-07T14:46:48Z","published":"2023-09-07T14:46:48Z","title":"Medoid Silhouette clustering with automatic cluster number selection","summary":" The evaluation of clustering results is difficult, highly dependent on the\nevaluated data set and the perspective of the beholder. There are many\ndifferent clustering quality measures, which try to provide a general measure\nto validate clustering results. A very popular measure is the Silhouette. We\ndiscuss the efficient medoid-based variant of the Silhouette, perform a\ntheoretical analysis of its properties, provide two fast versions for the\ndirect optimization, and discuss the use to choose the optimal number of\nclusters. We combine ideas from the original Silhouette with the well-known PAM\nalgorithm and its latest improvements FasterPAM. One of the versions guarantees\nequal results to the original variant and provides a run speedup of $O(k^2)$.\nIn experiments on real data with 30000 samples and $k$=100, we observed a\n10464$\\times$ speedup compared to the original PAMMEDSIL algorithm.\nAdditionally, we provide a variant to choose the optimal number of clusters\ndirectly.\n","authors":["Lars Lenssen","Erich Schubert"],"pdf_url":"https://arxiv.org/pdf/2309.03751v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2209.12553"},{"id":"http://arxiv.org/abs/2309.03748v1","updated":"2023-09-07T14:43:17Z","published":"2023-09-07T14:43:17Z","title":"Enhancing Pipeline-Based Conversational Agents with Large Language\n Models","summary":" The latest advancements in AI and deep learning have led to a breakthrough in\nlarge language model (LLM)-based agents such as GPT-4. However, many commercial\nconversational agent development tools are pipeline-based and have limitations\nin holding a human-like conversation. 
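The Medoid Silhouette discussed above replaces the classical Silhouette's average intra- and inter-cluster distances with distances to medoids. Under that reading, a naive O(nk) evaluation can be sketched as below; the paper's fast and exact optimization variants are not reproduced.

```python
import numpy as np

def medoid_silhouette(D, medoids):
    """Average Medoid Silhouette of a clustering given by a set of medoids.

    D:       (n, n) pairwise distance matrix
    medoids: indices of the k chosen medoids
    For each point, a = distance to its nearest medoid, b = distance to the
    second-nearest medoid; the score is the mean of (b - a) / max(a, b).
    """
    dm = D[:, medoids]                      # (n, k) distances to the medoids
    order = np.argsort(dm, axis=1)
    a = dm[np.arange(len(D)), order[:, 0]]  # nearest medoid
    b = dm[np.arange(len(D)), order[:, 1]]  # second-nearest medoid
    denom = np.maximum(np.maximum(a, b), 1e-12)
    return float(np.mean((b - a) / denom))
```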
This paper investigates the capabilities\nof LLMs to enhance pipeline-based conversational agents during two phases: 1)\nin the design and development phase and 2) during operations. In 1) LLMs can\naid in generating training data, extracting entities and synonyms,\nlocalization, and persona design. In 2) LLMs can assist in contextualization,\nintent classification to prevent conversational breakdown and handle\nout-of-scope questions, auto-correcting utterances, rephrasing responses,\nformulating disambiguation questions, summarization, and enabling closed\nquestion-answering capabilities. We conducted informal experiments with GPT-4\nin the private banking domain to demonstrate the scenarios above with a\npractical example. Companies may be hesitant to replace their pipeline-based\nagents with LLMs entirely due to privacy concerns and the need for deep\nintegration within their existing ecosystems. A hybrid approach in which LLMs'\nare integrated into the pipeline-based agents allows them to save time and\ncosts of building and running agents by capitalizing on the capabilities of\nLLMs while retaining the integration and privacy safeguards of their existing\nsystems.\n","authors":["Mina Foosherian","Hendrik Purwins","Purna Rathnayake","Touhidul Alam","Rui Teimao","Klaus-Dieter Thoben"],"pdf_url":"https://arxiv.org/pdf/2309.03748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16215v3","updated":"2023-09-07T14:41:22Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,\ndespite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While, both standard video\ncodes and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v3.pdf","comment":"22 pages, 26 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.03731v1","updated":"2023-09-07T14:17:44Z","published":"2023-09-07T14:17:44Z","title":"Learning continuous-valued treatment effects through representation\n balancing","summary":" Estimating the effects of treatments with an associated dose on an instance's\noutcome, the \"dose response\", is relevant in a variety of domains, from\nhealthcare to business, economics, and beyond. 
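The hybrid approach described above keeps the pipeline-based agent in charge and calls an LLM only where the pipeline struggles, e.g. low-confidence or out-of-scope turns. The routing sketch below is purely illustrative: `intent_classifier`, `pipeline_agent`, `llm_answer`, and the confidence threshold are assumptions, not components named in the paper.

```python
def handle_turn(utterance, intent_classifier, pipeline_agent, llm_answer, threshold=0.6):
    """Route a user turn: pipeline agent for confident intents, LLM fallback otherwise.

    All components are hypothetical callables:
      intent_classifier(utterance) -> (intent, confidence)
      pipeline_agent(intent, utterance) -> reply
      llm_answer(utterance) -> reply for out-of-scope / low-confidence turns
    """
    intent, confidence = intent_classifier(utterance)
    if confidence >= threshold:
        return pipeline_agent(intent, utterance)
    # low confidence: treat as out-of-scope and let the LLM keep the conversation going
    return llm_answer(utterance)
```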
Such effects, also known as\ncontinuous-valued treatment effects, are typically estimated from observational\ndata, which may be subject to dose selection bias. This means that the\nallocation of doses depends on pre-treatment covariates. Previous studies have\nshown that conventional machine learning approaches fail to learn accurate\nindividual estimates of dose responses under the presence of dose selection\nbias. In this work, we propose CBRNet, a causal machine learning approach to\nestimate an individual dose response from observational data. CBRNet adopts the\nNeyman-Rubin potential outcome framework and extends the concept of balanced\nrepresentation learning for overcoming selection bias to continuous-valued\ntreatments. Our work is the first to apply representation balancing in a\ncontinuous-valued treatment setting. We evaluate our method on a newly proposed\nbenchmark. Our experiments demonstrate CBRNet's ability to accurately learn\ntreatment effects under selection bias and competitive performance with respect\nto other state-of-the-art methods.\n","authors":["Christopher Bockel-Rickermann","Toon Vanderschueren","Jeroen Berrevoets","Tim Verdonck","Wouter Verbeke"],"pdf_url":"https://arxiv.org/pdf/2309.03731v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.03730v1","updated":"2023-09-07T14:14:30Z","published":"2023-09-07T14:14:30Z","title":"A Causal Perspective on Loan Pricing: Investigating the Impacts of\n Selection Bias on Identifying Bid-Response Functions","summary":" In lending, where prices are specific to both customers and products, having\na well-functioning personalized pricing policy in place is essential to\neffective business making. Typically, such a policy must be derived from\nobservational data, which introduces several challenges. While the problem of\n``endogeneity'' is prominently studied in the established pricing literature,\nthe problem of selection bias (or, more precisely, bid selection bias) is not.\nWe take a step towards understanding the effects of selection bias by posing\npricing as a problem of causal inference. Specifically, we consider the\nreaction of a customer to price a treatment effect. In our experiments, we\nsimulate varying levels of selection bias on a semi-synthetic dataset on\nmortgage loan applications in Belgium. We investigate the potential of\nparametric and nonparametric methods for the identification of individual\nbid-response functions. Our results illustrate how conventional methods such as\nlogistic regression and neural networks suffer adversely from selection bias.\nIn contrast, we implement state-of-the-art methods from causal machine learning\nand show their capability to overcome selection bias in pricing data.\n","authors":["Christopher Bockel-Rickermann","Sam Verboven","Tim Verdonck","Wouter Verbeke"],"pdf_url":"https://arxiv.org/pdf/2309.03730v1.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2202.09671v4","updated":"2023-09-07T14:08:07Z","published":"2022-02-19T20:18:49Z","title":"Truncated Diffusion Probabilistic Models and Diffusion-based Adversarial\n Auto-Encoders","summary":" Employing a forward diffusion chain to gradually map the data to a noise\ndistribution, diffusion-based generative models learn how to generate the data\nby inferring a reverse diffusion chain. However, this approach is slow and\ncostly because it needs many forward and reverse steps. 
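CBRNet is described above as extending balanced representation learning to continuous-valued treatments; its actual objective is not reproduced here. The general idea of penalizing distributional differences between representations of differently dosed units can, however, be illustrated with a kernel MMD penalty between two dose groups (binning doses into "low" and "high" is a simplification made only for this sketch).

```python
import numpy as np

def rbf_mmd2(X, Y, sigma=1.0):
    """Squared MMD between two samples of representations under an RBF kernel."""
    def k(A, B):
        d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return np.exp(-d2 / (2 * sigma ** 2))
    return k(X, X).mean() + k(Y, Y).mean() - 2 * k(X, Y).mean()

# hypothetical usage: encoder outputs for low-dose vs high-dose units
# phi_low, phi_high: arrays of shape (n_low, d) and (n_high, d)
# balance_penalty = rbf_mmd2(phi_low, phi_high)
```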
We propose a faster and\ncheaper approach that adds noise not until the data become pure random noise,\nbut until they reach a hidden noisy data distribution that we can confidently\nlearn. Then, we use fewer reverse steps to generate data by starting from this\nhidden distribution that is made similar to the noisy data. We reveal that the\nproposed model can be cast as an adversarial auto-encoder empowered by both the\ndiffusion process and a learnable implicit prior. Experimental results show\neven with a significantly smaller number of reverse diffusion steps, the\nproposed truncated diffusion probabilistic models can provide consistent\nimprovements over the non-truncated ones in terms of performance in both\nunconditional and text-guided image generations.\n","authors":["Huangjie Zheng","Pengcheng He","Weizhu Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2202.09671v4.pdf","comment":"ICLR 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2309.03720v1","updated":"2023-09-07T13:52:20Z","published":"2023-09-07T13:52:20Z","title":"A Natural Gas Consumption Forecasting System for Continual Learning\n Scenarios based on Hoeffding Trees with Change Point Detection Mechanism","summary":" Forecasting natural gas consumption, considering seasonality and trends, is\ncrucial in planning its supply and consumption and optimizing the cost of\nobtaining it, mainly by industrial entities. However, in times of threats to\nits supply, it is also a critical element that guarantees the supply of this\nraw material to meet individual consumers' needs, ensuring society's energy\nsecurity. This article introduces a novel multistep ahead forecasting of\nnatural gas consumption with change point detection integration for model\ncollection selection with continual learning capabilities using data stream\nprocessing. The performance of the forecasting models based on the proposed\napproach is evaluated in a complex real-world use case of natural gas\nconsumption forecasting. We employed Hoeffding tree predictors as forecasting\nmodels and the Pruned Exact Linear Time (PELT) algorithm for the change point\ndetection procedure. The change point detection integration enables selecting a\ndifferent model collection for successive time frames. Thus, three model\ncollection selection procedures (with and without an error feedback loop) are\ndefined and evaluated for forecasting scenarios with various densities of\ndetected change points. These models were compared with change point agnostic\nbaseline approaches. Our experiments show that fewer change points result in a\nlower forecasting error regardless of the model collection selection procedure\nemployed. Also, simpler model collection selection procedures omitting\nforecasting error feedback leads to more robust forecasting models suitable for\ncontinual learning tasks.\n","authors":["Radek Svoboda","Sebastian Basterrech","Jędrzej Kozal","Jan Platoš","Michał Woźniak"],"pdf_url":"https://arxiv.org/pdf/2309.03720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02428v2","updated":"2023-09-07T13:42:57Z","published":"2023-09-05T17:56:22Z","title":"Enhancing Deep Learning Models through Tensorization: A Comprehensive\n Survey and Framework","summary":" The burgeoning growth of public domain data and the increasing complexity of\ndeep learning model architectures have underscored the need for more efficient\ndata representation and analysis techniques. 
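The truncated diffusion abstract above stops the forward chain at an intermediate, still-structured noise level rather than at pure Gaussian noise. The standard closed-form forward marginal q(x_t | x_0), sampled only up to a truncation step, is sketched below; the schedule values are illustrative, and the learnable implicit prior is not shown.

```python
import numpy as np

T, T_trunc = 1000, 400                      # full vs truncated number of forward steps
betas = np.linspace(1e-4, 2e-2, T)          # a common linear noise schedule (illustrative)
alpha_bar = np.cumprod(1.0 - betas)

def q_sample(x0, t, rng=np.random):
    """Sample x_t ~ q(x_t | x_0) = N(sqrt(abar_t) * x_0, (1 - abar_t) * I)."""
    noise = rng.standard_normal(x0.shape)
    return np.sqrt(alpha_bar[t]) * x0 + np.sqrt(1.0 - alpha_bar[t]) * noise

# Truncated training pairs: only diffuse as far as T_trunc, so reverse generation
# can start from this intermediate distribution instead of pure noise.
x0 = np.random.rand(8, 32, 32)
x_trunc = q_sample(x0, T_trunc - 1)
```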
This paper is motivated by the\nwork of Helal (2023) and aims to present a comprehensive overview of\ntensorization. This transformative approach bridges the gap between the\ninherently multidimensional nature of data and the simplified 2-dimensional\nmatrices commonly used in linear algebra-based machine learning algorithms.\nThis paper explores the steps involved in tensorization, multidimensional data\nsources, various multiway analysis methods employed, and the benefits of these\napproaches. A small example of Blind Source Separation (BSS) is presented\ncomparing 2-dimensional algorithms and a multiway algorithm in Python. Results\nindicate that multiway analysis is more expressive. Contrary to the intuition\nof the dimensionality curse, utilising multidimensional datasets in their\nnative form and applying multiway analysis methods grounded in multilinear\nalgebra reveal a profound capacity to capture intricate interrelationships\namong various dimensions while, surprisingly, reducing the number of model\nparameters and accelerating processing. A survey of the multi-away analysis\nmethods and integration with various Deep Neural Networks models is presented\nusing case studies in different domains.\n","authors":["Manal Helal"],"pdf_url":"https://arxiv.org/pdf/2309.02428v2.pdf","comment":"30 pages, 8 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.15223v2","updated":"2023-09-07T13:39:02Z","published":"2023-08-29T11:24:12Z","title":"Evaluating Explanation Methods for Multivariate Time Series\n Classification","summary":" Multivariate time series classification is an important computational task\narising in applications where data is recorded over time and over multiple\nchannels. For example, a smartwatch can record the acceleration and orientation\nof a person's motion, and these signals are recorded as multivariate time\nseries. We can classify this data to understand and predict human movement and\nvarious properties such as fitness levels. In many applications classification\nalone is not enough, we often need to classify but also understand what the\nmodel learns (e.g., why was a prediction given, based on what information in\nthe data). The main focus of this paper is on analysing and evaluating\nexplanation methods tailored to Multivariate Time Series Classification (MTSC).\nWe focus on saliency-based explanation methods that can point out the most\nrelevant channels and time series points for the classification decision. We\nanalyse two popular and accurate multivariate time series classifiers, ROCKET\nand dResNet, as well as two popular explanation methods, SHAP and dCAM. We\nstudy these methods on 3 synthetic datasets and 2 real-world datasets and\nprovide a quantitative and qualitative analysis of the explanations provided.\nWe find that flattening the multivariate datasets by concatenating the channels\nworks as well as using multivariate classifiers directly and adaptations of\nSHAP for MTSC work quite well. 
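The tensorization survey above contrasts flattened 2-D matrices with data kept in its native multiway form. A minimal sketch of reshaping flattened observations back into a tensor and computing a mode-n unfolding with plain numpy is shown below; the shapes are illustrative and the unfolding convention is one of several in use.

```python
import numpy as np

# e.g. 50 trials recorded over 8 channels x 128 time samples, stored flattened
flat = np.random.rand(50, 8 * 128)
tensor = flat.reshape(50, 8, 128)            # native multiway form: trial x channel x time

def unfold(T, mode):
    """Mode-n unfolding: move `mode` to the front and flatten the remaining axes."""
    return np.moveaxis(T, mode, 0).reshape(T.shape[mode], -1)

channel_unfolding = unfold(tensor, 1)        # shape (8, 50*128), input to multiway methods
```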
Additionally, we also find that the popular\nsynthetic datasets we used are not suitable for time series analysis.\n","authors":["Davide Italo Serramazza","Thu Trang Nguyen","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2308.15223v2.pdf","comment":"Accepted at AALTD '23 (8th International Workshop on Advanced\n Analytics and Learning on Temporal Data, ECMLPKDD 2023)"},{"id":"http://arxiv.org/abs/2309.03710v1","updated":"2023-09-07T13:38:36Z","published":"2023-09-07T13:38:36Z","title":"A State Representation for Diminishing Rewards","summary":" A common setting in multitask reinforcement learning (RL) demands that an\nagent rapidly adapt to various stationary reward functions randomly sampled\nfrom a fixed distribution. In such situations, the successor representation\n(SR) is a popular framework which supports rapid policy evaluation by\ndecoupling a policy's expected discounted, cumulative state occupancies from a\nspecific reward function. However, in the natural world, sequential tasks are\nrarely independent, and instead reflect shifting priorities based on the\navailability and subjective perception of rewarding stimuli. Reflecting this\ndisjunction, in this paper we study the phenomenon of diminishing marginal\nutility and introduce a novel state representation, the $\\lambda$\nrepresentation ($\\lambda$R) which, surprisingly, is required for policy\nevaluation in this setting and which generalizes the SR as well as several\nother state representations from the literature. We establish the $\\lambda$R's\nformal properties and examine its normative advantages in the context of\nmachine learning, as well as its usefulness for studying natural behaviors,\nparticularly foraging.\n","authors":["Ted Moskovitz","Samo Hromadka","Ahmed Touati","Diana Borsa","Maneesh Sahani"],"pdf_url":"https://arxiv.org/pdf/2309.03710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03708v1","updated":"2023-09-07T13:36:03Z","published":"2023-09-07T13:36:03Z","title":"Chat Failures and Troubles: Reasons and Solutions","summary":" This paper examines some common problems in Human-Robot Interaction (HRI)\ncausing failures and troubles in Chat. A given use case's design decisions\nstart with the suitable robot, the suitable chatting model, identifying common\nproblems that cause failures, identifying potential solutions, and planning\ncontinuous improvement. In conclusion, it is recommended to use a closed-loop\ncontrol algorithm that guides the use of trained Artificial Intelligence (AI)\npre-trained models and provides vocabulary filtering, re-train batched models\non new datasets, learn online from data streams, and/or use reinforcement\nlearning models to self-update the trained models and reduce errors.\n","authors":["Manal Helal","Patrick Holthaus","Gabriella Lakatos","Farshid Amirabdollahian"],"pdf_url":"https://arxiv.org/pdf/2309.03708v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2309.03707v1","updated":"2023-09-07T13:34:20Z","published":"2023-09-07T13:34:20Z","title":"A Probabilistic Semi-Supervised Approach with Triplet Markov Chains","summary":" Triplet Markov chains are general generative models for sequential data which\ntake into account three kinds of random variables: (noisy) observations, their\nassociated discrete labels and latent variables which aim at strengthening the\ndistribution of the observations and their associated labels. 
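The λ-representation above is introduced as a generalization of the successor representation (SR). For orientation, the standard tabular SR temporal-difference update that it generalizes looks like the sketch below; the λR itself modifies this to account for diminishing marginal utility and is not reproduced here.

```python
import numpy as np

def sr_td_update(M, s, s_next, alpha=0.1, gamma=0.95):
    """One TD update of a tabular successor representation M (n_states x n_states).

    M[s, s'] estimates the expected discounted number of future visits to s'
    when starting from s and following the current policy.
    """
    n = M.shape[0]
    one_hot = np.eye(n)[s]
    M[s] += alpha * (one_hot + gamma * M[s_next] - M[s])
    return M
```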
However, in\npractice, we do not have at our disposal all the labels associated to the\nobservations to estimate the parameters of such models. In this paper, we\npropose a general framework based on a variational Bayesian inference to train\nparameterized triplet Markov chain models in a semi-supervised context. The\ngenerality of our approach enables us to derive semi-supervised algorithms for\na variety of generative models for sequential Bayesian classification.\n","authors":["Katherine Morales","Yohan Petetin"],"pdf_url":"https://arxiv.org/pdf/2309.03707v1.pdf","comment":"Preprint submitted to IEEE MLSP 2023"},{"id":"http://arxiv.org/abs/2309.03702v1","updated":"2023-09-07T13:28:36Z","published":"2023-09-07T13:28:36Z","title":"DiffDefense: Defending against Adversarial Attacks via Diffusion Models","summary":" This paper presents a novel reconstruction method that leverages Diffusion\nModels to protect machine learning classifiers against adversarial attacks, all\nwithout requiring any modifications to the classifiers themselves. The\nsusceptibility of machine learning models to minor input perturbations renders\nthem vulnerable to adversarial attacks. While diffusion-based methods are\ntypically disregarded for adversarial defense due to their slow reverse\nprocess, this paper demonstrates that our proposed method offers robustness\nagainst adversarial threats while preserving clean accuracy, speed, and\nplug-and-play compatibility. Code at:\nhttps://github.com/HondamunigePrasannaSilva/DiffDefence.\n","authors":["Hondamunige Prasanna Silva","Lorenzo Seidenari","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2309.03702v1.pdf","comment":"Paper published at ICIAP23"},{"id":"http://arxiv.org/abs/2010.10274v2","updated":"2023-09-07T13:27:29Z","published":"2020-10-20T13:45:47Z","title":"Graph Fairing Convolutional Networks for Anomaly Detection","summary":" Graph convolution is a fundamental building block for many deep neural\nnetworks on graph-structured data. In this paper, we introduce a simple, yet\nvery effective graph convolutional network with skip connections for\nsemi-supervised anomaly detection. The proposed layerwise propagation rule of\nour model is theoretically motivated by the concept of implicit fairing in\ngeometry processing, and comprises a graph convolution module for aggregating\ninformation from immediate node neighbors and a skip connection module for\ncombining layer-wise neighborhood representations. This propagation rule is\nderived from the iterative solution of the implicit fairing equation via the\nJacobi method. In addition to capturing information from distant graph nodes\nthrough skip connections between the network's layers, our approach exploits\nboth the graph structure and node features for learning discriminative node\nrepresentations. These skip connections are integrated by design in our\nproposed network architecture. The effectiveness of our model is demonstrated\nthrough extensive experiments on five benchmark datasets, achieving better or\ncomparable anomaly detection results against strong baseline methods. We also\ndemonstrate through an ablation study that skip connection helps improve the\nmodel performance.\n","authors":["Mahsa Mesgaran","A. 
Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2010.10274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08272v2","updated":"2023-09-07T13:21:40Z","published":"2023-02-16T13:04:59Z","title":"Revisiting Hidden Representations in Transfer Learning for Medical\n Imaging","summary":" While a key component to the success of deep learning is the availability of\nmassive amounts of training data, medical image datasets are often limited in\ndiversity and size. Transfer learning has the potential to bridge the gap\nbetween related yet different domains. For medical applications, however, it\nremains unclear whether it is more beneficial to pre-train on natural or\nmedical images. We aim to shed light on this problem by comparing\ninitialization on ImageNet and RadImageNet on seven medical classification\ntasks. Our work includes a replication study, which yields results contrary to\npreviously published findings. In our experiments, ResNet50 models pre-trained\non ImageNet tend to outperform those trained on RadImageNet. To gain further\ninsights, we investigate the learned representations using Canonical\nCorrelation Analysis (CCA) and compare the predictions of the different models.\nOur results indicate that, contrary to intuition, ImageNet and RadImageNet may\nconverge to distinct intermediate representations, which appear to diverge\nfurther during fine-tuning. Despite these distinct representations, the\npredictions of the models remain similar. Our findings show that the similarity\nbetween networks before and after fine-tuning does not correlate with\nperformance gains, suggesting that the advantages of transfer learning might\nnot solely originate from the reuse of features in the early layers of a\nconvolutional neural network.\n","authors":["Dovile Juodelyte","Amelia Jiménez-Sánchez","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2302.08272v2.pdf","comment":"Submitted to TMLR"},{"id":"http://arxiv.org/abs/2309.03694v1","updated":"2023-09-07T13:06:52Z","published":"2023-09-07T13:06:52Z","title":"Short-Term Load Forecasting Using A Particle-Swarm Optimized Multi-Head\n Attention-Augmented CNN-LSTM Network","summary":" Short-term load forecasting is of paramount importance in the efficient\noperation and planning of power systems, given its inherent non-linear and\ndynamic nature. Recent strides in deep learning have shown promise in\naddressing this challenge. However, these methods often grapple with\nhyperparameter sensitivity, opaqueness in interpretability, and high\ncomputational overhead for real-time deployment. In this paper, I propose a\nnovel solution that surmounts these obstacles. Our approach harnesses the power\nof the Particle-Swarm Optimization algorithm to autonomously explore and\noptimize hyperparameters, a Multi-Head Attention mechanism to discern the\nsalient features crucial for accurate forecasting, and a streamlined framework\nfor computational efficiency. Our method undergoes rigorous evaluation using a\ngenuine electricity demand dataset. The results underscore its superiority in\nterms of accuracy, robustness, and computational efficiency. 
Notably, our Mean\nAbsolute Percentage Error of 1.9376 marks a significant advancement over\nexisting state-of-the-art approaches, heralding a new era in short-term load\nforecasting.\n","authors":["Paapa Kwesi Quansah"],"pdf_url":"https://arxiv.org/pdf/2309.03694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16360v2","updated":"2023-09-07T13:06:17Z","published":"2023-08-30T23:26:33Z","title":"Emoji Promotes Developer Participation and Issue Resolution on GitHub","summary":" Although remote working is increasingly adopted during the pandemic, many are\nconcerned by the low-efficiency in the remote working. Missing in text-based\ncommunication are non-verbal cues such as facial expressions and body language,\nwhich hinders the effective communication and negatively impacts the work\noutcomes. Prevalent on social media platforms, emojis, as alternative\nnon-verbal cues, are gaining popularity in the virtual workspaces well. In this\npaper, we study how emoji usage influences developer participation and issue\nresolution in virtual workspaces. To this end, we collect GitHub issues for a\none-year period and apply causal inference techniques to measure the causal\neffect of emojis on the outcome of issues, controlling for confounders such as\nissue content, repository, and author information. We find that emojis can\nsignificantly reduce the resolution time of issues and attract more user\nparticipation. We also compare the heterogeneous effect on different types of\nissues. These findings deepen our understanding of the developer communities,\nand they provide design implications on how to facilitate interactions and\nbroaden developer participation.\n","authors":["Yuhang Zhou","Xuan Lu","Ge Gao","Qiaozhu Mei","Wei Ai"],"pdf_url":"https://arxiv.org/pdf/2308.16360v2.pdf","comment":"12 pages, 5 figures. To be published in the 18th International AAAI\n Conference on Web and Social Media (ICWSM 2024)"},{"id":"http://arxiv.org/abs/2306.06145v2","updated":"2023-09-07T12:56:49Z","published":"2023-06-09T10:34:18Z","title":"LDMRes-Net: Enabling Efficient Medical Image Segmentation on IoT and\n Edge Platforms","summary":" In this study, we propose LDMRes-Net, a lightweight dual-multiscale residual\nblock-based computational neural network tailored for medical image\nsegmentation on IoT and edge platforms. Conventional U-Net-based models face\nchallenges in meeting the speed and efficiency demands of real-time clinical\napplications, such as disease monitoring, radiation therapy, and image-guided\nsurgery. LDMRes-Net overcomes these limitations with its remarkably low number\nof learnable parameters (0.072M), making it highly suitable for\nresource-constrained devices. The model's key innovation lies in its dual\nmulti-residual block architecture, which enables the extraction of refined\nfeatures on multiple scales, enhancing overall segmentation performance. To\nfurther optimize efficiency, the number of filters is carefully selected to\nprevent overlap, reduce training time, and improve computational efficiency.\nThe study includes comprehensive evaluations, focusing on segmentation of the\nretinal image of vessels and hard exudates crucial for the diagnosis and\ntreatment of ophthalmology. The results demonstrate the robustness,\ngeneralizability, and high segmentation accuracy of LDMRes-Net, positioning it\nas an efficient tool for accurate and rapid medical image segmentation in\ndiverse clinical applications, particularly on IoT and edge platforms. 
Such\nadvances hold significant promise for improving healthcare outcomes and\nenabling real-time medical image analysis in resource-limited settings.\n","authors":["Shahzaib Iqbal","Tariq M. Khan","Syed S. Naqvi","Muhammad Usman","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2306.06145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00452v2","updated":"2023-09-07T12:22:28Z","published":"2023-08-01T11:05:13Z","title":"A Majority Invariant Approach to Patch Robustness Certification for Deep\n Learning Models","summary":" Patch robustness certification ensures no patch within a given bound on a\nsample can manipulate a deep learning model to predict a different label.\nHowever, existing techniques cannot certify samples that cannot meet their\nstrict bars at the classifier or patch region levels. This paper proposes\nMajorCert. MajorCert firstly finds all possible label sets manipulatable by the\nsame patch region on the same sample across the underlying classifiers, then\nenumerates their combinations element-wise, and finally checks whether the\nmajority invariant of all these combinations is intact to certify samples.\n","authors":["Qilin Zhou","Zhengyuan Wei","Haipeng Wang","W. K. Chan"],"pdf_url":"https://arxiv.org/pdf/2308.00452v2.pdf","comment":"5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track"},{"id":"http://arxiv.org/abs/2309.03672v1","updated":"2023-09-07T12:21:22Z","published":"2023-09-07T12:21:22Z","title":"A computationally lightweight safe learning algorithm","summary":" Safety is an essential asset when learning control policies for physical\nsystems, as violating safety constraints during training can lead to expensive\nhardware damage. In response to this need, the field of safe learning has\nemerged with algorithms that can provide probabilistic safety guarantees\nwithout knowledge of the underlying system dynamics. Those algorithms often\nrely on Gaussian process inference. Unfortunately, Gaussian process inference\nscales cubically with the number of data points, limiting applicability to\nhigh-dimensional and embedded systems. In this paper, we propose a safe\nlearning algorithm that provides probabilistic safety guarantees but leverages\nthe Nadaraya-Watson estimator instead of Gaussian processes. For the\nNadaraya-Watson estimator, we can reach logarithmic scaling with the number of\ndata points. We provide theoretical guarantees for the estimates, embed them\ninto a safe learning algorithm, and show numerical experiments on a simulated\nseven-degrees-of-freedom robot manipulator.\n","authors":["Dominik Baumann","Krzysztof Kowalczyk","Koen Tiels","Paweł Wachel"],"pdf_url":"https://arxiv.org/pdf/2309.03672v1.pdf","comment":"Accepted final version to appear in: Proc. of the IEEE Conference on\n Decision and Control"},{"id":"http://arxiv.org/abs/2309.03671v1","updated":"2023-09-07T12:19:51Z","published":"2023-09-07T12:19:51Z","title":"Dataset Generation and Bonobo Classification from Weakly Labelled Videos","summary":" This paper presents a bonobo detection and classification pipeline built from\nthe commonly used machine learning methods. Such application is motivated by\nthe need to test bonobos in their enclosure using touch screen devices without\nhuman assistance. This work introduces a newly acquired dataset based on bonobo\nrecordings generated semi-automatically. The recordings are weakly labelled and\nfed to a macaque detector in order to spatially detect the individual present\nin the video. 
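The safe-learning abstract above replaces Gaussian-process inference with the Nadaraya-Watson estimator. The estimator itself is the classical kernel-weighted average sketched below; the paper's safety guarantees and its logarithmic-scaling implementation are not shown.

```python
import numpy as np

def nadaraya_watson(X_train, y_train, x_query, bandwidth=0.5):
    """Nadaraya-Watson kernel regression estimate at x_query with a Gaussian kernel."""
    d2 = ((X_train - x_query) ** 2).sum(axis=1)
    w = np.exp(-d2 / (2 * bandwidth ** 2))
    return float(w @ y_train / np.maximum(w.sum(), 1e-12))
```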
Handcrafted features coupled with different classification\nalgorithms and deep-learning methods using a ResNet architecture are\ninvestigated for bonobo identification. Performance is compared in terms of\nclassification accuracy on the splits of the database using different data\nseparation methods. We demonstrate the importance of data preparation and how a\nwrong data separation can lead to false good results. Finally, after a\nmeaningful separation of the data, the best classification performance is\nobtained using a fine-tuned ResNet model and reaches 75% of accuracy.\n","authors":["Pierre-Etienne Martin"],"pdf_url":"https://arxiv.org/pdf/2309.03671v1.pdf","comment":"IntelliSys 2023 paper"},{"id":"http://arxiv.org/abs/2309.03665v1","updated":"2023-09-07T12:02:00Z","published":"2023-09-07T12:02:00Z","title":"How adversarial attacks can disrupt seemingly stable accurate\n classifiers","summary":" Adversarial attacks dramatically change the output of an otherwise accurate\nlearning system using a seemingly inconsequential modification to a piece of\ninput data. Paradoxically, empirical evidence indicates that even systems which\nare robust to large random perturbations of the input data remain susceptible\nto small, easily constructed, adversarial perturbations of their inputs. Here,\nwe show that this may be seen as a fundamental feature of classifiers working\nwith high dimensional input data. We introduce a simple generic and\ngeneralisable framework for which key behaviours observed in practical systems\narise with high probability -- notably the simultaneous susceptibility of the\n(otherwise accurate) model to easily constructed adversarial attacks, and\nrobustness to random perturbations of the input data. We confirm that the same\nphenomena are directly observed in practical neural networks trained on\nstandard image classification problems, where even large additive random noise\nfails to trigger the adversarial instability of the network. A surprising\ntakeaway is that even small margins separating a classifier's decision surface\nfrom training and testing data can hide adversarial susceptibility from being\ndetected using randomly sampled perturbations. Counterintuitively, using\nadditive noise during training or testing is therefore inefficient for\neradicating or detecting adversarial examples, and more demanding adversarial\ntraining is required.\n","authors":["Oliver J. Sutton","Qinghua Zhou","Ivan Y. Tyukin","Alexander N. Gorban","Alexander Bastounis","Desmond J. Higham"],"pdf_url":"https://arxiv.org/pdf/2309.03665v1.pdf","comment":"11 pages, 8 figures, additional supplementary materials"},{"id":"http://arxiv.org/abs/2308.00904v2","updated":"2023-09-07T12:01:57Z","published":"2023-08-02T01:44:30Z","title":"VLUCI: Variational Learning of Unobserved Confounders for Counterfactual\n Inference","summary":" Causal inference plays a vital role in diverse domains like epidemiology,\nhealthcare, and economics. De-confounding and counterfactual prediction in\nobservational data has emerged as a prominent concern in causal inference\nresearch. While existing models tackle observed confounders, the presence of\nunobserved confounders remains a significant challenge, distorting causal\ninference and impacting counterfactual outcome accuracy. To address this, we\npropose a novel variational learning model of unobserved confounders for\ncounterfactual inference (VLUCI), which generates the posterior distribution of\nunobserved confounders. 
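The adversarial-stability abstract above contrasts susceptibility to tiny, easily constructed perturbations with robustness to much larger random noise. A generic FGSM-style probe of that phenomenon for any differentiable classifier is sketched below; it is an illustration of the effect, not the paper's theoretical framework.

```python
import torch

def fgsm_perturbation(model, x, y, eps):
    """One-step FGSM perturbation of input x for a classifier under cross-entropy loss."""
    x = x.clone().detach().requires_grad_(True)
    loss = torch.nn.functional.cross_entropy(model(x), y)
    loss.backward()
    return (x + eps * x.grad.sign()).detach()

def random_perturbation(x, eps):
    """Uniform random perturbation of the same maximum size, for comparison."""
    return x + torch.empty_like(x).uniform_(-eps, eps)
```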
VLUCI relaxes the unconfoundedness assumption often\noverlooked by most causal inference methods. By disentangling observed and\nunobserved confounders, VLUCI constructs a doubly variational inference model\nto approximate the distribution of unobserved confounders, which are used for\ninferring more accurate counterfactual outcomes. Extensive experiments on\nsynthetic and semi-synthetic datasets demonstrate VLUCI's superior performance\nin inferring unobserved confounders. It is compatible with state-of-the-art\ncounterfactual inference models, significantly improving inference accuracy at\nboth group and individual levels. Additionally, VLUCI provides confidence\nintervals for counterfactual outcomes, aiding decision-making in risk-sensitive\ndomains. We further clarify the considerations when applying VLUCI to cases\nwhere unobserved confounders don't strictly conform to our model assumptions\nusing the public IHDP dataset as an example, highlighting the practical\nadvantages of VLUCI.\n","authors":["Yonghe Zhao","Qiang Huang","Siwei Wu","Yun Peng","Huiyan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.00904v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.03664v1","updated":"2023-09-07T12:01:01Z","published":"2023-09-07T12:01:01Z","title":"Alzheimer Disease Detection from Raman Spectroscopy of the Cerebrospinal\n Fluid via Topological Machine Learning","summary":" The cerebrospinal fluid (CSF) of 19 subjects who received a clinical\ndiagnosis of Alzheimer's disease (AD) as well as of 5 pathological controls\nhave been collected and analysed by Raman spectroscopy (RS). We investigated\nwhether the raw and preprocessed Raman spectra could be used to distinguish AD\nfrom controls. First, we applied standard Machine Learning (ML) methods\nobtaining unsatisfactory results. Then, we applied ML to a set of topological\ndescriptors extracted from raw spectra, achieving a very good classification\naccuracy (>87%). Although our results are preliminary, they indicate that RS\nand topological analysis together may provide an effective combination to\nconfirm or disprove a clinical diagnosis of AD. The next steps will include\nenlarging the dataset of CSF samples to validate the proposed method better\nand, possibly, to understand if topological data analysis could support the\ncharacterization of AD subtypes.\n","authors":["Francesco Conti","Martina Banchelli","Valentina Bessi","Cristina Cecchi","Fabrizio Chiti","Sara Colantonio","Cristiano D'Andrea","Marella de Angelis","Davide Moroni","Benedetta Nacmias","Maria Antonietta Pascali","Sandro Sorbi","Paolo Matteini"],"pdf_url":"https://arxiv.org/pdf/2309.03664v1.pdf","comment":"Accepter for inclusion in AITA 2023 (http://aita.isti.cnr.it/)"},{"id":"http://arxiv.org/abs/2309.03659v1","updated":"2023-09-07T11:56:23Z","published":"2023-09-07T11:56:23Z","title":"Towards Comparable Knowledge Distillation in Semantic Image Segmentation","summary":" Knowledge Distillation (KD) is one proposed solution to large model sizes and\nslow inference speed in semantic segmentation. In our research we identify 25\nproposed distillation loss terms from 14 publications in the last 4 years.\nUnfortunately, a comparison of terms based on published results is often\nimpossible, because of differences in training configurations. 
A good\nillustration of this problem is the comparison of two publications from 2022.\nUsing the same models and dataset, Structural and Statistical Texture\nDistillation (SSTKD) reports an increase of student mIoU of 4.54 and a final\nperformance of 29.19, while Adaptive Perspective Distillation (APD) only\nimproves student performance by 2.06 percentage points, but achieves a final\nperformance of 39.25. The reason for such extreme differences is often a\nsuboptimal choice of hyperparameters and a resulting underperformance of the\nstudent model used as reference point. In our work, we reveal problems of\ninsufficient hyperparameter tuning by showing that distillation improvements of\ntwo widely accepted frameworks, SKD and IFVD, vanish when hyperparameters are\noptimized sufficiently. To improve comparability of future research in the\nfield, we establish a solid baseline for three datasets and two student models\nand provide extensive information on hyperparameter tuning. We find that only\ntwo out of eight techniques can compete with our simple baseline on the ADE20K\ndataset.\n","authors":["Onno Niemann","Christopher Vox","Thorben Werner"],"pdf_url":"https://arxiv.org/pdf/2309.03659v1.pdf","comment":"Accepted by the ECML PKDD 2023 workshop track: Simplification,\n Compression, Efficiency, and Frugality for Artificial Intelligence (SCEFA).\n This preprint has not undergone peer review or any post-submission\n improvements or corrections"},{"id":"http://arxiv.org/abs/2308.13280v2","updated":"2023-09-07T11:46:17Z","published":"2023-08-25T10:02:26Z","title":"AtmoRep: A stochastic model of atmosphere dynamics using large scale\n representation learning","summary":" The atmosphere affects humans in a multitude of ways, from loss of life due\nto adverse weather effects to long-term social and economic impacts on\nsocieties. Computer simulations of atmospheric dynamics are, therefore, of\ngreat importance for the well-being of our and future generations. Here, we\npropose AtmoRep, a novel, task-independent stochastic computer model of\natmospheric dynamics that can provide skillful results for a wide range of\napplications. AtmoRep uses large-scale representation learning from artificial\nintelligence to determine a general description of the highly complex,\nstochastic dynamics of the atmosphere from the best available estimate of the\nsystem's historical trajectory as constrained by observations. This is enabled\nby a novel self-supervised learning objective and a unique ensemble that\nsamples from the stochastic model with a variability informed by the one in the\nhistorical record. The task-independent nature of AtmoRep enables skillful\nresults for a diverse set of applications without specifically training for\nthem and we demonstrate this for nowcasting, temporal interpolation, model\ncorrection, and counterfactuals. We also show that AtmoRep can be improved with\nadditional data, for example radar observations, and that it can be extended to\ntasks such as downscaling. 
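The comparability study above concerns how knowledge-distillation results in semantic segmentation are reported. For context, a generic logit-distillation loss commonly used as a starting point in this literature (a temperature-scaled KL term to the teacher mixed with the usual cross-entropy) is sketched below; the specific SSTKD, APD, SKD, and IFVD loss terms are not reproduced.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.5):
    """Generic KD loss: alpha * CE(student, labels) + (1 - alpha) * T^2 * KL(teacher || student).

    For segmentation, logits would be per-pixel, e.g. (N, C, H, W) with targets (N, H, W).
    """
    ce = F.cross_entropy(student_logits, targets)
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    return alpha * ce + (1.0 - alpha) * kd
```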
Our work establishes that large-scale neural\nnetworks can provide skillful, task-independent models of atmospheric dynamics.\nWith this, they provide a novel means to make the large record of atmospheric\nobservations accessible for applications and for scientific inquiry,\ncomplementing existing simulations based on first principles.\n","authors":["Christian Lessig","Ilaria Luise","Bing Gong","Michael Langguth","Scarlet Stadler","Martin Schultz"],"pdf_url":"https://arxiv.org/pdf/2308.13280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16900v2","updated":"2023-09-07T11:41:52Z","published":"2023-08-31T17:58:28Z","title":"Learning to Taste: A Multimodal Wine Dataset","summary":" We present WineSensed, a large multimodal wine dataset for studying the\nrelations between visual perception, language, and flavor. The dataset\nencompasses 897k images of wine labels and 824k reviews of wines curated from\nthe Vivino platform. It has over 350k unique vintages, annotated with year,\nregion, rating, alcohol percentage, price, and grape composition. We obtained\nfine-grained flavor annotations on a subset by conducting a wine-tasting\nexperiment with 256 participants who were asked to rank wines based on their\nsimilarity in flavor, resulting in more than 5k pairwise flavor distances. We\npropose a low-dimensional concept embedding algorithm that combines human\nexperience with automatic machine similarity kernels. We demonstrate that this\nshared concept embedding space improves upon separate embedding spaces for\ncoarse flavor classification (alcohol percentage, country, grape, price,\nrating) and aligns with the intricate human perception of flavor.\n","authors":["Thoranna Bender","Simon Moe Sørensen","Alireza Kashani","K. Eldjarn Hjorleifsson","Grethe Hyldig","Søren Hauberg","Serge Belongie","Frederik Warburg"],"pdf_url":"https://arxiv.org/pdf/2308.16900v2.pdf","comment":"Corrected a typo in author name"},{"id":"http://arxiv.org/abs/2309.03648v1","updated":"2023-09-07T11:29:16Z","published":"2023-09-07T11:29:16Z","title":"Characterizing Lipschitz Stability of GNN for Fairness","summary":" The Lipschitz bound, a technique from robust statistics, can limit the\nmaximum changes in the output concerning the input, taking into account\nassociated irrelevant biased factors. It is an efficient and provable method\nfor examining the output stability of machine learning models without incurring\nadditional computation costs. Recently, Graph Neural Networks (GNNs), which\noperate on non-Euclidean data, have gained significant attention. However, no\nprevious research has investigated the GNN Lipschitz bounds to shed light on\nstabilizing model outputs, especially when working on non-Euclidean data with\ninherent biases. Given the inherent biases in common graph data used for GNN\ntraining, it poses a serious challenge to constraining the GNN output\nperturbations induced by input biases, thereby safeguarding fairness during\ntraining. Recently, despite the Lipschitz constant's use in controlling the\nstability of Euclideanneural networks, the calculation of the precise Lipschitz\nconstant remains elusive for non-Euclidean neural networks like GNNs,\nespecially within fairness contexts. 
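The GNN fairness abstract above builds on Lipschitz bounds for constraining output changes. For a plain feed-forward stack, the usual (often loose) upper bound is the product of the layers' spectral norms, sketched below; the paper's bound for GNNs on attributed graphs is more involved and is not reproduced here.

```python
import numpy as np

def feedforward_lipschitz_upper_bound(weight_matrices):
    """Upper bound on the Lipschitz constant of a stack of linear layers with
    1-Lipschitz activations (e.g. ReLU): the product of spectral norms ||W_l||_2."""
    return float(np.prod([np.linalg.norm(W, 2) for W in weight_matrices]))
```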
To narrow this gap, we begin with the\ngeneral GNNs operating on an attributed graph, and formulate a Lipschitz bound\nto limit the changes in the output regarding biases associated with the input.\nAdditionally, we theoretically analyze how the Lipschitz constant of a GNN\nmodel could constrain the output perturbations induced by biases learned from\ndata for fairness training. We experimentally validate the Lipschitz bound's\neffectiveness in limiting biases of the model output. Finally, from a training\ndynamics perspective, we demonstrate why the theoretical Lipschitz bound can\neffectively guide the GNN training to better trade-off between accuracy and\nfairness.\n","authors":["Yaning Jia","Chunhui Zhang","Jundong Li","Chuxu Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.03648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07853v2","updated":"2023-09-07T11:04:27Z","published":"2023-03-14T12:46:52Z","title":"ReFit: A Framework for Refinement of Weakly Supervised Semantic\n Segmentation using Object Border Fitting for Medical Images","summary":" Weakly Supervised Semantic Segmentation (WSSS) relying only on image-level\nsupervision is a promising approach to deal with the need for Segmentation\nnetworks, especially for generating a large number of pixel-wise masks in a\ngiven dataset. However, most state-of-the-art image-level WSSS techniques lack\nan understanding of the geometric features embedded in the images since the\nnetwork cannot derive any object boundary information from just image-level\nlabels. We define a boundary here as the line separating an object and its\nbackground, or two different objects. To address this drawback, we are\nproposing our novel ReFit framework, which deploys state-of-the-art class\nactivation maps combined with various post-processing techniques in order to\nachieve fine-grained higher-accuracy segmentation masks. To achieve this, we\ninvestigate a state-of-the-art unsupervised segmentation network that can be\nused to construct a boundary map, which enables ReFit to predict object\nlocations with sharper boundaries. By applying our method to WSSS predictions,\nwe achieved up to 10% improvement over the current state-of-the-art WSSS\nmethods for medical imaging. 
The framework is open-source, to ensure that our\nresults are reproducible, and accessible online at\nhttps://github.com/bharathprabakaran/ReFit.\n","authors":["Bharath Srinivas Prabakaran","Erik Ostrowski","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2303.07853v2.pdf","comment":"Accepted for Publication at the International Symposium on Visual\n Computing (ISVC), October 2023, Lake Tahoe, NV, USA"},{"id":"http://arxiv.org/abs/2309.03631v1","updated":"2023-09-07T10:54:06Z","published":"2023-09-07T10:54:06Z","title":"Insights Into the Inner Workings of Transformer Models for Protein\n Function Prediction","summary":" Motivation: We explored how explainable AI (XAI) can help to shed light into\nthe inner workings of neural networks for protein function prediction, by\nextending the widely used XAI method of integrated gradients such that latent\nrepresentations inside of transformer models, which were finetuned to Gene\nOntology term and Enzyme Commission number prediction, can be inspected too.\nResults: The approach enabled us to identify amino acids in the sequences that\nthe transformers pay particular attention to, and to show that these relevant\nsequence parts reflect expectations from biology and chemistry, both in the\nembedding layer and inside of the model, where we identified transformer heads\nwith a statistically significant correspondence of attribution maps with ground\ntruth sequence annotations (e.g., transmembrane regions, active sites) across\nmany proteins. Availability and Implementation: Source code can be accessed at\nhttps://github.com/markuswenzel/xai-proteins .\n","authors":["Markus Wenzel","Erik Grüner","Nils Strodthoff"],"pdf_url":"https://arxiv.org/pdf/2309.03631v1.pdf","comment":"20 pages, 9 figures, 4 tables, source code available at\n https://github.com/markuswenzel/xai-proteins"},{"id":"http://arxiv.org/abs/2306.12760v2","updated":"2023-09-07T10:30:10Z","published":"2023-06-22T09:34:55Z","title":"Blended-NeRF: Zero-Shot Object Generation and Blending in Existing\n Neural Radiance Fields","summary":" Editing a local region or a specific object in a 3D scene represented by a\nNeRF or consistently blending a new realistic object into the scene is\nchallenging, mainly due to the implicit nature of the scene representation. We\npresent Blended-NeRF, a robust and flexible framework for editing a specific\nregion of interest in an existing NeRF scene, based on text prompts, along with\na 3D ROI box. Our method leverages a pretrained language-image model to steer\nthe synthesis towards a user-provided text prompt, along with a 3D MLP model\ninitialized on an existing NeRF scene to generate the object and blend it into\na specified region in the original scene. We allow local editing by localizing\na 3D ROI box in the input scene, and blend the content synthesized inside the\nROI with the existing scene using a novel volumetric blending technique. To\nobtain natural looking and view-consistent results, we leverage existing and\nnew geometric priors and 3D augmentations for improving the visual fidelity of\nthe final result. We test our framework both qualitatively and quantitatively\non a variety of real 3D scenes and text prompts, demonstrating realistic\nmulti-view consistent results with much flexibility and diversity compared to\nthe baselines. 
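The protein-function XAI abstract above extends integrated gradients to latent representations inside the transformer. The standard input-level formulation it starts from is sketched below: gradients along a straight-line path from a baseline, averaged over interpolation steps; the extension to latent layers is not shown.

```python
import torch

def integrated_gradients(model, x, baseline, target_class, steps=50):
    """Standard integrated gradients: (x - baseline) * average gradient along the path."""
    total_grad = torch.zeros_like(x)
    for alpha in torch.linspace(1.0 / steps, 1.0, steps):
        point = (baseline + alpha * (x - baseline)).detach().requires_grad_(True)
        score = model(point)[:, target_class].sum()
        score.backward()
        total_grad += point.grad
    return (x - baseline) * total_grad / steps
```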
Finally, we show the applicability of our framework for several\n3D editing applications, including adding new objects to a scene,\nremoving/replacing/altering existing objects, and texture conversion.\n","authors":["Ori Gordon","Omri Avrahami","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2306.12760v2.pdf","comment":"16 pages, 14 figures. Project page:\n https://www.vision.huji.ac.il/blended-nerf/"},{"id":"http://arxiv.org/abs/2309.03619v1","updated":"2023-09-07T10:23:59Z","published":"2023-09-07T10:23:59Z","title":"Understanding Self-Supervised Learning of Speech Representation via\n Invariance and Redundancy Reduction","summary":" The choice of the objective function is crucial in emerging high-quality\nrepresentations from self-supervised learning. This paper investigates how\ndifferent formulations of the Barlow Twins (BT) objective impact downstream\ntask performance for speech data. We propose Modified Barlow Twins (MBT) with\nnormalized latents to enforce scale-invariance and evaluate on speaker\nidentification, gender recognition and keyword spotting tasks. Our results show\nMBT improves representation generalization over original BT, especially when\nfine-tuning with limited target data. This highlights the importance of\ndesigning objectives that encourage invariant and transferable representations.\nOur analysis provides insights into how the BT learning objective can be\ntailored to produce speech representations that excel when adapted to new\ndownstream tasks. This study is an important step towards developing reusable\nself-supervised speech representations.\n","authors":["Yusuf Brima","Ulf Krumnack","Simone Pika","Gunther Heidemann"],"pdf_url":"https://arxiv.org/pdf/2309.03619v1.pdf","comment":"6 pages, 1 figure, in submission to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.03616v1","updated":"2023-09-07T10:18:36Z","published":"2023-09-07T10:18:36Z","title":"Filtration Surfaces for Dynamic Graph Classification","summary":" Existing approaches for classifying dynamic graphs either lift graph kernels\nto the temporal domain, or use graph neural networks (GNNs). However, current\nbaselines have scalability issues, cannot handle a changing node set, or do not\ntake edge weight information into account. We propose filtration surfaces, a\nnovel method that is scalable and flexible, to alleviate said restrictions. We\nexperimentally validate the efficacy of our model and show that filtration\nsurfaces outperform previous state-of-the-art baselines on datasets that rely\non edge weight information. Our method does so while being either completely\nparameter-free or having at most one parameter, and yielding the lowest overall\nstandard deviation.\n","authors":["Franz Srambical","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2309.03616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03607v1","updated":"2023-09-07T10:02:59Z","published":"2023-09-07T10:02:59Z","title":"Your Battery Is a Blast! Safeguarding Against Counterfeit Batteries with\n Authentication","summary":" Lithium-ion (Li-ion) batteries are the primary power source in various\napplications due to their high energy and power density. Their market was\nestimated to be up to 48 billion U.S. dollars in 2022. However, the widespread\nadoption of Li-ion batteries has resulted in counterfeit cell production, which\ncan pose safety hazards to users. Counterfeit cells can cause explosions or\nfires, and their prevalence in the market makes it difficult for users to\ndetect fake cells. 
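The speech-representation study above modifies the Barlow Twins objective. The original BT loss, an invariance term on the diagonal of the cross-correlation matrix of two embedding views plus a redundancy-reduction term on its off-diagonal, is sketched below; the proposed MBT normalization of the latents is only indicated as an assumption in the comments.

```python
import torch

def barlow_twins_loss(z1, z2, lam=5e-3):
    """Barlow Twins: push the cross-correlation of two views towards the identity.

    z1, z2: (batch, dim) embeddings of two augmented views of the same inputs.
    MBT is described as additionally normalizing the latents for scale-invariance
    (assumed here to mean a similar standardization step).
    """
    n, d = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)   # batch-standardized embeddings
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    c = (z1.T @ z2) / n                           # (dim, dim) cross-correlation matrix
    invariance = ((torch.diagonal(c) - 1.0) ** 2).sum()
    off_diag = c - torch.diag(torch.diagonal(c))
    redundancy = (off_diag ** 2).sum()
    return invariance + lam * redundancy
```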
Indeed, current battery authentication methods can be\nsusceptible to advanced counterfeiting techniques and are often not adaptable\nto various cells and systems. In this paper, we improve the state of the art on\nbattery authentication by proposing two novel methodologies, DCAuth and\nEISthentication, which leverage the internal characteristics of each cell\nthrough Machine Learning models. Our methods automatically authenticate\nlithium-ion battery models and architectures using data from their regular\nusage without the need for any external device. They are also resilient to the\nmost common and critical counterfeit practices and can scale to several\nbatteries and devices. To evaluate the effectiveness of our proposed\nmethodologies, we analyze time-series data from a total of 20 datasets that we\nhave processed to extract meaningful features for our analysis. Our methods\nachieve high accuracy in battery authentication for both architectures (up to\n0.99) and models (up to 0.96). Moreover, our methods offer comparable\nidentification performances. By using our proposed methodologies, manufacturers\ncan ensure that devices only use legitimate batteries, guaranteeing the\noperational state of any system and safety measures for the users.\n","authors":["Francesco Marchiori","Mauro Conti"],"pdf_url":"https://arxiv.org/pdf/2309.03607v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.03581v1","updated":"2023-09-07T09:22:05Z","published":"2023-09-07T09:22:05Z","title":"Interactive Hyperparameter Optimization in Multi-Objective Problems via\n Preference Learning","summary":" Hyperparameter optimization (HPO) is important to leverage the full potential\nof machine learning (ML). In practice, users are often interested in\nmulti-objective (MO) problems, i.e., optimizing potentially conflicting\nobjectives, like accuracy and energy consumption. To tackle this, the vast\nmajority of MO-ML algorithms return a Pareto front of non-dominated machine\nlearning models to the user. Optimizing the hyperparameters of such algorithms\nis non-trivial as evaluating a hyperparameter configuration entails evaluating\nthe quality of the resulting Pareto front. In literature, there are known\nindicators that assess the quality of a Pareto front (e.g., hypervolume, R2) by\nquantifying different properties (e.g., volume, proximity to a reference\npoint). However, choosing the indicator that leads to the desired Pareto front\nmight be a hard task for a user. In this paper, we propose a human-centered\ninteractive HPO approach tailored towards multi-objective ML leveraging\npreference learning to extract desiderata from users that guide the\noptimization. Instead of relying on the user guessing the most suitable\nindicator for their needs, our approach automatically learns an appropriate\nindicator. Concretely, we leverage pairwise comparisons of distinct Pareto\nfronts to learn such an appropriate quality indicator. Then, we optimize the\nhyperparameters of the underlying MO-ML algorithm towards this learned\nindicator using a state-of-the-art HPO approach. 
In an experimental study\ntargeting the environmental impact of ML, we demonstrate that our approach\nleads to substantially better Pareto fronts compared to optimizing based on a\nwrong indicator pre-selected by the user, and performs comparable in the case\nof an advanced user knowing which indicator to pick.\n","authors":["Joseph Giovanelli","Alexander Tornede","Tanja Tornede","Marius Lindauer"],"pdf_url":"https://arxiv.org/pdf/2309.03581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03579v1","updated":"2023-09-07T09:18:12Z","published":"2023-09-07T09:18:12Z","title":"DTW+S: Shape-based Comparison of Time-series with Ordered Local Trend","summary":" Measuring distance or similarity between time-series data is a fundamental\naspect of many applications including classification and clustering. Existing\nmeasures may fail to capture similarities due to local trends (shapes) and may\neven produce misleading results. Our goal is to develop a measure that looks\nfor similar trends occurring around similar times and is easily interpretable\nfor researchers in applied domains. This is particularly useful for\napplications where time-series have a sequence of meaningful local trends that\nare ordered, such as in epidemics (a surge to an increase to a peak to a\ndecrease). We propose a novel measure, DTW+S, which creates an interpretable\n\"closeness-preserving\" matrix representation of the time-series, where each\ncolumn represents local trends, and then it applies Dynamic Time Warping to\ncompute distances between these matrices. We present a theoretical analysis\nthat supports the choice of this representation. We demonstrate the utility of\nDTW+S in ensemble building and clustering of epidemic curves. We also\ndemonstrate that our approach results in better classification compared to\nDynamic Time Warping for a class of datasets, particularly when local trends\nrather than scale play a decisive role.\n","authors":["Ajitesh Srivastava"],"pdf_url":"https://arxiv.org/pdf/2309.03579v1.pdf","comment":"11 pages, 13 figures"},{"id":"http://arxiv.org/abs/2304.09479v3","updated":"2023-09-07T09:08:01Z","published":"2023-04-19T08:03:20Z","title":"DiFaReli: Diffusion Face Relighting","summary":" We present a novel approach to single-view face relighting in the wild.\nHandling non-diffuse effects, such as global illumination or cast shadows, has\nlong been a challenge in face relighting. Prior work often assumes Lambertian\nsurfaces, simplified lighting models or involves estimating 3D shape, albedo,\nor a shadow map. This estimation, however, is error-prone and requires many\ntraining examples with lighting ground truth to generalize well. Our work\nbypasses the need for accurate estimation of intrinsic components and can be\ntrained solely on 2D images without any light stage data, multi-view images, or\nlighting ground truth. Our key idea is to leverage a conditional diffusion\nimplicit model (DDIM) for decoding a disentangled light encoding along with\nother encodings related to 3D shape and facial identity inferred from\noff-the-shelf estimators. We also propose a novel conditioning technique that\neases the modeling of the complex interaction between light and geometry by\nusing a rendered shading reference to spatially modulate the DDIM. We achieve\nstate-of-the-art performance on standard benchmark Multi-PIE and can\nphotorealistically relight in-the-wild images. 
Please visit our page:\nhttps://diffusion-face-relighting.github.io\n","authors":["Puntawat Ponglertnapakorn","Nontawat Tritrong","Supasorn Suwajanakorn"],"pdf_url":"https://arxiv.org/pdf/2304.09479v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.03569v1","updated":"2023-09-07T08:58:41Z","published":"2023-09-07T08:58:41Z","title":"Sparse Federated Training of Object Detection in the Internet of\n Vehicles","summary":" As an essential component of the Intelligent Transportation System\n(ITS), the Internet of Vehicles (IoV) plays a vital role in alleviating traffic\nissues. Object detection is one of the key technologies in the IoV, which has\nbeen widely used to provide traffic management services by analyzing timely and\nsensitive vehicle-related information. However, the current object detection\nmethods are mostly based on centralized deep training, that is, the sensitive\ndata obtained by edge devices need to be uploaded to the server, which raises\nprivacy concerns. To mitigate such privacy leakage, we first propose a\nfederated learning-based framework, where well-trained local models are shared\nwith the central server. However, since edge devices usually have limited\ncomputing power, plus a strict requirement of low latency in IoVs, we further\npropose a sparse training process on edge devices, which can effectively\nlighten the model, and ensure its training efficiency on edge devices, thereby\nreducing communication overheads. In addition, due to the diverse computing\ncapabilities and dynamic environment, different sparsity rates are applied to\nedge devices. To further guarantee the performance, we propose FedWeg, an\nimproved aggregation scheme based on FedAvg, which is designed using the inverse\nratio of sparsity rates. Experiments on a real-life dataset using YOLO show\nthat the proposed scheme can achieve the required object detection rate while\nsaving considerable communication costs.\n","authors":["Luping Rao","Chuan Ma","Ming Ding","Yuwen Qian","Lu Zhou","Zhe Liu"],"pdf_url":"https://arxiv.org/pdf/2309.03569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16193v3","updated":"2023-09-07T08:57:10Z","published":"2022-10-27T16:59:39Z","title":"M3FGM:a node masking and multi-granularity message passing-based\n federated graph model for spatial-temporal data prediction","summary":" Researchers are solving the challenges of spatial-temporal prediction by\ncombining Federated Learning (FL) and graph models with respect to the\nconstraints of privacy and security. In order to make better use of the power of\ngraph models, some research also combines split learning (SL). However, there are\nstill several issues left unattended: 1) Clients might not be able to access\nthe server during the inference phase; 2) The graph of clients designed manually in\nthe server model may not reveal the proper relationship between clients. This\npaper proposes a new GNN-oriented split federated learning method, named node\n{\\bfseries M}asking and {\\bfseries M}ulti-granularity {\\bfseries M}essage\npassing-based Federated Graph Model (M$^3$FGM) for the above issues. For the\nfirst issue, the server model of M$^3$FGM employs a MaskNode layer to simulate\nthe case of clients being offline. We also redesign the decoder of the client\nmodel using a dual-sub-decoder structure so that each client model can use its\nlocal data to predict independently when offline. 
As for the second issue, a\nnew GNN layer named Multi-Granularity Message Passing (MGMP) layer enables each\nclient node to perceive global and local information. We conducted extensive\nexperiments in two different scenarios on two real traffic datasets. Results\nshow that M$^3$FGM outperforms the baselines and variant models, achieves the\nbest results in both datasets and scenarios.\n","authors":["Yuxing Tian","Zheng Liu","Yanwen Qu","Song Li","Jiachi Luo"],"pdf_url":"https://arxiv.org/pdf/2210.16193v3.pdf","comment":"Accepted by ICONIP2023"},{"id":"http://arxiv.org/abs/2309.03564v1","updated":"2023-09-07T08:50:46Z","published":"2023-09-07T08:50:46Z","title":"Evaluating the Efficacy of Supervised Learning vs Large Language Models\n for Identifying Cognitive Distortions and Suicidal Risks in Chinese Social\n Media","summary":" Large language models, particularly those akin to the rapidly progressing GPT\nseries, are gaining traction for their expansive influence. While there is keen\ninterest in their applicability within medical domains such as psychology,\ntangible explorations on real-world data remain scant. Concurrently, users on\nsocial media platforms are increasingly vocalizing personal sentiments; under\nspecific thematic umbrellas, these sentiments often manifest as negative\nemotions, sometimes escalating to suicidal inclinations. Timely discernment of\nsuch cognitive distortions and suicidal risks is crucial to effectively\nintervene and potentially avert dire circumstances. Our study ventured into\nthis realm by experimenting on two pivotal tasks: suicidal risk and cognitive\ndistortion identification on Chinese social media platforms. Using supervised\nlearning as a baseline, we examined and contrasted the efficacy of large\nlanguage models via three distinct strategies: zero-shot, few-shot, and\nfine-tuning. Our findings revealed a discernible performance gap between the\nlarge language models and traditional supervised learning approaches, primarily\nattributed to the models' inability to fully grasp subtle categories. Notably,\nwhile GPT-4 outperforms its counterparts in multiple scenarios, GPT-3.5 shows\nsignificant enhancement in suicide risk classification after fine-tuning. To\nour knowledge, this investigation stands as the maiden attempt at gauging large\nlanguage models on Chinese social media tasks. This study underscores the\nforward-looking and transformative implications of using large language models\nin the field of psychology. It lays the groundwork for future applications in\npsychological research and practice.\n","authors":["Hongzhi Qi","Qing Zhao","Changwei Song","Wei Zhai","Dan Luo","Shuo Liu","Yi Jing Yu","Fan Wang","Huijing Zou","Bing Xiang Yang","Jianqiang Li","Guanghui Fu"],"pdf_url":"https://arxiv.org/pdf/2309.03564v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.03561v1","updated":"2023-09-07T08:44:25Z","published":"2023-09-07T08:44:25Z","title":"Trinary Decision Trees for missing value handling","summary":" This paper introduces the Trinary decision tree, an algorithm designed to\nimprove the handling of missing data in decision tree regressors and\nclassifiers. Unlike other approaches, the Trinary decision tree does not assume\nthat missing values contain any information about the response. 
Both\ntheoretical calculations on estimator bias and numerical illustrations using\nreal data sets are presented to compare its performance with established\nalgorithms in different missing data scenarios (Missing Completely at Random\n(MCAR), and Informative Missingness (IM)). Notably, the Trinary tree\noutperforms its peers in MCAR settings, especially when data is only missing\nout-of-sample, while lacking behind in IM settings. A hybrid model, the\nTrinaryMIA tree, which combines the Trinary tree and the Missing In Attributes\n(MIA) approach, shows robust performance in all types of missingness. Despite\nthe potential drawback of slower training speed, the Trinary tree offers a\npromising and more accurate method of handling missing data in decision tree\nalgorithms.\n","authors":["Henning Zakrisson"],"pdf_url":"https://arxiv.org/pdf/2309.03561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03557v1","updated":"2023-09-07T08:39:53Z","published":"2023-09-07T08:39:53Z","title":"On the dynamics of multi agent nonlinear filtering and learning","summary":" Multiagent systems aim to accomplish highly complex learning tasks through\ndecentralised consensus seeking dynamics and their use has garnered a great\ndeal of attention in the signal processing and computational intelligence\nsocieties. This article examines the behaviour of multiagent networked systems\nwith nonlinear filtering/learning dynamics. To this end, a general formulation\nfor the actions of an agent in multiagent networked systems is presented and\nconditions for achieving a cohesive learning behaviour is given. Importantly,\napplication of the so derived framework in distributed and federated learning\nscenarios are presented.\n","authors":["Sayed Pouria Talebi","Danilo Mandic"],"pdf_url":"https://arxiv.org/pdf/2309.03557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2002.01444v5","updated":"2023-09-07T08:32:50Z","published":"2020-02-04T18:08:49Z","title":"Proper Learning of Linear Dynamical Systems as a Non-Commutative\n Polynomial Optimisation Problem","summary":" There has been much recent progress in forecasting the next observation of a\nlinear dynamical system (LDS), which is known as the improper learning, as well\nas in the estimation of its system matrices, which is known as the proper\nlearning of LDS. We present an approach to proper learning of LDS, which in\nspite of the non-convexity of the problem, guarantees global convergence of\nnumerical solutions to a least-squares estimator. We present promising\ncomputational results.\n","authors":["Quan Zhou","Jakub Marecek"],"pdf_url":"https://arxiv.org/pdf/2002.01444v5.pdf","comment":"14 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.03190v2","updated":"2023-09-07T08:28:29Z","published":"2023-09-06T17:53:31Z","title":"Blink: Link Local Differential Privacy in Graph Neural Networks via\n Bayesian Estimation","summary":" Graph neural networks (GNNs) have gained an increasing amount of popularity\ndue to their superior capability in learning node embeddings for various graph\ninference tasks, but training them can raise privacy concerns. To address this,\nwe propose using link local differential privacy over decentralized nodes,\nenabling collaboration with an untrusted server to train GNNs without revealing\nthe existence of any link. 
Our approach spends the privacy budget separately on\nlinks and degrees of the graph for the server to better denoise the graph\ntopology using Bayesian estimation, alleviating the negative impact of LDP on\nthe accuracy of the trained GNNs. We bound the mean absolute error of the\ninferred link probabilities against the ground truth graph topology. We then\npropose two variants of our LDP mechanism complementing each other in different\nprivacy settings, one of which estimates fewer links under lower privacy\nbudgets to avoid false positive link estimates when the uncertainty is high,\nwhile the other utilizes more information and performs better given relatively\nhigher privacy budgets. Furthermore, we propose a hybrid variant that combines\nboth strategies and is able to perform better across different privacy budgets.\nExtensive experiments show that our approach outperforms existing methods in\nterms of accuracy under varying privacy budgets.\n","authors":["Xiaochen Zhu","Vincent Y. F. Tan","Xiaokui Xiao"],"pdf_url":"https://arxiv.org/pdf/2309.03190v2.pdf","comment":"17 pages, accepted by ACM CCS 2023 as a conference paper"},{"id":"http://arxiv.org/abs/2306.07019v2","updated":"2023-09-07T08:25:19Z","published":"2023-06-12T10:46:31Z","title":"Dynamic Causal Graph Convolutional Network for Traffic Prediction","summary":" Modeling complex spatiotemporal dependencies in correlated traffic series is\nessential for traffic prediction. While recent works have shown improved\nprediction performance by using neural networks to extract spatiotemporal\ncorrelations, their effectiveness depends on the quality of the graph\nstructures used to represent the spatial topology of the traffic network. In\nthis work, we propose a novel approach for traffic prediction that embeds\ntime-varying dynamic Bayesian network to capture the fine spatiotemporal\ntopology of traffic data. We then use graph convolutional networks to generate\ntraffic forecasts. To enable our method to efficiently model nonlinear traffic\npropagation patterns, we develop a deep learning-based module as a\nhyper-network to generate stepwise dynamic causal graphs. Our experimental\nresults on a real traffic dataset demonstrate the superior prediction\nperformance of the proposed method. The code is available at\nhttps://github.com/MonBG/DCGCN.\n","authors":["Junpeng Lin","Ziyue Li","Zhishuai Li","Lei Bai","Rui Zhao","Chen Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.07019v2.pdf","comment":"Accepted to IEEE CASE 2023; Peter Luh Best Memorial Award for Young\n Researcher (Finalist)"},{"id":"http://arxiv.org/abs/2309.03544v1","updated":"2023-09-07T08:02:57Z","published":"2023-09-07T08:02:57Z","title":"MVD:A Novel Methodology and Dataset for Acoustic Vehicle Type\n Classification","summary":" Rising urban populations have led to a surge in vehicle use and made traffic\nmonitoring and management indispensable. Acoustic traffic monitoring (ATM)\noffers a cost-effective and efficient alternative to more computationally\nexpensive methods of monitoring traffic such as those involving computer vision\ntechnologies. In this paper, we present MVD and MVDA: two open datasets for the\ndevelopment of acoustic traffic monitoring and vehicle-type classification\nalgorithms, which contain audio recordings of moving vehicles. 
The datasets\ncontain four classes: Trucks, Cars, Motorbikes, and a No-vehicle class.\nAdditionally, we propose a novel and efficient way to accurately classify these\nacoustic signals using cepstrum- and spectrum-based local and global audio\nfeatures, and a multi-input neural network. Experimental results show that our\nmethodology improves upon the established baselines of previous works and\nachieves an accuracy of 91.98% and 96.66% on the MVD and MVDA datasets,\nrespectively. Finally, the proposed model was deployed through an Android\napplication to make it accessible for testing and demonstrate its efficacy.\n","authors":["Mohd Ashhad","Omar Ahmed","Sooraj K. Ambat","Zeeshan Ali Haq","Mansaf Alam"],"pdf_url":"https://arxiv.org/pdf/2309.03544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03537v1","updated":"2023-09-07T07:49:43Z","published":"2023-09-07T07:49:43Z","title":"Subgraph-based Tight Frames on Graphs with Compact Supports and\n Vanishing Moments","summary":" In this work, we propose a novel and general method to construct tight\nframes on graphs with compact supports based on a series of hierarchical\npartitions. Starting from our abstract construction that generalizes previous\nmethods based on partition trees, we are able to flexibly incorporate subgraph\nLaplacians into our design of graph frames. Consequently, our general methods\npermit adjusting the (subgraph) vanishing moments of the framelets and extra\nproperties, such as directionality, for efficiently representing graph signals\nwith path-like supports. Several variants are explicitly defined and tested.\nExperimental results show that our proposed graph frames perform superiorly in\nnon-linear approximation tasks.\n","authors":["Ruigang Zheng","Xiaosheng Zhuang"],"pdf_url":"https://arxiv.org/pdf/2309.03537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03535v1","updated":"2023-09-07T07:46:46Z","published":"2023-09-07T07:46:46Z","title":"Feature Enhancer Segmentation Network (FES-Net) for Vessel Segmentation","summary":" Diseases such as diabetic retinopathy and age-related macular degeneration\npose a significant risk to vision, highlighting the importance of precise\nsegmentation of retinal vessels for the tracking and diagnosis of progression.\nHowever, existing vessel segmentation methods that heavily rely on\nencoder-decoder structures struggle to capture contextual information about\nretinal vessel configurations, leading to challenges in reconciling semantic\ndisparities between encoder and decoder features. To address this, we propose a\nnovel feature enhancement segmentation network (FES-Net) that achieves accurate\npixel-wise segmentation without requiring additional image enhancement steps.\nFES-Net directly processes the input image and utilizes four prompt\nconvolutional blocks (PCBs) during downsampling, complemented by a shallow\nupsampling approach to generate a binary mask for each class. We evaluate the\nperformance of FES-Net on four publicly available state-of-the-art datasets:\nDRIVE, STARE, CHASE, and HRF. The evaluation results clearly demonstrate the\nsuperior performance of FES-Net compared to other competitive approaches\ndocumented in the existing literature.\n","authors":["Tariq M. 
Khan","Muhammad Arsalan","Shahzaib Iqbal","Imran Razzak","Erik Meijering"],"pdf_url":"https://arxiv.org/pdf/2309.03535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02335v2","updated":"2023-09-07T07:46:16Z","published":"2023-08-04T14:06:44Z","title":"RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph\n Classification","summary":" Graph classification is a crucial task in many real-world multimedia\napplications, where graphs can represent various multimedia data types such as\nimages, videos, and social networks. Previous efforts have applied graph neural\nnetworks (GNNs) in balanced situations where the class distribution is\nbalanced. However, real-world data typically exhibit long-tailed class\ndistributions, resulting in a bias towards the head classes when using GNNs and\nlimited generalization ability over the tail classes. Recent approaches mainly\nfocus on re-balancing different classes during model training, which fails to\nexplicitly introduce new knowledge and sacrifices the performance of the head\nclasses. To address these drawbacks, we propose a novel framework called\nRetrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature\nextractor and an unbiased classifier in a decoupled manner. In the feature\nextractor training stage, we develop a graph retrieval module to search for\nrelevant graphs that directly enrich the intra-class diversity for the tail\nclasses. Moreover, we innovatively optimize a category-centered supervised\ncontrastive loss to obtain discriminative representations, which is more\nsuitable for long-tailed scenarios. In the classifier fine-tuning stage, we\nbalance the classifier weights with two weight regularization techniques, i.e.,\nMax-norm and weight decay. Experiments on various popular benchmarks verify the\nsuperiority of the proposed method against state-of-the-art approaches.\n","authors":["Zhengyang Mao","Wei Ju","Yifang Qin","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.02335v2.pdf","comment":"Accepted by the ACM International Conference on Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2306.12774v2","updated":"2023-09-07T07:42:24Z","published":"2023-06-22T10:00:33Z","title":"Pure Exploration in Bandits with Linear Constraints","summary":" We address the problem of identifying the optimal policy with a fixed\nconfidence level in a multi-armed bandit setup, when \\emph{the arms are subject\nto linear constraints}. Unlike the standard best-arm identification problem\nwhich is well studied, the optimal policy in this case may not be deterministic\nand could mix between several arms. This changes the geometry of the problem\nwhich we characterize via an information-theoretic lower bound. We introduce\ntwo asymptotically optimal algorithms for this setting, one based on the\nTrack-and-Stop method and the other based on a game-theoretic approach. Both\nthese algorithms try to track an optimal allocation based on the lower bound\nand computed by a weighted projection onto the boundary of a normal cone.\nFinally, we provide empirical results that validate our bounds and visualize\nhow constraints change the hardness of the problem.\n","authors":["Emil Carlsson","Debabrota Basu","Fredrik D. 
Johansson","Devdatt Dubhashi"],"pdf_url":"https://arxiv.org/pdf/2306.12774v2.pdf","comment":"EWRL16"},{"id":"http://arxiv.org/abs/2309.03531v1","updated":"2023-09-07T07:26:27Z","published":"2023-09-07T07:26:27Z","title":"A Robust Negative Learning Approach to Partial Domain Adaptation Using\n Source Prototypes","summary":" This work proposes a robust Partial Domain Adaptation (PDA) framework that\nmitigates the negative transfer problem by incorporating a robust\ntarget-supervision strategy. It leverages ensemble learning and includes\ndiverse, complementary label feedback, alleviating the effect of incorrect\nfeedback and promoting pseudo-label refinement. Rather than relying exclusively\non first-order moments for distribution alignment, our approach offers explicit\nobjectives to optimize intra-class compactness and inter-class separation with\nthe inferred source prototypes and highly-confident target samples in a\ndomain-invariant fashion. Notably, we ensure source data privacy by eliminating\nthe need to access the source data during the adaptation phase through a priori\ninference of source prototypes. We conducted a series of comprehensive\nexperiments, including an ablation analysis, covering a range of partial domain\nadaptation tasks. Comprehensive evaluations on benchmark datasets corroborate\nour framework's enhanced robustness and generalization, demonstrating its\nsuperiority over existing state-of-the-art PDA approaches.\n","authors":["Sandipan Choudhuri","Suli Adeniye","Arunabha Sen"],"pdf_url":"https://arxiv.org/pdf/2309.03531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03530v1","updated":"2023-09-07T07:23:55Z","published":"2023-09-07T07:23:55Z","title":"Efficient Single Object Detection on Image Patches with Early Exit\n Enhanced High-Precision CNNs","summary":" This paper proposes a novel approach for detecting objects using mobile\nrobots in the context of the RoboCup Standard Platform League, with a primary\nfocus on detecting the ball. The challenge lies in detecting a dynamic object\nin varying lighting conditions and blurred images caused by fast movements. To\naddress this challenge, the paper presents a convolutional neural network\narchitecture designed specifically for computationally constrained robotic\nplatforms. The proposed CNN is trained to achieve high precision classification\nof single objects in image patches and to determine their precise spatial\npositions. The paper further integrates Early Exits into the existing\nhigh-precision CNN architecture to reduce the computational cost of easily\nrejectable cases in the background class. The training process involves a\ncomposite loss function based on confidence and positional losses with dynamic\nweighting and data augmentation. The proposed approach achieves a precision of\n100% on the validation dataset and a recall of almost 87%, while maintaining an\nexecution time of around 170 $\\mu$s per hypotheses. By combining the proposed\napproach with an Early Exit, a runtime optimization of more than 28%, on\naverage, can be achieved compared to the original CNN. 
Overall, this paper\nprovides an efficient solution for an enhanced detection of objects, especially\nthe ball, in computationally constrained robotic platforms.\n","authors":["Arne Moos"],"pdf_url":"https://arxiv.org/pdf/2309.03530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.03680v2","updated":"2023-09-07T07:11:59Z","published":"2022-08-07T09:02:18Z","title":"Accelerating Numerical Solvers for Large-Scale Simulation of Dynamical\n System via NeurVec","summary":" The large-scale simulation of dynamical systems is critical in numerous\nscientific and engineering disciplines. However, traditional numerical solvers\nare limited by the choice of step sizes when estimating integration, resulting\nin a trade-off between accuracy and computational efficiency. To address this\nchallenge, we introduce a deep learning-based corrector called Neural Vector\n(NeurVec), which can compensate for integration errors and enable larger time\nstep sizes in simulations. Our extensive experiments on a variety of complex\ndynamical system benchmarks demonstrate that NeurVec exhibits remarkable\ngeneralization capability on a continuous phase space, even when trained using\nlimited and discrete data. NeurVec significantly accelerates traditional\nsolvers, achieving speeds tens to hundreds of times faster while maintaining\nhigh levels of accuracy and stability. Moreover, NeurVec's simple-yet-effective\ndesign, combined with its ease of implementation, has the potential to\nestablish a new paradigm for fast-solving differential equations based on deep\nlearning.\n","authors":["Zhongzhan Huang","Senwei Liang","Hong Zhang","Haizhao Yang","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2208.03680v2.pdf","comment":"Accepted by Scientific Report"},{"id":"http://arxiv.org/abs/2307.09882v2","updated":"2023-09-07T06:56:25Z","published":"2023-07-19T10:26:29Z","title":"Adversarial Likelihood Estimation With One-Way Flows","summary":" Generative Adversarial Networks (GANs) can produce high-quality samples, but\ndo not provide an estimate of the probability density around the samples.\nHowever, it has been noted that maximizing the log-likelihood within an\nenergy-based setting can lead to an adversarial framework where the\ndiscriminator provides unnormalized density (often called energy). We further\ndevelop this perspective, incorporate importance sampling, and show that 1)\nWasserstein GAN performs a biased estimate of the partition function, and we\npropose instead to use an unbiased estimator; and 2) when optimizing for\nlikelihood, one must maximize generator entropy. This is hypothesized to\nprovide a better mode coverage. Different from previous works, we explicitly\ncompute the density of the generated samples. This is the key enabler to\ndesigning an unbiased estimator of the partition function and computation of\nthe generator entropy term. The generator density is obtained via a new type of\nflow network, called one-way flow network, that is less constrained in terms of\narchitecture, as it does not require a tractable inverse function. Our\nexperimental results show that our method converges faster, produces comparable\nsample quality to GANs with similar architecture, successfully avoids\nover-fitting to commonly used datasets and produces smooth low-dimensional\nlatent representations of the training data.\n","authors":["Omri Ben-Dov","Pravir Singh Gupta","Victoria Abrevaya","Michael J. 
Black","Partha Ghosh"],"pdf_url":"https://arxiv.org/pdf/2307.09882v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09183v2","updated":"2023-09-07T06:41:21Z","published":"2023-08-17T20:54:39Z","title":"RatGPT: Turning online LLMs into Proxies for Malware Attacks","summary":" The evolution of Generative AI and the capabilities of the newly released\nLarge Language Models (LLMs) open new opportunities in software engineering.\nHowever, they also lead to new challenges in cybersecurity. Recently,\nresearchers have shown the possibilities of using LLMs such as ChatGPT to\ngenerate malicious content that can directly be exploited or guide\ninexperienced hackers to weaponize tools and code. These studies covered\nscenarios that still require the attacker to be in the middle of the loop. In\nthis study, we leverage openly available plugins and use an LLM as proxy\nbetween the attacker and the victim. We deliver a proof-of-concept where\nChatGPT is used for the dissemination of malicious software while evading\ndetection, alongside establishing the communication to a command and control\n(C2) server to receive commands to interact with a victim's system. Finally, we\npresent the general approach as well as essential elements in order to stay\nundetected and make the attack a success. This proof-of-concept highlights\nsignificant cybersecurity issues with openly available plugins and LLMs, which\nrequire the development of security guidelines, controls, and mitigation\nstrategies.\n","authors":["Mika Beckerich","Laura Plein","Sergio Coronado"],"pdf_url":"https://arxiv.org/pdf/2308.09183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09241v4","updated":"2023-09-07T06:34:37Z","published":"2023-05-16T07:40:05Z","title":"Unlearnable Examples Give a False Sense of Security: Piercing through\n Unexploitable Data with Learnable Examples","summary":" Safeguarding data from unauthorized exploitation is vital for privacy and\nsecurity, especially in recent rampant research in security breach such as\nadversarial/membership attacks. To this end, \\textit{unlearnable examples}\n(UEs) have been recently proposed as a compelling protection, by adding\nimperceptible perturbation to data so that models trained on them cannot\nclassify them accurately on original clean distribution. Unfortunately, we find\nUEs provide a false sense of security, because they cannot stop unauthorized\nusers from utilizing other unprotected data to remove the protection, by\nturning unlearnable data into learnable again. Motivated by this observation,\nwe formally define a new threat by introducing \\textit{learnable unauthorized\nexamples} (LEs) which are UEs with their protection removed. The core of this\napproach is a novel purification process that projects UEs onto the manifold of\nLEs. This is realized by a new joint-conditional diffusion model which denoises\nUEs conditioned on the pixel and perceptual similarity between UEs and LEs.\nExtensive experiments demonstrate that LE delivers state-of-the-art countering\nperformance against both supervised UEs and unsupervised UEs in various\nscenarios, which is the first generalizable countermeasure to UEs across\nsupervised learning and unsupervised learning. 
Our code is available at\n\\url{https://github.com/jiangw-0/LE_JCDP}.\n","authors":["Wan Jiang","Yunfeng Diao","He Wang","Jianxin Sun","Meng Wang","Richang Hong"],"pdf_url":"https://arxiv.org/pdf/2305.09241v4.pdf","comment":"Accepted in MM 2023"},{"id":"http://arxiv.org/abs/2308.15116v2","updated":"2023-09-07T06:16:33Z","published":"2023-08-29T08:29:08Z","title":"Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of\n Protein Simulators","summary":" Molecular dynamics simulations have emerged as a fundamental instrument for\nstudying biomolecules. At the same time, it is desirable to perform simulations\nof a collection of particles under various conditions in which the molecules\ncan fluctuate. In this paper, we explore and adapt the soft prompt-based\nlearning method to molecular dynamics tasks. Our model can remarkably\ngeneralize to unseen and out-of-distribution scenarios with limited training\ndata. While our work focuses on temperature as a test case, the versatility of\nour approach allows for efficient simulation through any continuous dynamic\nconditions, such as pressure and volumes. Our framework has two stages: 1)\nPre-trains with data mixing technique, augments molecular structure data and\ntemperature prompts, then applies a curriculum learning method by increasing\nthe ratio of them smoothly. 2) Meta-learning-based fine-tuning framework\nimproves sample-efficiency of fine-tuning process and gives the soft\nprompt-tuning better initialization points. Comprehensive experiments reveal\nthat our framework excels in accuracy for in-domain data and demonstrates\nstrong generalization capabilities for unseen and out-of-distribution samples.\n","authors":["Jingbang Chen","Yian Wang","Xingwei Qu","Shuangjia Zheng","Yaodong Yang","Hao Dong","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2308.15116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03487v1","updated":"2023-09-07T05:45:47Z","published":"2023-09-07T05:45:47Z","title":"Privacy-preserving Continual Federated Clustering via Adaptive Resonance\n Theory","summary":" With the increasing importance of data privacy protection, various\nprivacy-preserving machine learning methods have been proposed. In the\nclustering domain, various algorithms with a federated learning framework\n(i.e., federated clustering) have been actively studied and showed high\nclustering performance while preserving data privacy. However, most of the base\nclusterers (i.e., clustering algorithms) used in existing federated clustering\nalgorithms need to specify the number of clusters in advance. These algorithms,\ntherefore, are unable to deal with data whose distributions are unknown or\ncontinually changing. To tackle this problem, this paper proposes a\nprivacy-preserving continual federated clustering algorithm. In the proposed\nalgorithm, an adaptive resonance theory-based clustering algorithm capable of\ncontinual learning is used as a base clusterer. Therefore, the proposed\nalgorithm inherits the ability of continual learning. Experimental results with\nsynthetic and real-world datasets show that the proposed algorithm has superior\nclustering performance to state-of-the-art federated clustering algorithms\nwhile realizing data privacy protection and continual learning ability. 
The\nsource code is available at \\url{https://github.com/Masuyama-lab/FCAC}.\n","authors":["Naoki Masuyama","Yusuke Nojima","Yuichiro Toda","Chu Kiong Loo","Hisao Ishibuchi","Naoyuki Kubota"],"pdf_url":"https://arxiv.org/pdf/2309.03487v1.pdf","comment":"This paper is currently under review. arXiv admin note: substantial\n text overlap with arXiv:2305.01507"},{"id":"http://arxiv.org/abs/2307.14971v2","updated":"2023-09-07T05:44:37Z","published":"2023-07-27T16:07:03Z","title":"Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models","summary":" With the overwhelming trend of mask image modeling led by MAE, generative\npre-training has shown a remarkable potential to boost the performance of\nfundamental models in 2D vision. However, in 3D vision, the over-reliance on\nTransformer-based backbones and the unordered nature of point clouds have\nrestricted the further development of generative pre-training. In this paper,\nwe propose a novel 3D-to-2D generative pre-training method that is adaptable to\nany point cloud model. We propose to generate view images from different\ninstructed poses via the cross-attention mechanism as the pre-training scheme.\nGenerating view images has more precise supervision than its point cloud\ncounterpart, thus assisting 3D backbones to have a finer comprehension of the\ngeometrical structure and stereoscopic relations of the point cloud.\nExperimental results have proved the superiority of our proposed 3D-to-2D\ngenerative pre-training over previous pre-training methods. Our method is also\neffective in boosting the performance of architecture-oriented approaches,\nachieving state-of-the-art performance when fine-tuning on ScanObjectNN\nclassification and ShapeNetPart segmentation tasks. Code is available at\nhttps://github.com/wangzy22/TAP.\n","authors":["Ziyi Wang","Xumin Yu","Yongming Rao","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2307.14971v2.pdf","comment":"Accepted to ICCV 2023, project page: https://tap.ivg-research.xyz"},{"id":"http://arxiv.org/abs/2212.04031v2","updated":"2023-09-07T03:55:37Z","published":"2022-12-08T02:03:21Z","title":"On Root Cause Localization and Anomaly Mitigation through Causal\n Inference","summary":" Due to a wide spectrum of applications in the real world, such as security,\nfinancial surveillance, and health risk, various deep anomaly detection models\nhave been proposed and achieved state-of-the-art performance. However, besides\nbeing effective, in practice, the practitioners would further like to know what\ncauses the abnormal outcome and how to further fix it. In this work, we propose\nRootCLAM, which aims to achieve Root Cause Localization and Anomaly Mitigation\nfrom a causal perspective. Especially, we formulate anomalies caused by\nexternal interventions on the normal causal mechanism and aim to locate the\nabnormal features with external interventions as root causes. After that, we\nfurther propose an anomaly mitigation approach that aims to recommend\nmitigation actions on abnormal features to revert the abnormal outcomes such\nthat the counterfactuals guided by the causal mechanism are normal. 
Experiments\non three datasets show that our approach can locate the root causes and further\nflip the abnormal labels.\n","authors":["Xiao Han","Lu Zhang","Yongkai Wu","Shuhan Yuan"],"pdf_url":"https://arxiv.org/pdf/2212.04031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03469v1","updated":"2023-09-07T03:34:51Z","published":"2023-09-07T03:34:51Z","title":"Fast FixMatch: Faster Semi-Supervised Learning with Curriculum Batch\n Size","summary":" Advances in Semi-Supervised Learning (SSL) have almost entirely closed the\ngap between SSL and Supervised Learning at a fraction of the number of labels.\nHowever, recent performance improvements have often come \\textit{at the cost of\nsignificantly increased training computation}. To address this, we propose\nCurriculum Batch Size (CBS), \\textit{an unlabeled batch size curriculum which\nexploits the natural training dynamics of deep neural networks.} A small\nunlabeled batch size is used in the beginning of training and is gradually\nincreased to the end of training. A fixed curriculum is used regardless of\ndataset, model or number of epochs, and reduced training computations is\ndemonstrated on all settings. We apply CBS, strong labeled augmentation,\nCurriculum Pseudo Labeling (CPL) \\citep{FlexMatch} to FixMatch \\citep{FixMatch}\nand term the new SSL algorithm Fast FixMatch. We perform an ablation study to\nshow that strong labeled augmentation and/or CPL do not significantly reduce\ntraining computations, but, in synergy with CBS, they achieve optimal\nperformance. Fast FixMatch also achieves substantially higher data utilization\ncompared to previous state-of-the-art. Fast FixMatch achieves between\n$2.1\\times$ - $3.4\\times$ reduced training computations on CIFAR-10 with all\nbut 40, 250 and 4000 labels removed, compared to vanilla FixMatch, while\nattaining the same cited state-of-the-art error rate \\citep{FixMatch}. Similar\nresults are achieved for CIFAR-100, SVHN and STL-10. Finally, Fast MixMatch\nachieves between $2.6\\times$ - $3.3\\times$ reduced training computations in\nfederated SSL tasks and online/streaming learning SSL tasks, which further\ndemonstrate the generializbility of Fast MixMatch to different scenarios and\ntasks.\n","authors":["John Chen","Chen Dun","Anastasios Kyrillidis"],"pdf_url":"https://arxiv.org/pdf/2309.03469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03468v1","updated":"2023-09-07T03:33:49Z","published":"2023-09-07T03:33:49Z","title":"Cross-Image Context Matters for Bongard Problems","summary":" Current machine learning methods struggle to solve Bongard problems, which\nare a type of IQ test that requires deriving an abstract \"concept\" from a set\nof positive and negative \"support\" images, and then classifying whether or not\na new query image depicts the key concept. On Bongard-HOI, a benchmark for\nnatural-image Bongard problems, existing methods have only reached 66% accuracy\n(where chance is 50%). Low accuracy is often attributed to neural nets' lack of\nability to find human-like symbolic rules. In this work, we point out that many\nexisting methods are forfeiting accuracy due to a much simpler problem: they do\nnot incorporate information contained in the support set as a whole, and rely\ninstead on information extracted from individual supports. 
This is a critical\nissue, because unlike in few-shot learning tasks concerning object\nclassification, the \"key concept\" in a typical Bongard problem can only be\ndistinguished using multiple positives and multiple negatives. We explore a\nvariety of simple methods to take this cross-image context into account, and\ndemonstrate substantial gains over prior methods, leading to new\nstate-of-the-art performance on Bongard-LOGO (75.3%) and Bongard-HOI (72.45%)\nand strong performance on the original Bongard problem set (60.84%).\n","authors":["Nikhil Raghuraman","Adam W. Harley","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2309.03468v1.pdf","comment":"Main paper: 7 pages, Appendix: 10 pages, 30 figures. Code:\n https://github.com/nraghuraman/bongard-context"},{"id":"http://arxiv.org/abs/2303.04436v2","updated":"2023-09-07T03:25:36Z","published":"2023-03-08T08:31:06Z","title":"A comparison of rational and neural network based approximations","summary":" Rational and neural network based approximations are efficient tools in\nmodern approximation. These approaches are able to produce accurate\napproximations to nonsmooth and non-Lipschitz functions, including multivariate\ndomain functions. In this paper we compare the efficiency of function\napproximation using rational approximation, neural networks and their\ncombinations. It was found that rational approximation is superior to neural\nnetwork based approaches with the same number of decision variables. Our\nnumerical experiments demonstrate the efficiency of rational approximation,\neven when the number of approximation parameters (that is, the dimension of the\ncorresponding optimisation problems) is small. Another important contribution\nof this paper lies in the improvement of rational approximation algorithms.\nNamely, the optimisation based algorithms for rational approximation can be\nadjusted in such a way that the condition number of the constraint\nmatrices is controlled. This simple adjustment enables us to work with high-dimensional\noptimisation problems and improve the design of the neural network.\nThe main strength of neural networks is in their ability to handle models with\na large number of variables: complex models are decomposed into several simple\noptimisation problems. Therefore the large number of decision variables is\nin the nature of neural networks.\n","authors":["Vinesha Peiris","Reinier Diaz Millan","Nadezda Sukhorukova","Julien Ugon"],"pdf_url":"https://arxiv.org/pdf/2303.04436v2.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2309.03169v2","updated":"2023-09-07T03:13:39Z","published":"2023-09-06T17:09:43Z","title":"Impression-Informed Multi-Behavior Recommender System: A Hierarchical\n Graph Attention Approach","summary":" While recommender systems have significantly benefited from implicit\nfeedback, they have often missed the nuances of multi-behavior interactions\nbetween users and items. Historically, these systems either amalgamated all\nbehaviors, such as \\textit{impression} (formerly \\textit{view}),\n\\textit{add-to-cart}, and \\textit{buy}, under a singular 'interaction' label,\nor prioritized only the target behavior, often the \\textit{buy} action,\ndiscarding valuable auxiliary signals. Although recent advancements tried\naddressing this simplification, they primarily gravitated towards optimizing\nthe target behavior alone, battling with data scarcity. Additionally, they\ntended to bypass the nuanced hierarchy intrinsic to behaviors. 
To bridge these\ngaps, we introduce the \\textbf{H}ierarchical \\textbf{M}ulti-behavior\n\\textbf{G}raph Attention \\textbf{N}etwork (HMGN). This pioneering framework\nleverages attention mechanisms to discern information from both inter and\nintra-behaviors while employing a multi-task Hierarchical Bayesian Personalized\nRanking (HBPR) for optimization. Recognizing the need for scalability, our\napproach integrates a specialized multi-behavior sub-graph sampling technique.\nMoreover, the adaptability of HMGN allows for the seamless inclusion of\nknowledge metadata and time-series data. Empirical results attest to our\nmodel's prowess, registering a notable performance boost of up to 64\\% in\nNDCG@100 metrics over conventional graph neural network methods.\n","authors":["Dong Li","Divya Bhargavi","Vidya Sagar Ravipati"],"pdf_url":"https://arxiv.org/pdf/2309.03169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01108v2","updated":"2023-09-07T02:51:01Z","published":"2023-09-03T07:44:38Z","title":"Acoustic-to-articulatory inversion for dysarthric speech: Are\n pre-trained self-supervised representations favorable?","summary":" $ $Acoustic-to-articulatory inversion (AAI) involves mapping from the\nacoustic space to the articulatory space. Signal-processing features like the\nMFCCs, have been widely used for the AAI task. For subjects with dysarthric\nspeech, AAI is challenging because of an imprecise and indistinct\npronunciation. In this work, we perform AAI for dysarthric speech using\nrepresentations from pre-trained self-supervised learning (SSL) models. We\ndemonstrate the impact of different pre-trained features on this challenging\nAAI task, at low-resource conditions. In addition, we also condition x-vectors\nto the extracted SSL features to train a BLSTM network. In the seen case, we\nexperiment with three AAI training schemes (subject-specific, pooled, and\nfine-tuned). The results, consistent across training schemes, reveal that\nDeCoAR, in the fine-tuned scheme, achieves a relative improvement of the\nPearson Correlation Coefficient (CC) by ${\\sim}$1.81\\% and ${\\sim}$4.56\\% for\nhealthy controls and patients, respectively, over MFCCs. In the unseen case, we\nobserve similar average trends for different SSL features. Overall, SSL\nnetworks like wav2vec, APC, and DeCoAR, which are trained with feature\nreconstruction or future timestep prediction tasks, perform well in predicting\ndysarthric articulatory trajectories.\n","authors":["Sarthak Kumar Maharana","Krishna Kamal Adidam","Shoumik Nandi","Ajitesh Srivastava"],"pdf_url":"https://arxiv.org/pdf/2309.01108v2.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2303.13746v2","updated":"2023-09-07T02:43:57Z","published":"2023-03-24T01:58:58Z","title":"Achieving Occam's Razor: Deep Learning for Optimal Model Reduction","summary":" All fields of science depend on mathematical models. Occam's razor refers to\nthe principle that good models should exclude parameters beyond those minimally\nrequired to describe the systems they represent. This is because redundancy can\nlead to incorrect estimates of model parameters from data, and thus inaccurate\nor ambiguous conclusions. Here, we show how deep learning can be powerfully\nleveraged to address Occam's razor. FixFit, our new method, uses a feedforward\ndeep neural network with a bottleneck layer to characterize and predict the\nbehavior of a given model from its input parameters. FixFit has three major\nbenefits. 
First, it provides a metric to quantify the original model's degree\nof complexity. Second, it allows for the unique fitting of data. Third, it\nprovides an unbiased way to discriminate between experimental hypotheses that\nadd value versus those that do not. In two use cases, we demonstrate the broad\napplicability of this method across scientific domains. To validate the method\nusing a known system, we apply FixFit to recover known composite parameters for\nthe Kepler orbit model. To illustrate how the method can be applied to less\nwell-established fields, we use it to identify parameters for a multi-scale\nbrain model and reduce the search space for viable candidate mechanisms.\n","authors":["Botond B Antal","Anthony G Chesebro","Helmut H Strey","Lilianne R Mujica-Parodi","Corey Weistuch"],"pdf_url":"https://arxiv.org/pdf/2303.13746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.12198v2","updated":"2023-09-07T02:27:09Z","published":"2022-01-28T15:53:30Z","title":"Limitation of Characterizing Implicit Regularization by Data-independent\n Functions","summary":" In recent years, understanding the implicit regularization of neural networks\n(NNs) has become a central task in deep learning theory. However, implicit\nregularization is itself not completely defined and well understood. In this\nwork, we attempt to mathematically define and study implicit regularization.\nImportantly, we explore the limitations of a common approach to characterizing\nimplicit regularization using data-independent functions. We propose two\ndynamical mechanisms, i.e., Two-point and One-point Overlapping mechanisms,\nbased on which we provide two recipes for producing classes of\none-hidden-neuron NNs that provably cannot be fully characterized by a type of\nor all data-independent functions. Following the previous works, our results\nfurther emphasize the profound data dependency of implicit regularization in\ngeneral, inspiring us to study in detail the data dependency of NN implicit\nregularization in the future.\n","authors":["Leyang Zhang","Zhi-Qin John Xu","Tao Luo","Yaoyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2201.12198v2.pdf","comment":"Revised the structure of paper and added results about implicit\n regularization in training two-layer network or even more general \"activation\n function-related\" models"},{"id":"http://arxiv.org/abs/2309.03452v1","updated":"2023-09-07T02:26:55Z","published":"2023-09-07T02:26:55Z","title":"Multi-Modality Guidance Network For Missing Modality Inference","summary":" Multimodal models have gained significant success in recent years. Standard\nmultimodal approaches often assume unchanged modalities from training stage to\ninference stage. In practice, however, many scenarios fail to satisfy such\nassumptions with missing modalities during inference, leading to limitations on\nwhere multimodal models can be applied. While existing methods mitigate the\nproblem through reconstructing the missing modalities, it increases unnecessary\ncomputational cost, which could be just as critical, especially for large,\ndeployed systems. To solve the problem from both sides, we propose a novel\nguidance network that promotes knowledge sharing during training, taking\nadvantage of the multimodal representations to train better single-modality\nmodels for inference. 
Real-life experiment in violence detection shows that our\nproposed framework trains single-modality models that significantly outperform\nits traditionally trained counterparts while maintaining the same inference\ncost.\n","authors":["Zhuokai Zhao","Harish Palani","Tianyi Liu","Lena Evans","Ruth Toner"],"pdf_url":"https://arxiv.org/pdf/2309.03452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03451v1","updated":"2023-09-07T02:26:32Z","published":"2023-09-07T02:26:32Z","title":"Cross-domain Sound Recognition for Efficient Underwater Data Analysis","summary":" This paper presents a novel deep learning approach for analyzing massive\nunderwater acoustic data by leveraging a model trained on a broad spectrum of\nnon-underwater (aerial) sounds. Recognizing the challenge in labeling vast\namounts of underwater data, we propose a two-fold methodology to accelerate\nthis labor-intensive procedure.\n The first part of our approach involves PCA and UMAP visualization of the\nunderwater data using the feature vectors of an aerial sound recognition model.\nThis enables us to cluster the data in a two dimensional space and listen to\npoints within these clusters to understand their defining characteristics. This\ninnovative method simplifies the process of selecting candidate labels for\nfurther training.\n In the second part, we train a neural network model using both the selected\nunderwater data and the non-underwater dataset. We conducted a quantitative\nanalysis to measure the precision, recall, and F1 score of our model for\nrecognizing airgun sounds, a common type of underwater sound. The F1 score\nachieved by our model exceeded 84.3%, demonstrating the effectiveness of our\napproach in analyzing underwater acoustic data.\n The methodology presented in this paper holds significant potential to reduce\nthe amount of labor required in underwater data analysis and opens up new\npossibilities for further research in the field of cross-domain data analysis.\n","authors":["Jeongsoo Park","Dong-Gyun Han","Hyoung Sul La","Sangmin Lee","Yoonchang Han","Eun-Jin Yang"],"pdf_url":"https://arxiv.org/pdf/2309.03451v1.pdf","comment":"Accepted to APSIPA 2023"},{"id":"http://arxiv.org/abs/2309.03450v1","updated":"2023-09-07T02:20:03Z","published":"2023-09-07T02:20:03Z","title":"XGen-7B Technical Report","summary":" Large Language Models (LLMs) have become ubiquitous across various domains,\ntransforming the way we interact with information and conduct research.\nHowever, most high-performing LLMs remain confined behind proprietary walls,\nhindering scientific progress. Most open-source LLMs, on the other hand, are\nlimited in their ability to support longer sequence lengths, which is a key\nrequirement for many tasks that require inference over an input context. To\naddress this, we have trained XGen, a series of 7B parameter models on up to 8K\nsequence length for up to 1.5T tokens. We have also finetuned the XGen models\non public-domain instructional data, creating their instruction-tuned\ncounterparts (XGen-Inst). We open-source our models for both research\nadvancements and commercial applications. Our evaluation on standard benchmarks\nshows that XGen models achieve comparable or better results when compared with\nstate-of-the-art open-source LLMs. 
Our targeted evaluation on long sequence\nmodeling tasks shows the benefits of our 8K-sequence models over 2K-sequence\nopen-source LLMs.\n","authors":["Erik Nijkamp","Tian Xie","Hiroaki Hayashi","Bo Pang","Congying Xia","Chen Xing","Jesse Vig","Semih Yavuz","Philippe Laban","Ben Krause","Senthil Purushwalkam","Tong Niu","Wojciech Kryściński","Lidiya Murakhovs'ka","Prafulla Kumar Choubey","Alex Fabbri","Ye Liu","Rui Meng","Lifu Tu","Meghana Bhat","Chien-Sheng Wu","Silvio Savarese","Yingbo Zhou","Shafiq Joty","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2309.03450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03447v1","updated":"2023-09-07T02:08:30Z","published":"2023-09-07T02:08:30Z","title":"Broadband Ground Motion Synthesis via Generative Adversarial Neural\n Operators: Development and Validation","summary":" We present a data-driven model for ground-motion synthesis using a Generative\nAdversarial Neural Operator (GANO) that combines recent advancements in machine\nlearning and open access strong motion data sets to generate three-component\nacceleration time histories conditioned on moment magnitude ($M$), rupture\ndistance ($R_{rup}$), time-average shear-wave velocity at the top $30m$\n($V_{S30}$), and tectonic environment or style of faulting. We use Neural\nOperators, a resolution invariant architecture that guarantees that the model\ntraining is independent of the data sampling frequency. We first present the\nconditional ground-motion synthesis algorithm (referred to heretofore as\ncGM-GANO) and discuss its advantages compared to previous work. Next, we verify\nthe cGM-GANO framework using simulated ground motions generated with the\nSouthern California Earthquake Center (SCEC) Broadband Platform (BBP). We\nlastly train cGM-GANO on a KiK-net dataset from Japan, showing that the\nframework can recover the magnitude, distance, and $V_{S30}$ scaling of Fourier\namplitude and pseudo-spectral accelerations. We evaluate cGM-GANO through\nresidual analysis with the empirical dataset as well as by comparison with\nconventional Ground Motion Models (GMMs) for selected ground motion scenarios.\nResults show that cGM-GANO produces consistent median scaling with the GMMs for\nthe corresponding tectonic environments. The largest misfit is observed at\nshort distances due to the scarcity of training data. With the exception of\nshort distances, the aleatory variability of the response spectral ordinates is\nalso well captured, especially for subduction events due to the adequacy of\ntraining data. Applications of the presented framework include generation of\nrisk-targeted ground motions for site-specific engineering applications.\n","authors":["Yaozhong Shi","Grigorios Lavrentiadis","Domniki Asimaki","Zachary E. Ross","Kamyar Azizzadenesheli"],"pdf_url":"https://arxiv.org/pdf/2309.03447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03428v4","updated":"2023-09-07T01:49:57Z","published":"2023-03-06T19:00:27Z","title":"Towards provably efficient quantum algorithms for large-scale\n machine-learning models","summary":" Large machine learning models are revolutionary technologies of artificial\nintelligence whose bottlenecks include huge computational expenses, power, and\ntime used both in the pre-training and fine-tuning process. 
In this work, we\nshow that fault-tolerant quantum computing could possibly provide provably\nefficient resolutions for generic (stochastic) gradient descent algorithms,\nscaling as $\\mathcal{O}(T^2 \\times \\text{polylog}(n))$, where $n$ is the size\nof the models and $T$ is the number of iterations in the training, as long as\nthe models are both sufficiently dissipative and sparse, with small learning\nrates. Based on earlier efficient quantum algorithms for dissipative\ndifferential equations, we find and prove that similar algorithms work for\n(stochastic) gradient descent, the primary algorithm for machine learning. In\npractice, we benchmark instances of large machine learning models from 7\nmillion to 103 million parameters. We find that, in the context of sparse\ntraining, a quantum enhancement is possible at the early stage of learning\nafter model pruning, motivating a sparse parameter download and re-upload\nscheme. Our work shows solidly that fault-tolerant quantum algorithms could\npotentially contribute to most state-of-the-art, large-scale machine-learning\nproblems.\n","authors":["Junyu Liu","Minzhao Liu","Jin-Peng Liu","Ziyu Ye","Yunfei Wang","Yuri Alexeev","Jens Eisert","Liang Jiang"],"pdf_url":"https://arxiv.org/pdf/2303.03428v4.pdf","comment":"6+39 pages, 3+10 figures, substantial detail added"},{"id":"http://arxiv.org/abs/2302.14051v2","updated":"2023-09-07T01:47:22Z","published":"2023-02-27T18:59:55Z","title":"Internet Explorer: Targeted Representation Learning on the Open Web","summary":" Modern vision models typically rely on fine-tuning general-purpose models\npre-trained on large, static datasets. These general-purpose models only\ncapture the knowledge within their pre-training datasets, which are tiny,\nout-of-date snapshots of the Internet -- where billions of images are uploaded\neach day. We suggest an alternate approach: rather than hoping our static\ndatasets transfer to our desired tasks after large-scale pre-training, we\npropose dynamically utilizing the Internet to quickly train a small-scale model\nthat does extremely well on the task at hand. Our approach, called Internet\nExplorer, explores the web in a self-supervised manner to progressively find\nrelevant examples that improve performance on a desired target dataset. It\ncycles between searching for images on the Internet with text queries,\nself-supervised training on downloaded images, determining which images were\nuseful, and prioritizing what to search for next. We evaluate Internet Explorer\nacross several datasets and show that it outperforms or matches CLIP oracle\nperformance by using just a single GPU desktop to actively query the Internet\nfor 30--40 hours. Results, visualizations, and videos at\nhttps://internet-explorer-ssl.github.io/\n","authors":["Alexander C. Li","Ellis Brown","Alexei A. Efros","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2302.14051v2.pdf","comment":"In ICML 2023. Website at https://internet-explorer-ssl.github.io/"},{"id":"http://arxiv.org/abs/2309.03440v1","updated":"2023-09-07T01:46:17Z","published":"2023-09-07T01:46:17Z","title":"Punctate White Matter Lesion Segmentation in Preterm Infants Powered by\n Counterfactually Generative Learning","summary":" Accurate segmentation of punctate white matter lesions (PWMLs) are\nfundamental for the timely diagnosis and treatment of related developmental\ndisorders. 
Automated PWMLs segmentation from infant brain MR images is\nchallenging, considering that the lesions are typically small and low-contrast,\nand the number of lesions may dramatically change across subjects. Existing\nlearning-based methods directly apply general network architectures to this\nchallenging task, which may fail to capture detailed positional information of\nPWMLs, potentially leading to severe under-segmentations. In this paper, we\npropose to leverage the idea of counterfactual reasoning coupled with the\nauxiliary task of brain tissue segmentation to learn fine-grained positional\nand morphological representations of PWMLs for accurate localization and\nsegmentation. A simple and easy-to-implement deep-learning framework (i.e.,\nDeepPWML) is accordingly designed. It combines the lesion counterfactual map\nwith the tissue probability map to train a lightweight PWML segmentation\nnetwork, demonstrating state-of-the-art performance on a real-clinical dataset\nof infant T1w MR images. The code is available at\n\\href{https://github.com/ladderlab-xjtu/DeepPWML}{https://github.com/ladderlab-xjtu/DeepPWML}.\n","authors":["Zehua Ren","Yongheng Sun","Miaomiao Wang","Yuying Feng","Xianjun Li","Chao Jin","Jian Yang","Chunfeng Lian","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03440v1.pdf","comment":"10 pages, 3 figures, Medical Image Computing and Computer Assisted\n Intervention(MICCAI)"},{"id":"http://arxiv.org/abs/2309.03439v1","updated":"2023-09-07T01:43:47Z","published":"2023-09-07T01:43:47Z","title":"Personalized Tucker Decomposition: Modeling Commonality and Peculiarity\n on Tensor Data","summary":" We propose personalized Tucker decomposition (perTucker) to address the\nlimitations of traditional tensor decomposition methods in capturing\nheterogeneity across different datasets. perTucker decomposes tensor data into\nshared global components and personalized local components. We introduce a mode\northogonality assumption and develop a proximal gradient regularized block\ncoordinate descent algorithm that is guaranteed to converge to a stationary\npoint. By learning unique and common representations across datasets, we\ndemonstrate perTucker's effectiveness in anomaly detection, client\nclassification, and clustering through a simulation study and two case studies\non solar flare detection and tonnage signal classification.\n","authors":["Jiuyun Hu","Naichen Shi","Raed Al Kontar","Hao Yan"],"pdf_url":"https://arxiv.org/pdf/2309.03439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03437v1","updated":"2023-09-07T01:39:02Z","published":"2023-09-07T01:39:02Z","title":"Byzantine-Robust Federated Learning with Variance Reduction and\n Differential Privacy","summary":" Federated learning (FL) is designed to preserve data privacy during model\ntraining, where the data remains on the client side (i.e., IoT devices), and\nonly model updates of clients are shared iteratively for collaborative\nlearning. However, this process is vulnerable to privacy attacks and Byzantine\nattacks: the local model updates shared throughout the FL network will leak\nprivate information about the local training data, and they can also be\nmaliciously crafted by Byzantine attackers to disturb the learning. In this\npaper, we propose a new FL scheme that guarantees rigorous privacy and\nsimultaneously enhances system robustness against Byzantine attacks. 
Our\napproach introduces sparsification- and momentum-driven variance reduction into\nthe client-level differential privacy (DP) mechanism, to defend against\nByzantine attackers. The security design does not violate the privacy guarantee\nof the client-level DP mechanism; hence, our approach achieves the same\nclient-level DP guarantee as the state-of-the-art. We conduct extensive\nexperiments on both IID and non-IID datasets and different tasks and evaluate\nthe performance of our approach against different Byzantine attacks by\ncomparing it with state-of-the-art defense methods. The results of our\nexperiments show the efficacy of our framework and demonstrate its ability to\nimprove system robustness against Byzantine attacks while achieving a strong\nprivacy guarantee.\n","authors":["Zikai Zhang","Rui Hu"],"pdf_url":"https://arxiv.org/pdf/2309.03437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02556v2","updated":"2023-09-07T01:29:40Z","published":"2023-09-05T19:45:27Z","title":"Domain Adaptation for Efficiently Fine-tuning Vision Transformer with\n Encrypted Images","summary":" In recent years, deep neural networks (DNNs) trained with transformed data\nhave been applied to various applications such as privacy-preserving learning,\naccess control, and adversarial defenses. However, the use of transformed data\ndecreases the performance of models. Accordingly, in this paper, we propose a\nnovel method for fine-tuning models with transformed images under the use of\nthe vision transformer (ViT). The proposed domain adaptation method does not\ncause the accuracy degradation of models, and it is carried out on the basis of\nthe embedding structure of ViT. In experiments, we confirmed that the proposed\nmethod prevents accuracy degradation even when using encrypted images with the\nCIFAR-10 and CIFAR-100 datasets.\n","authors":["Teru Nagamori","Sayaka Shiota","Hitoshi Kiya"],"pdf_url":"https://arxiv.org/pdf/2309.02556v2.pdf","comment":"Accepted by APSIPA 2023"},{"id":"http://arxiv.org/abs/2309.03426v1","updated":"2023-09-07T01:10:01Z","published":"2023-09-07T01:10:01Z","title":"Equal Long-term Benefit Rate: Adapting Static Fairness Notions to\n Sequential Decision Making","summary":" Decisions made by machine learning models may have lasting impacts over time,\nmaking long-term fairness a crucial consideration. It has been shown that when\nignoring the long-term effect, naively imposing fairness criterion in static\nsettings can actually exacerbate bias over time. To explicitly address biases\nin sequential decision-making, recent works formulate long-term fairness\nnotions in Markov Decision Process (MDP) framework. They define the long-term\nbias to be the sum of static bias over each time step. However, we demonstrate\nthat naively summing up the step-wise bias can cause a false sense of fairness\nsince it fails to consider the importance difference of different time steps\nduring transition. In this work, we introduce a long-term fairness notion\ncalled Equal Long-term Benefit Rate (ELBERT), which explicitly considers\nvarying temporal importance and adapts static fairness principles to the\nsequential setting. Moreover, we show that the policy gradient of Long-term\nBenefit Rate can be analytically reduced to standard policy gradient. This\nmakes standard policy optimization methods applicable for reducing the bias,\nleading to our proposed bias mitigation method ELBERT-PO. 
Experiments on three\nsequential decision making environments show that ELBERT-PO significantly\nreduces bias and maintains high utility. Code is available at\nhttps://github.com/Yuancheng-Xu/ELBERT.\n","authors":["Yuancheng Xu","Chenghao Deng","Yanchao Sun","Ruijie Zheng","Xiyao Wang","Jieyu Zhao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2309.03426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02685v2","updated":"2023-09-07T00:46:47Z","published":"2023-09-06T03:42:20Z","title":"Diffusion-EDFs: Bi-equivariant Denoising Generative Modeling on SE(3)\n for Visual Robotic Manipulation","summary":" Recent studies have verified that equivariant methods can significantly\nimprove the data efficiency, generalizability, and robustness in robot\nlearning. Meanwhile, denoising diffusion-based generative modeling has recently\ngained significant attention as a promising approach for robotic manipulation\nlearning from demonstrations with stochastic behaviors. In this paper, we\npresent Diffusion-EDFs, a novel approach that incorporates spatial\nroto-translation equivariance, i.e., SE(3)-equivariance to diffusion generative\nmodeling. By integrating SE(3)-equivariance into our model architectures, we\ndemonstrate that our proposed method exhibits remarkable data efficiency,\nrequiring only 5 to 10 task demonstrations for effective end-to-end training.\nFurthermore, our approach showcases superior generalizability compared to\nprevious diffusion-based manipulation methods.\n","authors":["Hyunwoo Ryu","Jiwoo Kim","Junwoo Chang","Hyun Seok Ahn","Joohwan Seo","Taehan Kim","Yubin Kim","Jongeun Choi","Roberto Horowitz"],"pdf_url":"https://arxiv.org/pdf/2309.02685v2.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2205.12250v2","updated":"2023-09-07T00:44:17Z","published":"2022-05-24T17:56:56Z","title":"Efficient anti-symmetrization of a neural network layer by taming the\n sign problem","summary":" Explicit antisymmetrization of a neural network is a potential candidate for\na universal function approximator for generic antisymmetric functions, which\nare ubiquitous in quantum physics. However, this procedure is a priori\nfactorially costly to implement, making it impractical for large numbers of\nparticles. The strategy also suffers from a sign problem. Namely, due to\nnear-exact cancellation of positive and negative contributions, the magnitude\nof the antisymmetrized function may be significantly smaller than before\nanti-symmetrization. We show that the anti-symmetric projection of a two-layer\nneural network can be evaluated efficiently, opening the door to using a\ngeneric antisymmetric layer as a building block in anti-symmetric neural\nnetwork Ansatzes. This approximation is effective when the sign problem is\ncontrolled, and we show that this property depends crucially the choice of\nactivation function under standard Xavier/He initialization methods. As a\nconsequence, using a smooth activation function requires re-scaling of the\nneural network weights compared to standard initializations.\n","authors":["Nilin Abrahamsen","Lin Lin"],"pdf_url":"https://arxiv.org/pdf/2205.12250v2.pdf","comment":"To appear in JML, ISSN: 2790-2048(e), 2790-203X(p)"},{"id":"http://arxiv.org/abs/2307.06555v3","updated":"2023-09-07T00:22:22Z","published":"2023-07-13T04:46:05Z","title":"Deep Network Approximation: Beyond ReLU to Diverse Activation Functions","summary":" This paper explores the expressive power of deep neural networks for a\ndiverse range of activation functions. 
An activation function set $\\mathscr{A}$\nis defined to encompass the majority of commonly used activation functions,\nsuch as $\\mathtt{ReLU}$, $\\mathtt{LeakyReLU}$, $\\mathtt{ReLU}^2$,\n$\\mathtt{ELU}$, $\\mathtt{SELU}$, $\\mathtt{Softplus}$, $\\mathtt{GELU}$,\n$\\mathtt{SiLU}$, $\\mathtt{Swish}$, $\\mathtt{Mish}$, $\\mathtt{Sigmoid}$,\n$\\mathtt{Tanh}$, $\\mathtt{Arctan}$, $\\mathtt{Softsign}$, $\\mathtt{dSiLU}$, and\n$\\mathtt{SRS}$. We demonstrate that for any activation function $\\varrho\\in\n\\mathscr{A}$, a $\\mathtt{ReLU}$ network of width $N$ and depth $L$ can be\napproximated to arbitrary precision by a $\\varrho$-activated network of width\n$4N$ and depth $2L$ on any bounded set. This finding enables the extension of\nmost approximation results achieved with $\\mathtt{ReLU}$ networks to a wide\nvariety of other activation functions, at the cost of slightly larger\nconstants.\n","authors":["Shijun Zhang","Jianfeng Lu","Hongkai Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.06555v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03409v1","updated":"2023-09-07T00:07:15Z","published":"2023-09-07T00:07:15Z","title":"Large Language Models as Optimizers","summary":" Optimization is ubiquitous. While derivative-based algorithms have been\npowerful tools for various problems, the absence of gradient imposes challenges\non many real-world applications. In this work, we propose Optimization by\nPROmpting (OPRO), a simple and effective approach to leverage large language\nmodels (LLMs) as optimizers, where the optimization task is described in\nnatural language. In each optimization step, the LLM generates new solutions\nfrom the prompt that contains previously generated solutions with their values,\nthen the new solutions are evaluated and added to the prompt for the next\noptimization step. We first showcase OPRO on linear regression and traveling\nsalesman problems, then move on to prompt optimization where the goal is to\nfind instructions that maximize the task accuracy. With a variety of LLMs, we\ndemonstrate that the best prompts optimized by OPRO outperform human-designed\nprompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks.\n","authors":["Chengrun Yang","Xuezhi Wang","Yifeng Lu","Hanxiao Liu","Quoc V. Le","Denny Zhou","Xinyun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03409v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.03905v1","updated":"2023-09-07T17:59:45Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. 
During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v1.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2309.03827v1","updated":"2023-09-07T16:40:49Z","published":"2023-09-07T16:40:49Z","title":"ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation","summary":" High Dynamic Range (HDR) content creation has become an important topic for\nmodern media and entertainment sectors, gaming and Augmented/Virtual Reality\nindustries. Many methods have been proposed to recreate the HDR counterparts of\ninput Low Dynamic Range (LDR) images/videos given a single exposure or\nmulti-exposure LDRs. The state-of-the-art methods focus primarily on the\npreservation of the reconstruction's structural similarity and the pixel-wise\naccuracy. However, these conventional approaches do not emphasize preserving\nthe artistic intent of the images in terms of human visual perception, which is\nan essential element in media, entertainment and gaming. In this paper, we\nattempt to study and fill this gap. We propose an architecture called\nArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR\nfeatures as input. Experimental results show that ArtHDR-Net can achieve\nstate-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean\nopinion score index) while reaching competitive performance in terms of PSNR\nand SSIM.\n","authors":["Hrishav Bakul Barua","Ganesh Krishnasamy","KokSheik Wong","Kalin Stefanov","Abhinav Dhall"],"pdf_url":"https://arxiv.org/pdf/2309.03827v1.pdf","comment":"Accepted in Asia Pacific Signal and Information Processing\n Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan"},{"id":"http://arxiv.org/abs/2309.03815v1","updated":"2023-09-07T16:12:06Z","published":"2023-09-07T16:12:06Z","title":"T2IW: Joint Text to Image & Watermark Generation","summary":" Recent developments in text-conditioned image generative models have\nrevolutionized the production of realistic results. Unfortunately, this has\nalso led to an increase in privacy violations and the spread of false\ninformation, which requires the need for traceability, privacy protection, and\nother security measures. However, existing text-to-image paradigms lack the\ntechnical capabilities to link traceable messages with image generation. In\nthis study, we introduce a novel task for the joint generation of text to image\nand watermark (T2IW). This T2IW scheme ensures minimal damage to image quality\nwhen generating a compound image by forcing the semantic feature and the\nwatermark signal to be compatible in pixels. 
Additionally, by utilizing\nprinciples from Shannon information theory and non-cooperative game theory, we\nare able to separate the revealed image and the revealed watermark from the\ncompound image. Furthermore, we strengthen the watermark robustness of our\napproach by subjecting the compound image to various post-processing attacks,\nwith minimal pixel distortion observed in the revealed watermark. Extensive\nexperiments have demonstrated remarkable achievements in image quality,\nwatermark invisibility, and watermark robustness, supported by our proposed set\nof evaluation metrics.\n","authors":["An-An Liu","Guokai Zhang","Yuting Su","Ning Xu","Yongdong Zhang","Lanjun Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16215v3","updated":"2023-09-07T14:41:22Z","published":"2023-08-30T16:44:38Z","title":"Deep Video Codec Control","summary":" Lossy video compression is commonly used when transmitting and storing video\ndata. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,\ndespite the availability of advanced (neural) compression approaches.\nTransmitting videos in the face of dynamic network bandwidth conditions\nrequires video codecs to adapt to vastly different compression strengths. Rate\ncontrol modules augment the codec's compression such that bandwidth constraints\nare satisfied and video distortion is minimized. While, both standard video\ncodes and their rate control modules are developed to minimize video distortion\nw.r.t. human quality assessment, preserving the downstream performance of deep\nvision models is not considered. In this paper, we present the first end-to-end\nlearnable deep video codec control considering both bandwidth constraints and\ndownstream vision performance, while not breaking existing standardization. We\ndemonstrate for two common vision tasks (semantic segmentation and optical flow\nestimation) and on two different datasets that our deep codec control better\npreserves downstream performance than using 2-pass average bit rate control\nwhile meeting dynamic bandwidth constraints and adhering to standardizations.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Tim Prangemeier","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2308.16215v3.pdf","comment":"22 pages, 26 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.03549v1","updated":"2023-09-07T08:12:58Z","published":"2023-09-07T08:12:58Z","title":"Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation","summary":" Inspired by the remarkable success of Latent Diffusion Models (LDMs) for\nimage synthesis, we study LDM for text-to-video generation, which is a\nformidable challenge due to the computational and memory constraints during\nboth model training and inference. A single LDM is usually only capable of\ngenerating a very limited number of video frames. Some existing works focus on\nseparate prediction models for generating more video frames, which suffer from\nadditional training cost and frame-level jittering, however. In this paper, we\npropose a framework called \"Reuse and Diffuse\" dubbed $\\textit{VidRD}$ to\nproduce more frames following the frames already generated by an LDM.\nConditioned on an initial video clip with a small number of frames, additional\nframes are iteratively generated by reusing the original latent features and\nfollowing the previous diffusion process. 
Besides, for the autoencoder used for\ntranslation between pixel space and latent space, we inject temporal layers\ninto its decoder and fine-tune these layers for higher temporal consistency. We\nalso propose a set of strategies for composing video-text data that involve\ndiverse content from multiple existing datasets including video datasets for\naction recognition and image-text datasets. Extensive experiments show that our\nmethod achieves good results in both quantitative and qualitative evaluations.\nOur project page is available\n$\\href{https://anonymous0x233.github.io/ReuseAndDiffuse/}{here}$.\n","authors":["Jiaxi Gu","Shicong Wang","Haoyu Zhao","Tianyi Lu","Xing Zhang","Zuxuan Wu","Songcen Xu","Wei Zhang","Yu-Gang Jiang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2309.03549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00398v2","updated":"2023-09-07T08:11:01Z","published":"2023-09-01T11:14:43Z","title":"VideoGen: A Reference-Guided Latent Diffusion Approach for High\n Definition Text-to-Video Generation","summary":" In this paper, we present VideoGen, a text-to-video generation approach,\nwhich can generate a high-definition video with high frame fidelity and strong\ntemporal consistency using reference-guided latent diffusion. We leverage an\noff-the-shelf text-to-image generation model, e.g., Stable Diffusion, to\ngenerate an image with high content quality from the text prompt, as a\nreference image to guide video generation. Then, we introduce an efficient\ncascaded latent diffusion module conditioned on both the reference image and\nthe text prompt, for generating latent video representations, followed by a\nflow-based temporal upsampling step to improve the temporal resolution.\nFinally, we map latent video representations into a high-definition video\nthrough an enhanced video decoder. During training, we use the first frame of a\nground-truth video as the reference image for training the cascaded latent\ndiffusion module. The main characterises of our approach include: the reference\nimage generated by the text-to-image model improves the visual fidelity; using\nit as the condition makes the diffusion model focus more on learning the video\ndynamics; and the video decoder is trained over unlabeled video data, thus\nbenefiting from high-quality easily-available videos. VideoGen sets a new\nstate-of-the-art in text-to-video generation in terms of both qualitative and\nquantitative evaluation. See \\url{https://videogen.github.io/VideoGen/} for\nmore samples.\n","authors":["Xin Li","Wenqing Chu","Ye Wu","Weihang Yuan","Fanglong Liu","Qi Zhang","Fu Li","Haocheng Feng","Errui Ding","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.00398v2.pdf","comment":"8pages, 8figures, project page: https://videogen.github.io/VideoGen/"},{"id":"http://arxiv.org/abs/2309.03542v1","updated":"2023-09-07T08:01:07Z","published":"2023-09-07T08:01:07Z","title":"Zero-Shot Scene Graph Generation via Triplet Calibration and Reduction","summary":" Scene Graph Generation (SGG) plays a pivotal role in downstream\nvision-language tasks. Existing SGG methods typically suffer from poor\ncompositional generalizations on unseen triplets. They are generally trained on\nincompletely annotated scene graphs that contain dominant triplets and tend to\nbias toward these seen triplets during inference. To address this issue, we\npropose a Triplet Calibration and Reduction (T-CAR) framework in this paper. 
In\nour framework, a triplet calibration loss is first presented to regularize the\nrepresentations of diverse triplets and to simultaneously excavate the unseen\ntriplets in incompletely annotated training scene graphs. Moreover, the unseen\nspace of scene graphs is usually several times larger than the seen space since\nit contains a huge number of unrealistic compositions. Thus, we propose an\nunseen space reduction loss to shift the attention of excavation to reasonable\nunseen compositions to facilitate the model training. Finally, we propose a\ncontextual encoder to improve the compositional generalizations of unseen\ntriplets by explicitly modeling the relative spatial relations between subjects\nand objects. Extensive experiments show that our approach achieves consistent\nimprovements for zero-shot SGG over state-of-the-art methods. The code is\navailable at https://github.com/jkli1998/T-CAR.\n","authors":["Jiankai Li","Yunhong Wang","Weixin Li"],"pdf_url":"https://arxiv.org/pdf/2309.03542v1.pdf","comment":"Accept in TOMM 2023"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + 
--color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + 
+.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..3f4796cf --- /dev/null +++ b/index.html @@ -0,0 +1,68660 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 43 + +
+
+
+ + ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ☆ A Function Interpretation Benchmark for Evaluating Interpretability + Methods + + +
+ Labeling neural network submodules with human-legible descriptions is useful +for many downstream tasks: such descriptions can surface failures, guide +interventions, and perhaps even explain important model behaviors. To date, +most mechanistic descriptions of trained networks have involved small models, +narrowly delimited phenomena, and large amounts of human labor. Labeling all +human-interpretable sub-computations in models of increasing size and +complexity will almost certainly require tools that can generate and validate +descriptions automatically. Recently, techniques that use learned models +in-the-loop for labeling have begun to gain traction, but methods for +evaluating their efficacy are limited and ad-hoc. How should we validate and +compare open-ended labeling tools? This paper introduces FIND (Function +INterpretation and Description), a benchmark suite for evaluating the building +blocks of automated interpretability methods. FIND contains functions that +resemble components of trained neural networks, and accompanying descriptions +of the kind we seek to generate. The functions are procedurally constructed +across textual and numeric domains, and involve a range of real-world +complexities, including noise, composition, approximation, and bias. We +evaluate new and existing methods that use language models (LMs) to produce +code-based and language descriptions of function behavior. We find that an +off-the-shelf LM augmented with only black-box access to functions can +sometimes infer their structure, acting as a scientist by forming hypotheses, +proposing experiments, and updating descriptions in light of new data. However, +LM-based descriptions tend to capture global function behavior and miss local +corruptions. These results show that FIND will be useful for characterizing the +performance of more sophisticated interpretability methods before they are +applied to real-world models. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ☆ Zero-Shot Audio Captioning via Audibility Guidance + + +
+ The task of audio captioning is similar in essence to tasks such as image and +video captioning. However, it has received much less attention. We propose +three desiderata for captioning audio -- (i) fluency of the generated text, +(ii) faithfulness of the generated text to the input audio, and the somewhat +related (iii) audibility, which is the quality of being able to be perceived +based only on audio. Our method is a zero-shot method, i.e., we do not learn to +perform captioning. Instead, captioning occurs as an inference process that +involves three networks that correspond to the three desired qualities: (i) A +Large Language Model, in our case, for reasons of convenience, GPT-2, (ii) A +model that provides a matching score between an audio file and a text, for +which we use a multimodal matching network called ImageBind, and (iii) A text +classifier, trained using a dataset we collected automatically by instructing +GPT-4 with prompts designed to direct the generation of both audible and +inaudible sentences. We present our results on the AudioCap dataset, +demonstrating that audibility guidance significantly enhances performance +compared to the baseline, which lacks this objective. + +
+
+
+
+
+ + ☆ DoLa: Decoding by Contrasting Layers Improves Factuality in Large + Language Models + + +
+ Despite their impressive capabilities, large language models (LLMs) are prone +to hallucinations, i.e., generating content that deviates from facts seen +during pretraining. We propose a simple decoding strategy for reducing +hallucinations with pretrained LLMs that does not require conditioning on +retrieved external knowledge nor additional fine-tuning. Our approach obtains +the next-token distribution by contrasting the differences in logits obtained +from projecting the later layers versus earlier layers to the vocabulary space, +exploiting the fact that factual knowledge in an LLM has generally been shown +to be localized to particular transformer layers. We find that this Decoding by +Contrasting Layers (DoLa) approach is able to better surface factual knowledge +and reduce the generation of incorrect facts. DoLa consistently improves the +truthfulness across multiple-choice tasks and open-ended generation tasks, for +example improving the performance of LLaMA family models on TruthfulQA by +12-17% absolute points, demonstrating its potential in making LLMs reliably +generate truthful facts. + +
+
+ comment: The source code is available at https://github.com/voidism/DoLa +
+
+
+
+
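To make the layer-contrast idea in the DoLa entry above concrete, here is a minimal sketch assuming a Hugging Face GPT-2 checkpoint; the choice of early layer and the greedy decoding step are illustrative only, not the authors' implementation.

```python
# Minimal sketch of layer-contrastive decoding in the spirit of the DoLa entry
# above. NOT the official implementation: the model, the fixed "premature"
# layer (index 4), and the greedy selection are all illustrative choices.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

inputs = tok("The capital of France is", return_tensors="pt")

with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)
    lm_head = model.get_output_embeddings()
    final_norm = model.transformer.ln_f  # GPT-2's final layer norm

    # Project the last-position hidden state of an early ("premature") layer
    # and of the final ("mature") layer into the vocabulary space.
    premature = lm_head(final_norm(out.hidden_states[4][:, -1, :]))
    mature = lm_head(out.hidden_states[-1][:, -1, :])  # already layer-normed

    # Contrast the two distributions. The paper additionally selects the early
    # layer dynamically and masks low-probability tokens; omitted for brevity.
    contrast = torch.log_softmax(mature, dim=-1) - torch.log_softmax(premature, dim=-1)
    next_token = contrast.argmax(dim=-1)

print(tok.decode(next_token))
```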
+ + ☆ On Large Language Models' Selection Bias in Multi-Choice Questions + + +
+ Multi-choice questions (MCQs) serve as a common yet important task format in +the research of large language models (LLMs). Our work shows that LLMs exhibit +an inherent "selection bias" in MCQs, which refers to LLMs' preferences to +select options located at specific positions (like "Option C"). This bias is +prevalent across various LLMs, making their performance vulnerable to option +position changes in MCQs. We identify that one primary cause resulting in +selection bias is option numbering, i.e., the ID symbols A/B/C/D associated +with the options. To mitigate selection bias, we propose a new method called +PriDe. PriDe first decomposes the observed model prediction distribution into +an intrinsic prediction over option contents and a prior distribution over +option IDs. It then estimates the prior by permutating option contents on a +small number of test samples, which is used to debias the subsequent test +samples. We demonstrate that, as a label-free, inference-time method, PriDe +achieves a more effective and computation-efficient debiasing than strong +baselines. We further show that the priors estimated by PriDe generalize well +across different domains, highlighting its practical potential in broader +scenarios. + +
+
+ comment: Work in progress +
+
+
+
+
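A rough numerical sketch of the prior-debiasing idea described in the entry above; the probabilities are invented for illustration, and averaging over permutations only approximates the paper's estimation procedure.

```python
# Toy sketch of prior-based debiasing of MCQ option IDs (not the PriDe code).
# All probabilities below are made up for illustration.
import numpy as np

# Observed P(option ID | question) for one question under cyclic permutations
# of the option contents. Rows: permutations; columns: option IDs A, B, C, D.
obs = np.array([
    [0.20, 0.15, 0.45, 0.20],
    [0.25, 0.10, 0.40, 0.25],
    [0.15, 0.20, 0.50, 0.15],
    [0.20, 0.20, 0.40, 0.20],
])

# Because each content appears in every position across the permutations, the
# average over permutations roughly reflects the model's prior over option IDs.
prior = obs.mean(axis=0)
prior /= prior.sum()

def debias(p_observed: np.ndarray, prior: np.ndarray) -> np.ndarray:
    """Divide out the ID prior and renormalize to get a content-driven prediction."""
    scores = p_observed / prior
    return scores / scores.sum()

p_new = np.array([0.18, 0.12, 0.48, 0.22])  # prediction on a fresh test sample
print("estimated ID prior: ", np.round(prior, 3))
print("debiased prediction:", np.round(debias(p_new, prior), 3))
```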
+ + ☆ Introducing "Forecast Utterance" for Conversational Data Science + + +
+ Envision an intelligent agent capable of assisting users in conducting +forecasting tasks through intuitive, natural conversations, without requiring +in-depth knowledge of the underlying machine learning (ML) processes. A +significant challenge for the agent in this endeavor is to accurately +comprehend the user's prediction goals and, consequently, formulate precise ML +tasks. In this paper, we take a pioneering step towards this ambitious goal by +introducing a new concept called Forecast Utterance and then focus on the +automatic and accurate interpretation of users' prediction goals from these +utterances. Specifically, we frame the task as a slot-filling problem, where +each slot corresponds to a specific aspect of the goal prediction task. We then +employ two zero-shot methods for solving the slot-filling task, namely: 1) +Entity Extraction (EE), and 2) Question-Answering (QA) techniques. Our +experiments, conducted with three meticulously crafted data sets, validate the +viability of our ambitious goal and demonstrate the effectiveness of both EE +and QA techniques in interpreting Forecast Utterances. + +
+
+
+
+
+ + ☆ OpinionGPT: Modelling Explicit Biases in Instruction-Tuned LLMs + + +
+ Instruction-tuned Large Language Models (LLMs) have recently showcased +remarkable ability to generate fitting responses to natural language +instructions. However, an open research question concerns the inherent biases +of trained models and their responses. For instance, if the data used to tune +an LLM is dominantly written by persons with a specific political bias, we +might expect generated answers to share this bias. Current research work seeks +to de-bias such models, or suppress potentially biased answers. With this +demonstration, we take a different view on biases in instruction-tuning: Rather +than aiming to suppress them, we aim to make them explicit and transparent. To +this end, we present OpinionGPT, a web demo in which users can ask questions +and select all biases they wish to investigate. The demo will answer this +question using a model fine-tuned on text representing each of the selected +biases, allowing side-by-side comparison. To train the underlying model, we +identified 11 different biases (political, geographic, gender, age) and derived +an instruction-tuning corpus in which each answer was written by members of one +of these demographics. This paper presents OpinionGPT, illustrates how we +trained the bias-aware model and showcases the web application (available at +https://opiniongpt.informatik.hu-berlin.de). + +
+
+ comment: 6 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ FLM-101B: An Open LLM and How to Train It with $100K Budget + + +
+ Large language models (LLMs) have achieved remarkable success in NLP and +multimodal tasks. Despite these successes, their development faces two main +challenges: (i) high computational cost; and (ii) difficulty in conducting fair +and objective evaluations. LLMs are prohibitively expensive, making it feasible +for only a few major players to undertake their training, thereby constraining +both research and application opportunities. This underscores the importance of +cost-effective LLM training. In this paper, we utilize a growth strategy to +significantly reduce LLM training cost. We demonstrate that an LLM with 101B +parameters and 0.31TB tokens can be trained on a $100K budget. We also adopt a +systematic evaluation paradigm for the IQ evaluation of LLMs, in complement to +existing evaluations that focus more on knowledge-oriented abilities. We +introduce our benchmark including evaluations on important aspects of +intelligence including symbolic mapping, rule understanding, pattern mining, +and anti-interference. Such evaluations minimize the potential impact of +memorization. Experimental results show that our model FLM-101B, trained with a +budget of $100K, achieves comparable performance to powerful and well-known +models, e.g., GPT-3 and GLM-130B, especially in the IQ benchmark evaluations with +contexts unseen in training data. The checkpoint of FLM-101B will be +open-sourced at https://huggingface.co/CofeAI/FLM-101B. + +
+
+
+
+
+ + ☆ Uncovering Drift in Textual Data: An Unsupervised Method for Detecting + and Mitigating Drift in Machine Learning Models + + +
+ Drift in machine learning refers to the phenomenon where the statistical +properties of data or context, in which the model operates, change over time +leading to a decrease in its performance. Therefore, maintaining a constant +monitoring process for machine learning model performance is crucial in order +to proactively prevent any potential performance regression. However, +supervised drift detection methods require human annotation and consequently +lead to a longer time to detect and mitigate the drift. In our proposed +unsupervised drift detection method, we follow a two step process. Our first +step involves encoding a sample of production data as the target distribution, +and the model training data as the reference distribution. In the second step, +we employ a kernel-based statistical test that utilizes the maximum mean +discrepancy (MMD) distance metric to compare the reference and target +distributions and estimate any potential drift. Our method also identifies the +subset of production data that is the root cause of the drift. The models +retrained using these identified high drift samples show improved performance +on online customer experience quality metrics. + +
+
+ comment: 8 pages, Accepted in 2023 Amazon Internal Machine Learning Conference +
+
+
+
+
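The kernel two-sample test mentioned in the drift-detection entry above can be illustrated with a generic (biased) MMD estimate plus a permutation test; the synthetic embeddings, sample sizes, and bandwidth below are placeholders rather than the paper's setup.

```python
# Compact sketch of an MMD-based drift check between reference and production
# embeddings (a generic kernel two-sample statistic, not the paper's code).
import numpy as np

def rbf_kernel(x: np.ndarray, y: np.ndarray, gamma: float = 0.5) -> np.ndarray:
    # Pairwise RBF kernel matrix between rows of x and rows of y.
    sq_dists = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq_dists)

def mmd2(x: np.ndarray, y: np.ndarray, gamma: float = 0.5) -> float:
    # Biased squared-MMD estimate: E[k(x,x')] + E[k(y,y')] - 2 E[k(x,y)].
    return (rbf_kernel(x, x, gamma).mean()
            + rbf_kernel(y, y, gamma).mean()
            - 2.0 * rbf_kernel(x, y, gamma).mean())

rng = np.random.default_rng(0)
reference = rng.normal(0.0, 1.0, size=(200, 16))   # stand-in for training-text embeddings
production = rng.normal(0.4, 1.0, size=(200, 16))  # stand-in for shifted production embeddings

stat = mmd2(reference, production)

# Permutation test: how extreme is the statistic under the no-drift hypothesis?
pooled = np.vstack([reference, production])
null = []
for _ in range(200):
    idx = rng.permutation(len(pooled))
    a, b = pooled[idx[:200]], pooled[idx[200:]]
    null.append(mmd2(a, b))
p_value = float((np.array(null) >= stat).mean())
print(f"MMD^2 = {stat:.4f}, permutation p-value = {p_value:.3f}")
```

In practice the two matrices would come from the same text encoder used to embed the training and production data, and the kernel bandwidth would typically be set from the data (for example, the median pairwise-distance heuristic).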
+ + ☆ USA: Universal Sentiment Analysis Model & Construction of Japanese + Sentiment Text Classification and Part of Speech Dataset + + +
+ Sentiment analysis is a pivotal task in the domain of natural language +processing. It encompasses both text-level sentiment polarity classification +and word-level Part of Speech(POS) sentiment polarity determination. Such +analysis challenges models to understand text holistically while also +extracting nuanced information. With the rise of Large Language Models(LLMs), +new avenues for sentiment analysis have opened. This paper proposes enhancing +performance by leveraging the Mutual Reinforcement Effect(MRE) between +individual words and the overall text. It delves into how word polarity +influences the overarching sentiment of a passage. To support our research, we +annotated four novel Sentiment Text Classification and Part of Speech(SCPOS) +datasets, building upon existing sentiment classification datasets. +Furthermore, we developed a Universal Sentiment Analysis(USA) model, with a +7-billion parameter size. Experimental results revealed that our model +surpassed the performance of gpt-3.5-turbo across all four datasets, +underscoring the significance of MRE in sentiment analysis. + +
+
+ comment: 10 pages, 5 figures. Model and Dataset will release soon +
+
+
+
+
+ + ☆ Enhancing Pipeline-Based Conversational Agents with Large Language + Models + + +
+ The latest advancements in AI and deep learning have led to a breakthrough in +large language model (LLM)-based agents such as GPT-4. However, many commercial +conversational agent development tools are pipeline-based and have limitations +in holding a human-like conversation. This paper investigates the capabilities +of LLMs to enhance pipeline-based conversational agents during two phases: 1) +in the design and development phase and 2) during operations. In 1) LLMs can +aid in generating training data, extracting entities and synonyms, +localization, and persona design. In 2) LLMs can assist in contextualization, +intent classification to prevent conversational breakdown and handle +out-of-scope questions, auto-correcting utterances, rephrasing responses, +formulating disambiguation questions, summarization, and enabling closed +question-answering capabilities. We conducted informal experiments with GPT-4 +in the private banking domain to demonstrate the scenarios above with a +practical example. Companies may be hesitant to replace their pipeline-based +agents with LLMs entirely due to privacy concerns and the need for deep +integration within their existing ecosystems. A hybrid approach in which LLMs +are integrated into the pipeline-based agents allows them to save time and +costs of building and running agents by capitalizing on the capabilities of +LLMs while retaining the integration and privacy safeguards of their existing +systems. + +
+
+
+
+
+ + ☆ The Daunting Dilemma with Sentence Encoders: Success on Standard + Benchmarks, Failure in Capturing Basic Semantic Properties + + +
+ In this paper, we adopted a retrospective approach to examine and compare +five existing popular sentence encoders, i.e., Sentence-BERT, Universal +Sentence Encoder (USE), LASER, InferSent, and Doc2vec, in terms of their +performance on downstream tasks versus their capability to capture basic +semantic properties. Initially, we evaluated all five sentence encoders on the +popular SentEval benchmark and found that multiple sentence encoders perform +quite well on a variety of popular downstream tasks. However, being unable to +find a single winner in all cases, we designed further experiments to gain a +deeper understanding of their behavior. Specifically, we proposed four semantic +evaluation criteria, i.e., Paraphrasing, Synonym Replacement, Antonym +Replacement, and Sentence Jumbling, and evaluated the same five sentence +encoders using these criteria. We found that the Sentence-Bert and USE models +pass the paraphrasing criterion, with SBERT being the superior between the two. +LASER dominates in the case of the synonym replacement criterion. +Interestingly, all the sentence encoders failed the antonym replacement and +jumbling criteria. These results suggest that although these popular sentence +encoders perform quite well on the SentEval benchmark, they still struggle to +capture some basic semantic properties, thus, posing a daunting dilemma in NLP +research. + +
+
+
+
+
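As a rough illustration of the four perturbation criteria described in the entry above, the sketch below compares an off-the-shelf encoder's cosine similarities under paraphrasing, synonym replacement, antonym replacement, and jumbling; the model name and example sentences are placeholders, not the paper's evaluation protocol.

```python
# Sketch of probing a sentence encoder with the four perturbation criteria
# described above; the encoder and sentences are illustrative placeholders.
import random
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

original = "The quick brown fox jumps over the lazy dog."
variants = {
    "paraphrase": "A fast brown fox leaps over a sleepy dog.",
    "synonym":    "The speedy brown fox jumps over the lazy dog.",
    "antonym":    "The quick brown fox jumps over the energetic dog.",
    "jumbled":    " ".join(random.sample(original.split(), len(original.split()))),
}

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

emb = {name: model.encode(text) for name, text in {"original": original, **variants}.items()}
for name in variants:
    # A well-behaved encoder should score paraphrase/synonym high and drop
    # noticeably for the antonym and jumbled variants.
    print(f"{name:10s} similarity = {cosine(emb['original'], emb[name]):.3f}")
```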
+ + ☆ Word segmentation granularity in Korean + + +
+ This paper describes word segmentation granularity in Korean language +processing. From a word separated by blank space, which is termed an eojeol, to +a sequence of morphemes in Korean, there are multiple possible levels of word +segmentation granularity in Korean. For specific language processing and corpus +annotation tasks, several different granularity levels have been proposed and +utilized, because the agglutinative languages including Korean language have a +one-to-one mapping between functional morpheme and syntactic category. Thus, we +analyze these different granularity levels, presenting the examples of Korean +language processing systems for future reference. Interestingly, the +granularity by separating only functional morphemes including case markers and +verbal endings, and keeping other suffixes for morphological derivation results +in the optimal performance for phrase structure parsing. This contradicts +previous best practices for Korean language processing, which has been the de +facto standard for various applications that require separating all morphemes. + +
+
+ comment: Accepted for publication in Korean Linguistics (Benjamins) +
+
+
+
+
+ + ☆ Exploring an LM to generate Prolog Predicates from Mathematics Questions + + +
+ Recently, there has been a surge in interest in NLP driven by ChatGPT. +ChatGPT, a transformer-based generative language model of substantial scale, +exhibits versatility in performing various tasks based on natural language. +Nevertheless, large language models often exhibit poor performance in solving +mathematics questions that require reasoning. Prior research has demonstrated +the effectiveness of chain-of-thought prompting in enhancing reasoning +capabilities. Now, we aim to investigate whether fine-tuning a model for the +generation of Prolog codes, a logic language, and subsequently passing these +codes to a compiler can further improve accuracy. Consequently, we employ +chain-of-thought to fine-tune LLaMA7B as a baseline model and develop other +fine-tuned LLaMA7B models for the generation of Prolog code, Prolog code + +chain-of-thought, and chain-of-thought + Prolog code, respectively. The results +reveal that the Prolog generation model surpasses the baseline in performance, +while the combination generation models do not yield significant improvements. +The Prolog corpus based on GSM8K and the correspondingly finetuned Prolog +generation model based on LLaMA7B are released to the research community. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ BNS-Net: A Dual-channel Sarcasm Detection Method Considering + Behavior-level and Sentence-level Conflicts + + +
+ Sarcasm detection is a binary classification task that aims to determine
+whether a given utterance is sarcastic. Over the past decade, sarcasm detection
+has evolved from classical pattern recognition to deep learning approaches,
+where features such as user profiles, punctuation, and sentiment words have
+been commonly employed for sarcasm detection. In real-life sarcastic
+expressions, behaviors without explicit sentimental cues often serve as
+carriers of implicit sentimental meanings. Motivated by this observation, we
+propose a dual-channel sarcasm detection model named BNS-Net. The model
+considers behavior and sentence conflicts in two channels. Channel 1: the
+Behavior-level Conflict Channel reconstructs the text based on core verbs while
+leveraging a modified attention mechanism to highlight conflict information.
+Channel 2: the Sentence-level Conflict Channel introduces external sentiment
+knowledge to segment the text into explicit and implicit sentences, capturing
+conflicts between them. To validate the effectiveness of BNS-Net, several
+comparative and ablation experiments are conducted on three public sarcasm
+datasets. The analysis and evaluation of experimental results demonstrate that
+BNS-Net effectively identifies sarcasm in text and achieves state-of-the-art
+performance.
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Evaluating ChatGPT as a Recommender System: A Rigorous Approach + + +
+ Large AI language models have recently gained popularity due to their
+impressive natural language capabilities. They contribute significantly to
+language-related tasks, including prompt-based learning, which makes them
+valuable for various specific tasks and enhances their precision and
+generalization. Research communities are actively exploring their applications,
+with ChatGPT receiving particular recognition. Despite extensive research on
+large language models, their potential in recommendation scenarios remains
+largely unexplored. This study aims to fill this gap by investigating ChatGPT's
+capabilities as a zero-shot recommender system. Our goals include evaluating
+its ability to use user preferences for recommendations, reordering existing
+recommendation lists, leveraging information from similar users, and handling
+cold-start situations. We assess ChatGPT's performance through comprehensive
+experiments using three datasets (MovieLens Small, Last.FM, and Facebook Book).
+We compare ChatGPT's performance against standard recommendation algorithms and
+other large language models, such as GPT-3.5 and PaLM-2. To measure
+recommendation effectiveness, we employ widely used evaluation metrics like
+Mean Average Precision (MAP), Recall, Precision, F1, normalized Discounted
+Cumulative Gain (nDCG), Item Coverage, Expected Popularity Complement (EPC),
+Average Coverage of Long Tail (ACLT), Average Recommendation Popularity (ARP),
+and Popularity-based Ranking-based Equal Opportunity (PopREO). Through
+thoroughly exploring ChatGPT's abilities in recommender systems, our study aims
+to contribute to the growing body of research on the versatility and potential
+applications of large language models. Our experiment code is available on the
+GitHub repository: https://github.com/sisinflab/Recommender-ChatGPT
+
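+ For reference, a minimal sketch of two of the ranking metrics listed above
+ (Precision@k and nDCG@k with binary relevance), independent of the paper's
+ exact implementation:
+
+ import numpy as np
+
+ def precision_at_k(recommended, relevant, k):
+     """Fraction of the top-k recommended items that are relevant."""
+     hits = sum(1 for item in recommended[:k] if item in relevant)
+     return hits / k
+
+ def ndcg_at_k(recommended, relevant, k):
+     """Normalized Discounted Cumulative Gain with binary relevance."""
+     dcg = sum(1.0 / np.log2(i + 2)
+               for i, item in enumerate(recommended[:k]) if item in relevant)
+     ideal = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), k)))
+     return dcg / ideal if ideal > 0 else 0.0
+
+ print(precision_at_k(["a", "b", "c"], {"a", "c"}, k=3))  # 2 hits out of 3
+ print(ndcg_at_k(["a", "b", "c"], {"a", "c"}, k=3))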
+
+
+
+
+ + ☆ Loquacity and Visible Emotion: ChatGPT as a Policy Advisor + + +
+ ChatGPT, a software seeking to simulate human conversational abilities, is +attracting increasing attention. It is sometimes portrayed as a groundbreaking +productivity aid, including for creative work. In this paper, we run an +experiment to assess its potential in complex writing tasks. We ask the +software to compose a policy brief for the Board of the Bank of Italy. We find +that ChatGPT can accelerate workflows by providing well-structured content +suggestions, and by producing extensive, linguistically correct text in a +matter of seconds. It does, however, require a significant amount of expert +supervision, which partially offsets productivity gains. If the app is used +naively, output can be incorrect, superficial, or irrelevant. Superficiality is +an especially problematic limitation in the context of policy advice intended +for high-level audiences. + +
+
+ comment: 33 pages +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Supervised Learning vs Large Language Models + for Identifying Cognitive Distortions and Suicidal Risks in Chinese Social + Media + + +
+ Large language models, particularly those akin to the rapidly progressing GPT +series, are gaining traction for their expansive influence. While there is keen +interest in their applicability within medical domains such as psychology, +tangible explorations on real-world data remain scant. Concurrently, users on +social media platforms are increasingly vocalizing personal sentiments; under +specific thematic umbrellas, these sentiments often manifest as negative +emotions, sometimes escalating to suicidal inclinations. Timely discernment of +such cognitive distortions and suicidal risks is crucial to effectively +intervene and potentially avert dire circumstances. Our study ventured into +this realm by experimenting on two pivotal tasks: suicidal risk and cognitive +distortion identification on Chinese social media platforms. Using supervised +learning as a baseline, we examined and contrasted the efficacy of large +language models via three distinct strategies: zero-shot, few-shot, and +fine-tuning. Our findings revealed a discernible performance gap between the +large language models and traditional supervised learning approaches, primarily +attributed to the models' inability to fully grasp subtle categories. Notably, +while GPT-4 outperforms its counterparts in multiple scenarios, GPT-3.5 shows +significant enhancement in suicide risk classification after fine-tuning. To +our knowledge, this investigation stands as the maiden attempt at gauging large +language models on Chinese social media tasks. This study underscores the +forward-looking and transformative implications of using large language models +in the field of psychology. It lays the groundwork for future applications in +psychological research and practice. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ All Labels Together: Low-shot Intent Detection with an Efficient Label + Semantic Encoding Paradigm AACL 2023 + + +
+ In intent detection tasks, leveraging meaningful semantic information from
+intent labels can be particularly beneficial for few-shot scenarios. However,
+existing few-shot intent detection methods either ignore the intent labels
+(e.g., treating intents as indices) or do not fully utilize this information
+(e.g., using only part of the intent labels). In this work, we present an
+end-to-end One-to-All system that enables the comparison of an input utterance
+with all label candidates. In this way, the system can fully utilize label
+semantics. Experiments on three few-shot intent detection tasks demonstrate
+that One-to-All is especially effective when the training resource is extremely
+scarce, achieving state-of-the-art performance in 1-, 3- and 5-shot settings.
+Moreover, we present a novel pretraining strategy for our model that utilizes
+indirect supervision from paraphrasing, enabling zero-shot cross-domain
+generalization on intent detection tasks. Our code is at
+https://github.com/jiangshdd/AllLablesTogethe.
+
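+ A conceptual sketch of scoring an utterance against every intent-label
+ description, in the spirit of the comparison described above; the encoder and
+ label texts are placeholders, and the paper's One-to-All system performs this
+ comparison end to end rather than with off-the-shelf embeddings.
+
+ from sentence_transformers import SentenceTransformer, util  # assumed dependency
+
+ encoder = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder encoder
+
+ intent_labels = ["book a flight", "check account balance", "cancel subscription"]
+ utterance = "I want to stop my monthly plan"
+
+ # Score the utterance against every label candidate and pick the best match.
+ label_emb = encoder.encode(intent_labels, convert_to_tensor=True)
+ utt_emb = encoder.encode(utterance, convert_to_tensor=True)
+ scores = util.cos_sim(utt_emb, label_emb)[0]
+ print(intent_labels[int(scores.argmax())])  # expected: "cancel subscription"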
+
+ comment: Accepted by IJCNLP-AACL 2023 +
+
+
+
+
+ + ☆ An Anchor Learning Approach for Citation Field Learning + + +
+ Citation field learning aims to segment a citation string into fields of
+interest such as author, title, and venue. Extracting such fields from
+citations is crucial for citation indexing, researcher profile analysis, etc.
+User-generated resources such as academic homepages and curricula vitae provide
+rich citation field information. However, extracting fields from these
+resources is challenging due to inconsistent citation styles, incomplete
+sentence syntax, and insufficient training data. To address these challenges,
+we propose a novel algorithm, CIFAL (citation field learning by anchor
+learning), to boost the citation field learning performance. CIFAL leverages
+anchor learning, which is agnostic to the choice of pre-trained language model,
+to capture citation patterns from data in different citation styles. The
+experiments demonstrate that CIFAL outperforms state-of-the-art methods in
+citation field learning, achieving a 2.83% improvement in field-level
+F1-scores. Extensive analysis of the results further confirms the effectiveness
+of CIFAL quantitatively and qualitatively.
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ Machine Learning for Tangible Effects: Natural Language Processing for + Uncovering the Illicit Massage Industry & Computer Vision for Tactile Sensing + + +
+ I explore two questions in this thesis: how can computer science be used to
+fight human trafficking? And how can computer vision create a sense of touch?
+ I use natural language processing (NLP) to monitor the United States illicit
+massage industry (IMI), a multi-billion dollar industry that offers not just
+therapeutic massages but also commercial sexual services. Employees of this
+industry are often immigrant women with few job opportunities, leaving them
+vulnerable to fraud, coercion, and other facets of human trafficking.
+Monitoring spatiotemporal trends helps prevent trafficking in the IMI. Using
+datasets created from three publicly accessible websites (Google Places,
+Rubmaps, and AMPReviews), combined with NLP techniques such as bag-of-words and
+Word2Vec, I show how to derive insights into the labor pressures and language
+barriers that employees face, as well as the income, demographics, and societal
+pressures affecting sex buyers. I include a call-to-action to other researchers
+given these datasets. I also consider how to create synthetic financial data,
+which can aid counter-trafficking efforts in the banking sector. I use an
+agent-based model to create both tabular and payee-recipient graph data.
+ I then consider the role of computer vision in making tactile sensors. I
+report on a novel sensor, the Digger Finger, that adapts the Gelsight sensor to
+finding objects in granular media. Changes include using a wedge shape to
+facilitate digging, replacing the internal lighting LEDs with fluorescent
+paint, and adding a vibrator motor to counteract jamming. Finally, I also show
+how to use a webcam and a printed reference marker, or fiducial, to create a
+low-cost six-axis force-torque sensor. This sensor is up to a hundred times
+less expensive than commercial sensors, allowing for a wider range of
+applications. For this and earlier chapters I release design files and code as
+open source.
+
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ XGen-7B Technical Report + + +
+ Large Language Models (LLMs) have become ubiquitous across various domains,
+transforming the way we interact with information and conduct research.
+However, most high-performing LLMs remain confined behind proprietary walls,
+hindering scientific progress. Most open-source LLMs, on the other hand, are
+limited in their ability to support longer sequence lengths, which is a key
+requirement for many tasks that require inference over a long input context. To
+address this, we have trained XGen, a series of 7B parameter models on up to 8K
+sequence length for up to 1.5T tokens. We have also finetuned the XGen models
+on public-domain instructional data, creating their instruction-tuned
+counterparts (XGen-Inst). We open-source our models for both research
+advancements and commercial applications. Our evaluation on standard benchmarks
+shows that XGen models achieve comparable or better results when compared with
+state-of-the-art open-source LLMs. Our targeted evaluation on long sequence
+modeling tasks shows the benefits of our 8K-sequence models over 2K-sequence
+open-source LLMs.
+
+
+
+
+
+ + ☆ Improving Open Information Extraction with Large Language Models: A + Study on Demonstration Uncertainty + + +
+ The Open Information Extraction (OIE) task aims to extract structured facts
+from unstructured text, typically in the form of (subject, relation, object)
+triples. Despite the potential of large language models (LLMs) like ChatGPT as
+a general task solver, they lag behind state-of-the-art (supervised) methods in
+OIE tasks due to two key issues. First, LLMs struggle to distinguish irrelevant
+context from relevant relations and generate structured output due to the
+restrictions on fine-tuning the model. Second, LLMs generate responses
+autoregressively based on probability, which leaves the predicted relations
+without confidence estimates. In this paper, we assess the capabilities of LLMs
+in improving the OIE task. In particular, we propose various in-context
+learning strategies to enhance LLMs' instruction-following ability and a
+demonstration uncertainty quantification module to enhance the confidence of
+the generated relations. Our experiments on three OIE benchmark datasets show
+that our approach holds its own against established supervised methods, both
+quantitatively and qualitatively.
+
+
+
+
+
+ + ☆ From Base to Conversational: Japanese Instruction Dataset and Tuning + Large Language Models + + +
+ Instruction tuning is essential for large language models (LLMs) to become
+interactive. While many instruction-tuning datasets exist in English, there is
+a noticeable lack of such datasets in other languages, and their effectiveness
+has not been well verified in non-English languages. We construct a Japanese
+instruction dataset by expanding and filtering existing datasets and apply the
+dataset to a Japanese pre-trained base model. We performed Low-Rank Adaptation
+(LoRA) tuning on existing Japanese and English models using our instruction
+dataset. We evaluated these models from both quantitative and qualitative
+perspectives. The results confirm the effectiveness of Japanese instruction
+datasets and indicate that, even for relatively small LLMs, performance on
+downstream tasks can be improved through instruction tuning. Our instruction
+dataset, tuned models, and implementation are publicly available online.
+
+
+
+
+
+ + ☆ Large Language Models as Optimizers + + +
+ Optimization is ubiquitous. While derivative-based algorithms have been +powerful tools for various problems, the absence of gradient imposes challenges +on many real-world applications. In this work, we propose Optimization by +PROmpting (OPRO), a simple and effective approach to leverage large language +models (LLMs) as optimizers, where the optimization task is described in +natural language. In each optimization step, the LLM generates new solutions +from the prompt that contains previously generated solutions with their values, +then the new solutions are evaluated and added to the prompt for the next +optimization step. We first showcase OPRO on linear regression and traveling +salesman problems, then move on to prompt optimization where the goal is to +find instructions that maximize the task accuracy. With a variety of LLMs, we +demonstrate that the best prompts optimized by OPRO outperform human-designed +prompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. + +
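+ A minimal sketch of the optimization loop described above, with llm and
+ evaluate as placeholder functions (an illustration of the idea, not the
+ authors' implementation):
+
+ def opro_step(llm, evaluate, history, num_candidates=4):
+     """One OPRO-style step: show the LLM previously scored solutions,
+     sample new candidates, score them, and extend the history."""
+     scored = "\n".join(f"solution: {s}  score: {v:.3f}"
+                        for s, v in sorted(history, key=lambda x: x[1]))
+     prompt = ("Below are previous solutions with their scores (higher is better).\n"
+               f"{scored}\n"
+               "Propose a new solution that achieves a higher score.")
+     for _ in range(num_candidates):
+         candidate = llm(prompt)                    # text completion call
+         history.append((candidate, evaluate(candidate)))
+     return history
+
+ # Usage: repeat opro_step until the budget is exhausted, then keep the best entry.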
+
+
+
+
+ + ♻ ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", transformer architecture +has led to revolutionary advancements in NLP. The attention layer within the +transformer admits a sequence of input tokens $X$ and makes them interact +through pairwise similarities computed as softmax$(XQK^\top X^\top)$, where +$(K,Q)$ are the trainable key-query parameters. In this work, we establish a +formal equivalence between the optimization geometry of self-attention and a +hard-margin SVM problem that separates optimal input tokens from non-optimal +tokens using linear constraints on the outer-products of token pairs. This +formalism allows us to characterize the implicit bias of 1-layer transformers +optimized with gradient descent: (1) Optimizing the attention layer with +vanishing regularization, parameterized by $(K,Q)$, converges in direction to +an SVM solution minimizing the nuclear norm of the combined parameter +$W=KQ^\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm +objective. We characterize this convergence, highlighting that it can occur +toward locally-optimal directions rather than global ones. (2) Complementing +this, we prove the local/global directional convergence of gradient descent +under suitable geometric conditions. Importantly, we show that +over-parameterization catalyzes global convergence by ensuring the feasibility +of the SVM problem and by guaranteeing a benign optimization landscape devoid +of stationary points. (3) While our theory applies primarily to linear +prediction heads, we propose a more general SVM equivalence that predicts the +implicit bias with nonlinear heads. Our findings are applicable to arbitrary +datasets and their validity is verified via experiments. We also introduce +several open problems and research directions. We believe these findings +inspire the interpretation of transformers as a hierarchy of SVMs that +separates and selects optimal tokens. + +
+
+ comment: minor edits and update global convergence figure +
+
+
+
+
+ + ♻ ☆ Max-Margin Token Selection in Attention Mechanism + + +
+ Attention mechanism is a central component of the transformer architecture +which led to the phenomenal success of large language models. However, the +theoretical principles underlying the attention mechanism are poorly +understood, especially its nonconvex optimization dynamics. In this work, we +explore the seminal softmax-attention model $f(\boldsymbol{X})=\langle +\boldsymbol{Xv}, \texttt{softmax}(\boldsymbol{XWp})\rangle$, where +$\boldsymbol{X}$ is the token sequence and +$(\boldsymbol{v},\boldsymbol{W},\boldsymbol{p})$ are trainable parameters. We +prove that running gradient descent on $\boldsymbol{p}$, or equivalently +$\boldsymbol{W}$, converges in direction to a max-margin solution that +separates $\textit{locally-optimal}$ tokens from non-optimal ones. This clearly +formalizes attention as an optimal token selection mechanism. Remarkably, our +results are applicable to general data and precisely characterize +$\textit{optimality}$ of tokens in terms of the value embeddings +$\boldsymbol{Xv}$ and problem geometry. We also provide a broader +regularization path analysis that establishes the margin maximizing nature of +attention even for nonlinear prediction heads. When optimizing $\boldsymbol{v}$ +and $\boldsymbol{p}$ simultaneously with logistic loss, we identify conditions +under which the regularization paths directionally converge to their respective +hard-margin SVM solutions where $\boldsymbol{v}$ separates the input features +based on their labels. Interestingly, the SVM formulation of $\boldsymbol{p}$ +is influenced by the support vector geometry of $\boldsymbol{v}$. Finally, we +verify our theoretical findings via numerical experiments and provide insights. + +
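+ For concreteness, the softmax-attention model f(X) = <Xv, softmax(XWp)>
+ analyzed above, written out with NumPy; the shapes and random inputs are
+ chosen only for illustration.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ T, d = 5, 8                       # sequence length, embedding dimension
+ X = rng.normal(size=(T, d))       # token sequence
+ v = rng.normal(size=d)            # value / prediction-head weights
+ W = rng.normal(size=(d, d))       # key-query weights
+ p = rng.normal(size=d)            # attention probe vector
+
+ def softmax(z):
+     e = np.exp(z - z.max())
+     return e / e.sum()
+
+ attn = softmax(X @ W @ p)         # attention weights over the T tokens
+ f_X = float((X @ v) @ attn)       # scalar output <Xv, softmax(XWp)>
+ print(attn, f_X)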
+
+ comment: minor edits and update convergence analysis figure +
+
+
+
+
+ + ♻ ☆ LM-Infinite: Simple On-the-Fly Length Generalization for Large Language + Models + + +
+ In recent years, there have been remarkable advancements in the performance
+of Transformer-based Large Language Models (LLMs) across various domains. As
+these LLMs are deployed for increasingly complex tasks, they often face the
+need to conduct longer reasoning processes or understand larger contexts. In
+these situations, the length generalization failure of LLMs on long sequences
+becomes more prominent. Most pre-training schemes truncate training sequences
+to a fixed length; beyond such lengths, LLMs often struggle to generate fluent
+and coherent text, let alone carry out downstream tasks, even with relative
+positional encodings designed to cope with this problem. Common solutions such
+as finetuning on longer corpora often involve daunting hardware and time costs
+and require careful training process design. To more efficiently leverage the
+generation capacity of existing LLMs, we theoretically and empirically
+investigate the main out-of-distribution (OOD) factors contributing to this
+problem. Inspired by this diagnosis, we propose a simple yet effective solution
+for on-the-fly length generalization, LM-Infinite. It involves only a
+$\Lambda$-shaped attention mask (to avoid excessive attended tokens) and a
+distance limit (to avoid unseen distances) while requiring no parameter updates
+or learning. We find it applicable to a variety of LLMs using relative-position
+encoding methods. LM-Infinite is computationally efficient with $O(n)$ time and
+space, and demonstrates consistent text generation fluency and quality on
+sequences as long as 32k tokens on the ArXiv and OpenWebText2 datasets, with
+2.72x decoding speedup. On downstream tasks such as passkey retrieval, it
+continues to work on inputs much longer than training lengths where vanilla
+models fail immediately.
+
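+ A rough sketch of the two ingredients named above, a Lambda-shaped attention
+ mask and a distance limit; the branch sizes and training length below are
+ illustrative hyperparameters, not the paper's exact values.
+
+ import torch
+
+ def lambda_shaped_mask(seq_len, n_global=64, n_local=2048):
+     """Boolean mask: position i may attend to position j (j <= i) only if j is
+     among the first n_global tokens or within the last n_local positions."""
+     i = torch.arange(seq_len).unsqueeze(1)   # query positions
+     j = torch.arange(seq_len).unsqueeze(0)   # key positions
+     causal = j <= i
+     keep = (j < n_global) | ((i - j) < n_local)
+     return causal & keep
+
+ def limited_distance(i, j, max_distance=2048):
+     """Clip relative offsets so the model never sees distances unseen in training."""
+     return min(i - j, max_distance)
+
+ print(lambda_shaped_mask(10, n_global=2, n_local=4).int())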
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ BigText-QA: Question Answering over a Large-Scale Hybrid Knowledge Graph + + +
+ Answering complex questions over textual resources remains a challenge,
+particularly when dealing with nuanced relationships between multiple entities
+expressed within natural-language sentences. To this end, curated knowledge
+bases (KBs) like YAGO, DBpedia, Freebase, and Wikidata have been widely used
+and gained great acceptance for question-answering (QA) applications in the
+past decade. While these KBs offer a structured knowledge representation, they
+lack the contextual diversity found in natural-language sources. To address
+this limitation, BigText-QA introduces an integrated QA approach, which is able
+to answer questions based on a more redundant form of a knowledge graph (KG)
+that organizes both structured and unstructured (i.e., "hybrid") knowledge in a
+unified graphical representation. Thereby, BigText-QA is able to combine the
+best of both worlds: a canonical set of named entities, mapped to a structured
+background KB (such as YAGO or Wikidata), as well as an open set of textual
+clauses providing highly diversified relational paraphrases with rich context
+information. Our experimental results demonstrate that BigText-QA outperforms
+DrQA, a neural-network-based QA system, and achieves competitive results to
+QUEST, a graph-based unsupervised QA system.
+
+
+
+
+
+ + ♻ ☆ ToolAlpaca: Generalized Tool Learning for Language Models with 3000 + Simulated Cases + + +
+ Enabling large language models to utilize real-world tools effectively is +crucial for achieving embodied intelligence. Existing approaches to tool +learning have either primarily relied on extremely large language models, such +as GPT-4, to attain generalized tool-use abilities in a zero-shot manner, or +utilized supervised learning to train limited scopes of tools on compact +models. However, it remains uncertain whether smaller language models can +achieve generalized tool-use abilities without tool-specific training. To +address this question, this paper introduces ToolAlpaca, a novel framework +designed to automatically generate a diverse tool-use corpus and learn +generalized tool-use abilities on compact language models with minimal human +intervention. Specifically, ToolAlpaca first automatically creates a highly +diversified tool-use corpus by building a multi-agent simulation environment. +The corpus contains 3938 tool-use instances from more than 400 real-world tool +APIs spanning 50 distinct categories. Subsequently, the constructed corpus is +employed to fine-tune compact language models, resulting in two models, namely +ToolAlpaca-7B and ToolAlpaca-13B, respectively. Finally, we evaluate the +ability of these models to utilize previously unseen tools without specific +training. Experimental results demonstrate that ToolAlpaca achieves effective +generalized tool-use capabilities comparable to those of extremely large +language models like GPT-3.5, demonstrating that learning generalized tool-use +ability is feasible for compact language models. + +
+
+
+
+
+ + ♻ ☆ Structured Chain-of-Thought Prompting for Code Generation + + +
+ Large Language Models (LLMs) (e.g., ChatGPT) have shown impressive
+performance in code generation. LLMs take prompts as inputs, and
+Chain-of-Thought (CoT) prompting is the state-of-the-art prompting technique.
+CoT prompting asks LLMs first to generate CoTs (i.e., intermediate natural
+language reasoning steps) and then output the code. However, CoT prompting is
+designed for natural language generation and has low accuracy in code
+generation.
+ In this paper, we propose Structured CoTs (SCoTs) and present a novel
+prompting technique for code generation, named SCoT prompting. Our motivation
+is that source code contains rich structural information and any code can be
+composed of three program structures (i.e., sequence, branch, and loop
+structures). Intuitively, structured intermediate reasoning steps make for
+structured source code. Thus, we ask LLMs to use program structures to build
+CoTs, obtaining SCoTs. Then, LLMs generate the final code based on SCoTs.
+Compared to CoT prompting, SCoT prompting explicitly constrains LLMs to think
+about how to solve requirements from the perspective of source code, further
+improving the performance of LLMs in code generation. We apply SCoT prompting
+to two LLMs (i.e., ChatGPT and Codex) and evaluate it on three benchmarks
+(i.e., HumanEval, MBPP, and MBCPP). (1) SCoT prompting outperforms the
+state-of-the-art baseline, CoT prompting, by up to 13.79% in Pass@1. (2) Human
+evaluation shows human developers prefer programs from SCoT prompting. (3) SCoT
+prompting is robust to the choice of examples and achieves substantial
+improvements.
+
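+ An illustrative example (not taken from the paper) of what a structured
+ chain-of-thought built from sequence, branch, and loop structures might look
+ like for a simple requirement, before the LLM writes the final code:
+
+ requirement = "Return the largest even number in a list, or None if there is none."
+
+ # A hand-written structured chain-of-thought using the three program structures:
+ scot = """
+ Input: a list of integers nums
+ Output: the largest even number, or None
+ 1. sequence: initialize best = None
+ 2. loop: for each n in nums
+ 3.     branch: if n is even and (best is None or n > best)
+ 4.         sequence: best = n
+ 5. sequence: return best
+ """
+
+ prompt = f"Requirement: {requirement}\nStructured plan:\n{scot}\nNow implement it in Python."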
+
+ comment: arXiv admin note: text overlap with arXiv:2303.17780 +
+
+
+
+
+ + ♻ ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many
+existing studies have achieved impressive performance in monolingual code clone
+detection. However, during software development, more and more developers write
+semantically equivalent programs with different languages to support different
+platforms and help developers translate projects from one language to another.
+Considering that collecting cross-language parallel data, especially for
+low-resource languages, is expensive and time-consuming, how to design an
+effective cross-language model that does not rely on any parallel data is a
+significant problem. In this paper, we propose a novel method named ZC3 for
+Zero-shot Cross-language Code Clone detection. ZC3 designs the contrastive
+snippet prediction to form an isomorphic representation space among different
+programming languages. Based on this, ZC3 exploits domain-aware learning and
+cycle consistency learning to further constrain the model to generate
+representations that are aligned across different languages while remaining
+discriminative for different types of clones. To evaluate our approach, we
+conduct extensive experiments on four representative cross-language clone
+detection datasets. Experimental results show that ZC3 outperforms the
+state-of-the-art baselines by 67.12%, 51.39%, 14.85%, and 53.01% in MAP score
+on the four datasets, respectively. We further investigate the representational
+distribution of different languages and discuss the effectiveness of our
+method.
+
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ♻ ☆ EditSum: A Retrieve-and-Edit Framework for Source Code Summarization + + +
+ Existing studies show that code summaries help developers understand and +maintain source code. Unfortunately, these summaries are often missing or +outdated in software projects. Code summarization aims to generate natural +language descriptions automatically for source code. Code summaries are highly +structured and have repetitive patterns. Besides the patternized words, a code +summary also contains important keywords, which are the key to reflecting the +functionality of the code. However, the state-of-the-art approaches perform +poorly on predicting the keywords, which leads to the generated summaries +suffering a loss in informativeness. To alleviate this problem, this paper +proposes a novel retrieve-and-edit approach named EditSum for code +summarization. Specifically, EditSum first retrieves a similar code snippet +from a pre-defined corpus and treats its summary as a prototype summary to +learn the pattern. Then, EditSum edits the prototype automatically to combine +the pattern in the prototype with the semantic information of input code. Our +motivation is that the retrieved prototype provides a good start-point for +post-generation because the summaries of similar code snippets often have the +same pattern. The post-editing process further reuses the patternized words in +the prototype and generates keywords based on the semantic information of input +code. We conduct experiments on a large-scale Java corpus and experimental +results demonstrate that EditSum outperforms the state-of-the-art approaches by +a substantial margin. The human evaluation also proves the summaries generated +by EditSum are more informative and useful. We also verify that EditSum +performs well on predicting the patternized words and keywords. + +
+
+ comment: Accepted by the 36th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2021) +
+
+
+
+
+ + ♻ ☆ Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection + + +
+ Stance detection aims to identify the attitude expressed in a document +towards a given target. Techniques such as Chain-of-Thought (CoT) prompting +have advanced this task, enhancing a model's reasoning capabilities through the +derivation of intermediate rationales. However, CoT relies primarily on a +model's pre-trained internal knowledge during reasoning, thereby neglecting the +valuable external information that is previously unknown to the model. This +omission, especially within the unsupervised reasoning process, can affect the +model's overall performance. Moreover, while CoT enhances Large Language Models +(LLMs), smaller LMs, though efficient operationally, face challenges in +delivering nuanced reasoning. In response to these identified gaps, we +introduce the Ladder-of-Thought (LoT) for the stance detection task. +Constructed through a dual-phase Progressive Optimization Framework, LoT +directs the small LMs to assimilate high-quality external knowledge, refining +the intermediate rationales produced. These bolstered rationales subsequently +serve as the foundation for more precise predictions - akin to how a ladder +facilitates reaching elevated goals. LoT achieves a balance between efficiency +and performance. Our empirical evaluations underscore LoT's efficacy, marking a +16% improvement over GPT-3.5 and a 10% enhancement compared to GPT-3.5 with CoT +on stance detection task. + +
+
+ comment: 5 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Claim Optimization in Computational Argumentation + + +
+ An optimal delivery of arguments is key to persuasion in any debate, both for +humans and for AI systems. This requires the use of clear and fluent claims +relevant to the given debate. Prior work has studied the automatic assessment +of argument quality extensively. Yet, no approach actually improves the quality +so far. To fill this gap, this paper proposes the task of claim optimization: +to rewrite argumentative claims in order to optimize their delivery. As +multiple types of optimization are possible, we approach this task by first +generating a diverse set of candidate claims using a large language model, such +as BART, taking into account contextual information. Then, the best candidate +is selected using various quality metrics. In automatic and human evaluation on +an English-language corpus, our quality-based candidate selection outperforms +several baselines, improving 60% of all claims (worsening 16% only). Follow-up +analyses reveal that, beyond copy editing, our approach often specifies claims +with details, whereas it adds less evidence than humans do. Moreover, its +capabilities generalize well to other domains, such as instructional texts. + +
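+ A minimal sketch of the generate-then-select scheme described above, with
+ generate_candidates and quality_score as placeholders for the paper's
+ LLM-based candidate generator and quality metrics:
+
+ def optimize_claim(claim, context, generate_candidates, quality_score, k=10):
+     """Generate k rewritten candidates for a claim and keep the highest-quality
+     one, falling back to the original claim if no candidate improves on it."""
+     candidates = generate_candidates(claim, context, num_return_sequences=k)
+     return max(list(candidates) + [claim], key=quality_score)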
+
+ comment: Accepted as a long paper at INLG 2023 +
+
+
+
+
+ + ♻ ☆ Layout and Task Aware Instruction Prompt for Zero-shot Document Image + Question Answering + + +
+ Layout-aware pre-trained models have achieved significant progress on
+document image question answering. They introduce extra learnable modules into
+existing language models to capture layout information within document images
+from text bounding box coordinates obtained by OCR tools. However, these extra
+modules necessitate pre-training on extensive document images. This prevents
+these methods from directly utilizing off-the-shelf instruction-tuning language
+foundation models, which have recently shown promising potential in zero-shot
+learning. Instead, in this paper, we find that instruction-tuning language
+models like Claude and ChatGPT can understand layout expressed through spaces
+and line breaks. Based on this observation, we propose the LAyout and Task
+aware Instruction Prompt (LATIN-Prompt), which consists of layout-aware
+document content and task-aware instruction. Specifically, the former uses
+appropriate spaces and line breaks to recover the layout information among text
+segments obtained by OCR tools, and the latter ensures that generated answers
+adhere to formatting requirements. Moreover, we propose the LAyout and Task
+aware Instruction Tuning (LATIN-Tuning) to improve the performance of small
+instruction-tuning models like Alpaca. Experimental results show that
+LATIN-Prompt enables the zero-shot performance of Claude and ChatGPT to be
+comparable to the fine-tuning performance of SOTAs on document image question
+answering, and LATIN-Tuning enhances the zero-shot performance of Alpaca
+significantly. For example, LATIN-Prompt improves the performance of Claude and
+ChatGPT on DocVQA by 263% and 20%, respectively. LATIN-Tuning improves the
+performance of Alpaca on DocVQA by 87.7%. Quantitative and qualitative analyses
+demonstrate the effectiveness of LATIN-Prompt and LATIN-Tuning. We provide the
+code in the supplementary material and will release it to facilitate future
+research.
+
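+ A rough sketch of the layout-recovery idea described above: OCR words with
+ bounding boxes are grouped into lines and separated by spaces proportional to
+ their horizontal gaps. The grouping tolerance and character width below are
+ illustrative assumptions, not values from the paper.
+
+ def layout_text(ocr_words, line_tol=10, char_width=8):
+     """ocr_words: list of (text, x0, y0, x1, y1) in pixels.
+     Returns a single string that approximates the page layout."""
+     lines = {}
+     for text, x0, y0, x1, y1 in sorted(ocr_words, key=lambda w: (w[2], w[1])):
+         key = round(y0 / line_tol)               # bucket words into rough text lines
+         lines.setdefault(key, []).append((x0, x1, text))
+     rendered = []
+     for key in sorted(lines):
+         cursor, parts = 0, []
+         for x0, x1, text in sorted(lines[key]):
+             gap = max(1, int((x0 - cursor) / char_width))  # spaces ~ horizontal gap
+             parts.append(" " * gap + text)
+             cursor = x1
+         rendered.append("".join(parts))
+     return "\n".join(rendered)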
+
+ comment: Add the LATIN-Tuning for Alpaca. Code is available at
+ https://github.com/WenjinW/LATIN-Prompt
+
+
+
+
+
+ + ♻ ☆ A Survey on Large Language Model based Autonomous Agents + + +
+ Autonomous agents have long been a prominent research focus in both academic
+and industry communities. Previous research in this field often focuses on
+training agents with limited knowledge within isolated environments, which
+diverges significantly from human learning processes and thus makes it hard for
+the agents to achieve human-like decisions. Recently, through the acquisition
+of vast amounts of web knowledge, large language models (LLMs) have
+demonstrated remarkable potential in achieving human-level intelligence. This
+has sparked an upsurge in studies investigating LLM-based autonomous agents. In
+this paper, we present a comprehensive survey of these studies, delivering a
+systematic review of the field of LLM-based autonomous agents from a holistic
+perspective. More specifically, we first discuss the construction of LLM-based
+autonomous agents, for which we propose a unified framework that encompasses a
+majority of the previous work. Then, we present a comprehensive overview of the
+diverse applications of LLM-based autonomous agents in the fields of social
+science, natural science, and engineering. Finally, we delve into the
+evaluation strategies commonly used for LLM-based autonomous agents. Based on
+the previous studies, we also present several challenges and future directions
+in this field. To keep track of this field and continuously update our survey,
+we maintain a repository of relevant references at
+https://github.com/Paitesanshi/LLM-Agent-Survey.
+
+
+ comment: 35 pages, 5 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Halo: Estimation and Reduction of Hallucinations in Open-Source Weak + Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized Natural Language Processing +(NLP). Although convenient for research and practical applications, open-source +LLMs with fewer parameters often suffer from severe hallucinations compared to +their larger counterparts. This paper focuses on measuring and reducing +hallucinations in BLOOM 7B, a representative of such weaker open-source LLMs +that are publicly available for research and commercial applications. We +introduce HaloCheck, a lightweight BlackBox knowledge-free framework designed +to quantify the severity of hallucinations in LLMs. Additionally, we explore +techniques like knowledge injection and teacher-student approaches to alleviate +hallucinations in low-parameter LLMs. Our experiments effectively demonstrate +the reduction of hallucinations in challenging domains for these LLMs. + +
+
+
+
+
+ + ♻ ☆ Prompting Multilingual Large Language Models to Generate Code-Mixed + Texts: The Case of South East Asian Languages + + +
+ While code-mixing is a common linguistic practice in many parts of the world, +collecting high-quality and low-cost code-mixed data remains a challenge for +natural language processing (NLP) research. The recent proliferation of Large +Language Models (LLMs) compels one to ask: how capable are these systems in +generating code-mixed data? In this paper, we explore prompting multilingual +LLMs in a zero-shot manner to generate code-mixed data for seven languages in +South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, +Tamil, and Singlish. We find that publicly available multilingual +instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of +producing texts with phrases or clauses from different languages. ChatGPT +exhibits inconsistent capabilities in generating code-mixed texts, wherein its +performance varies depending on the prompt template and language pairing. For +instance, ChatGPT generates fluent and natural Singlish texts (an English-based +creole spoken in Singapore), but for English-Tamil language pair, the system +mostly produces grammatically incorrect or semantically meaningless utterances. +Furthermore, it may erroneously introduce languages not specified in the +prompt. Based on our investigation, existing multilingual LLMs exhibit a wide +range of proficiency in code-mixed data generation for SEA languages. As such, +we advise against using LLMs in this context without extensive human checks. + +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models for Clinical Tasks + + +
+ Large Language Models (LLMs) have demonstrated remarkable adaptability, +showcasing their capacity to excel in tasks for which they were not explicitly +trained. However, despite their impressive natural language processing (NLP) +capabilities, effective alignment of LLMs remains a crucial challenge when +deploying them for specific clinical applications. The ability to generate +responses with factually accurate content and to engage in non-trivial +reasoning steps are crucial for the LLMs to be eligible for applications in +clinical medicine. Employing a combination of techniques including +instruction-tuning and in-prompt strategies like few-shot and chain-of-thought +prompting has significantly enhanced the performance of LLMs. Our proposed +alignment strategy for medical question-answering, known as +'expand-guess-refine', offers a parameter and data-efficient solution. A +preliminary analysis of this method demonstrated outstanding performance, +achieving a score of 70.63% on a subset of questions sourced from the USMLE +dataset. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Zero-shot information extraction from radiological reports using ChatGPT + + +
+ Electronic health records contain an enormous amount of valuable information,
+but many are recorded in free text. Information extraction is the strategy to
+transform the sequence of characters into structured data, which can be
+employed for secondary analysis. However, the traditional information
+extraction components, such as named entity recognition and relation
+extraction, require annotated data to optimize the model parameters, which has
+become one of the major bottlenecks in building information extraction systems.
+With large language models achieving good performance on various downstream NLP
+tasks without parameter tuning, it becomes possible to use large language
+models for zero-shot information extraction. In this study, we aim to explore
+whether the most popular large language model, ChatGPT, can extract useful
+information from radiological reports. We first design the prompt template for
+the information of interest in the CT reports. Then, we generate the prompts by
+combining the prompt template with the CT reports as the inputs of ChatGPT to
+obtain the responses. A post-processing module is developed to transform the
+responses into structured extraction results. We conducted the experiments with
+847 CT reports collected from Peking University Cancer Hospital. The
+experimental results indicate that ChatGPT can achieve competitive performance
+on some extraction tasks compared with the baseline information extraction
+system, but some limitations remain to be addressed.
+
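+ A schematic example of the prompt-and-post-process pipeline described above;
+ the field names, prompt wording, and the chat placeholder are illustrative
+ assumptions, not the study's actual templates.
+
+ import json
+
+ FIELDS = ["tumor location", "tumor size", "lymph node status"]  # illustrative fields
+
+ def build_prompt(report):
+     return ("Extract the following fields from the CT report and answer in JSON "
+             f"with exactly these keys: {FIELDS}. Use null if a field is absent.\n"
+             f"Report: {report}")
+
+ def post_process(response_text):
+     """Turn the model response into a structured record, tolerating extra text."""
+     start, end = response_text.find("{"), response_text.rfind("}")
+     try:
+         return json.loads(response_text[start:end + 1])
+     except ValueError:
+         return {field: None for field in FIELDS}
+
+ # record = post_process(chat(build_prompt(report)))  # chat() is a placeholder LLM call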
+
+
+
+
+ + ♻ ☆ HAE-RAE Bench: Evaluation of Korean Knowledge in Language Models + + +
+ Large Language Models (LLMs) pretrained on massive corpora exhibit remarkable
+capabilities across a wide range of tasks; however, the attention given to
+non-English languages has been limited in this field of research. To address
+this gap and assess the proficiency of language models in the Korean language
+and culture, we present HAE-RAE Bench, covering 6 tasks including vocabulary,
+history, and general knowledge. Our evaluation of language models on this
+benchmark highlights the potential advantages of employing Large
+Language-Specific Models (LLSMs) over a comprehensive, universal model like
+GPT-3.5. Remarkably, our study reveals that models approximately 13 times
+smaller than GPT-3.5 can exhibit similar performance levels in terms of
+language-specific knowledge retrieval. This observation underscores the
+importance of homogeneous corpora for training professional-level
+language-specific models. However, we also observe a perplexing performance dip
+in these smaller LMs when they are tasked with generating structured answers.
+
+
+
+
+
+ + ♻ ☆ Automating Behavioral Testing in Machine Translation + + +
+ Behavioral testing in NLP allows fine-grained evaluation of systems by +examining their linguistic capabilities through the analysis of input-output +behavior. Unfortunately, existing work on behavioral testing in Machine +Translation (MT) is currently restricted to largely handcrafted tests covering +a limited range of capabilities and languages. To address this limitation, we +propose to use Large Language Models (LLMs) to generate a diverse set of source +sentences tailored to test the behavior of MT models in a range of situations. +We can then verify whether the MT model exhibits the expected behavior through +matching candidate sets that are also generated using LLMs. Our approach aims +to make behavioral testing of MT systems practical while requiring only minimal +human effort. In our experiments, we apply our proposed evaluation framework to +assess multiple available MT systems, revealing that while in general +pass-rates follow the trends observable from traditional accuracy-based +metrics, our method was able to uncover several important differences and +potential bugs that go unnoticed when relying only on accuracy. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 120 + +
+
+
+ + ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
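+ A simplified sketch of the attention-free, zero-initialized gating idea
+ mentioned above: a visual feature is added to the word tokens through a
+ learnable gate that starts at zero, so training begins from the unmodified
+ language model. The dimensions and module layout are illustrative, not the
+ paper's exact architecture.
+
+ import torch
+ import torch.nn as nn
+
+ class ZeroInitGate(nn.Module):
+     """Injects a visual feature into word tokens via a gate initialized to zero."""
+     def __init__(self, dim):
+         super().__init__()
+         self.proj = nn.Linear(dim, dim)
+         self.gate = nn.Parameter(torch.zeros(1))  # starts closed: no visual influence
+
+     def forward(self, word_tokens, visual_feature):
+         # word_tokens: (batch, seq, dim); visual_feature: (batch, dim)
+         injected = self.proj(visual_feature).unsqueeze(1)      # (batch, 1, dim)
+         return word_tokens + torch.tanh(self.gate) * injected  # broadcast over seq
+
+ gate_layer = ZeroInitGate(dim=4096)
+ tokens = torch.randn(2, 16, 4096)
+ visual = torch.randn(2, 4096)
+ out = gate_layer(tokens, visual)  # equals tokens exactly at initialization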
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ☆ Exploring Sparse MoE in GANs for Text-conditioned Image Synthesis + + +
+ Due to the difficulty in scaling up, generative adversarial networks (GANs) +seem to be falling from grace on the task of text-conditioned image synthesis. +Sparsely-activated mixture-of-experts (MoE) has recently been demonstrated as a +valid solution to training large-scale models with limited computational +resources. Inspired by such a philosophy, we present Aurora, a GAN-based +text-to-image generator that employs a collection of experts to learn feature +processing, together with a sparse router to help select the most suitable +expert for each feature point. To faithfully decode the sampling stochasticity +and the text condition to the final synthesis, our router adaptively makes its +decision by taking into account the text-integrated global latent code. At +64x64 image resolution, our model trained on LAION2B-en and COYO-700M achieves +6.2 zero-shot FID on MS COCO. We release the code and checkpoints to facilitate +the community for further development. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Tracking Anything with Decoupled Video Segmentation ICCV 2023 + + +
+ Training data for video segmentation are expensive to annotate. This impedes +extensions of end-to-end algorithms to new video segmentation tasks, especially +in large-vocabulary settings. To 'track anything' without training on video +data for every individual task, we develop a decoupled video segmentation +approach (DEVA), composed of task-specific image-level segmentation and +class/task-agnostic bi-directional temporal propagation. Due to this design, we +only need an image-level model for the target task (which is cheaper to train) +and a universal temporal propagation model which is trained once and +generalizes across tasks. To effectively combine these two modules, we use +bi-directional propagation for (semi-)online fusion of segmentation hypotheses +from different frames to generate a coherent segmentation. We show that this +decoupled formulation compares favorably to end-to-end approaches in several +data-scarce tasks including large-vocabulary video panoptic segmentation, +open-world video segmentation, referring video segmentation, and unsupervised +video object segmentation. Code is available at: +https://hkchengrex.github.io/Tracking-Anything-with-DEVA + +
+
+ comment: Accepted to ICCV 2023. Project page: + https://hkchengrex.github.io/Tracking-Anything-with-DEVA +
+
+
+
+
+ + ☆ Learning Continuous Exposure Value Representations for Single-Image HDR + Reconstruction ICCV 2023 + + +
+ Deep learning is commonly used to reconstruct HDR images from LDR images. LDR +stack-based methods are used for single-image HDR reconstruction, generating an +HDR image from a deep learning-generated LDR stack. However, current methods +generate the stack with predetermined exposure values (EVs), which may limit +the quality of HDR reconstruction. To address this, we propose the continuous +exposure value representation (CEVR), which uses an implicit function to +generate LDR images with arbitrary EVs, including those unseen during training. +Our approach generates a continuous stack with more images containing diverse +EVs, significantly improving HDR reconstruction. We use a cycle training +strategy to supervise the model in generating continuous EV LDR images without +corresponding ground truths. Our CEVR model outperforms existing methods, as +demonstrated by experimental results. + +
+
+ comment: ICCV 2023. Project page: https://skchen1993.github.io/CEVR_web/ +
+
+
+
+
+ + ☆ The Making and Breaking of Camouflage ICCV 2023 + + +
+ Not all camouflages are equally effective, as even a partially visible +contour or a slight color difference can make the animal stand out and break +its camouflage. In this paper, we address the question of what makes a +camouflage successful, by proposing three scores for automatically assessing +its effectiveness. In particular, we show that camouflage can be measured by +the similarity between background and foreground features and boundary +visibility. We use these camouflage scores to assess and compare all available +camouflage datasets. We also incorporate the proposed camouflage score into a +generative model as an auxiliary loss and show that effective camouflage images +or videos can be synthesised in a scalable manner. The generated synthetic +dataset is used to train a transformer-based model for segmenting camouflaged +animals in videos. Experimentally, we demonstrate state-of-the-art camouflage +breaking performance on the public MoCA-Mask benchmark. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ ProPainter: Improving Propagation and Transformer for Video Inpainting ICCV 2023 + + +
+ Flow-based propagation and spatiotemporal Transformer are two mainstream +mechanisms in video inpainting (VI). Despite the effectiveness of these +components, they still suffer from some limitations that affect their +performance. Previous propagation-based approaches are performed separately +either in the image or feature domain. Global image propagation isolated from +learning may cause spatial misalignment due to inaccurate optical flow. +Moreover, memory or computational constraints limit the temporal range of +feature propagation and video Transformer, preventing exploration of +correspondence information from distant frames. To address these issues, we +propose an improved framework, called ProPainter, which involves enhanced +ProPagation and an efficient Transformer. Specifically, we introduce +dual-domain propagation that combines the advantages of image and feature +warping, exploiting global correspondences reliably. We also propose a +mask-guided sparse video Transformer, which achieves high efficiency by +discarding unnecessary and redundant tokens. With these components, ProPainter +outperforms prior arts by a large margin of 1.46 dB in PSNR while maintaining +appealing efficiency. + +
+
+ comment: Accepted by ICCV 2023. Code: https://github.com/sczhou/ProPainter +
+
+
+
+
+ + ☆ InstructDiffusion: A Generalist Modeling Interface for Vision Tasks + + +
+ We present InstructDiffusion, a unifying and generic framework for aligning +computer vision tasks with human instructions. Unlike existing approaches that +integrate prior knowledge and pre-define the output space (e.g., categories and +coordinates) for each vision task, we cast diverse vision tasks into a +human-intuitive image-manipulating process whose output space is a flexible and +interactive pixel space. Concretely, the model is built upon the diffusion +process and is trained to predict pixels according to user instructions, such +as encircling the man's left shoulder in red or applying a blue mask to the +left car. InstructDiffusion could handle a variety of vision tasks, including +understanding tasks (such as segmentation and keypoint detection) and +generative tasks (such as editing and enhancement). It even exhibits the +ability to handle unseen tasks and outperforms prior methods on novel datasets. +This represents a significant step towards a generalist modeling interface for +vision tasks, advancing artificial general intelligence in the field of +computer vision. + +
+
+
+
+
+ + ☆ DiffusionEngine: Diffusion Model is Scalable Data Engine for Object + Detection + + +
+ Data is the cornerstone of deep learning. This paper reveals that the
+recently developed Diffusion Model is a scalable data engine for object
+detection. Existing methods for scaling up detection-oriented data often
+require manual collection or generative models to obtain target images,
+followed by data augmentation and labeling to produce training pairs, which are
+costly, complex, or lacking diversity. To address these issues, we present
+DiffusionEngine (DE), a data scaling-up engine that provides high-quality
+detection-oriented training pairs in a single stage. DE consists of a
+pre-trained diffusion model and an effective Detection-Adapter, contributing to
+generating scalable, diverse and generalizable detection data in a
+plug-and-play manner. The Detection-Adapter is learned to align the implicit
+semantic and location knowledge in off-the-shelf diffusion models with
+detection-aware signals to make better bounding-box predictions. Additionally,
+we contribute two datasets, i.e., COCO-DE and VOC-DE, to scale up existing
+detection benchmarks for facilitating follow-up research. Extensive experiments
+demonstrate that data scaling-up via DE can achieve significant improvements in
+diverse scenarios, such as various detection algorithms, self-supervised
+pre-training, data-sparse, label-scarce, cross-domain, and semi-supervised
+learning. For example, when using DE with a DINO-based adapter to scale up
+data, mAP is improved by 3.1% on COCO, 7.6% on VOC, and 11.5% on Clipart.
+
+
+ comment: Code and Models are publicly available. Project Page: + https://mettyz.github.io/DiffusionEngine +
+
+
+
+
+ + ☆ ArtiGrasp: Physically Plausible Synthesis of Bi-Manual Dexterous + Grasping and Articulation + + +
+ We present ArtiGrasp, a novel method to synthesize bi-manual hand-object +interactions that include grasping and articulation. This task is challenging +due to the diversity of the global wrist motions and the precise finger control +that are necessary to articulate objects. ArtiGrasp leverages reinforcement +learning and physics simulations to train a policy that controls the global and +local hand pose. Our framework unifies grasping and articulation within a +single policy guided by a single hand pose reference. Moreover, to facilitate +the training of the precise finger control required for articulation, we +present a learning curriculum with increasing difficulty. It starts with +single-hand manipulation of stationary objects and continues with multi-agent +training including both hands and non-stationary objects. To evaluate our +method, we introduce Dynamic Object Grasping and Articulation, a task that +involves bringing an object into a target articulated pose. This task requires +grasping, relocation, and articulation. We show our method's efficacy towards +this task. We further demonstrate that our method can generate motions with +noisy hand-object pose estimates from an off-the-shelf image-based regressor. + +
+
+ comment: Project page: https://eth-ait.github.io/artigrasp/ +
+
+
+
+
+ + ☆ Better Practices for Domain Adaptation + + +
+ Distribution shifts are all too common in real-world applications of machine +learning. Domain adaptation (DA) aims to address this by providing various +frameworks for adapting models to the deployment data without using labels. +However, the domain shift scenario raises a second more subtle challenge: the +difficulty of performing hyperparameter optimisation (HPO) for these adaptation +algorithms without access to a labelled validation set. The unclear validation +protocol for DA has led to bad practices in the literature, such as performing +HPO using the target test labels when, in real-world scenarios, they are not +available. This has resulted in over-optimism about DA research progress +compared to reality. In this paper, we analyse the state of DA when using good +evaluation practice, by benchmarking a suite of candidate validation criteria +and using them to assess popular adaptation algorithms. We show that there are +challenges across all three branches of domain adaptation methodology including +Unsupervised Domain Adaptation (UDA), Source-Free Domain Adaptation (SFDA), and +Test Time Adaptation (TTA). While the results show that realistically +achievable performance is often worse than expected, they also show that using +proper validation splits is beneficial, as well as showing that some previously +unexplored validation metrics provide the best options to date. Altogether, our +improved practices covering data, training, validation and hyperparameter +optimisation form a new rigorous pipeline to improve benchmarking, and hence +research progress, within this important field going forward. + +
+
+ comment: AutoML 2023 (Best paper award) +
+
+
+
+
+ + ☆ Box-based Refinement for Weakly Supervised and Unsupervised Localization + Tasks + + +
+ It has been established that training a box-based detector network can +enhance the localization performance of weakly supervised and unsupervised +methods. Moreover, we extend this understanding by demonstrating that these +detectors can be utilized to improve the original network, paving the way for +further advancements. To accomplish this, we train the detectors on top of the +network output instead of the image data and apply suitable loss +backpropagation. Our findings reveal a significant improvement in phrase +grounding for the ``what is where by looking'' task, as well as various methods +of unsupervised object discovery. Our code is available at +https://github.com/eyalgomel/box-based-refinement. + +
+
+
+
+
+ + ☆ Text-to-feature diffusion for audio-visual few-shot learning + + +
+ Training deep learning models for video classification from audio-visual data +commonly requires immense amounts of labeled training data collected via a +costly process. A challenging and underexplored, yet much cheaper, setup is +few-shot learning from video data. In particular, the inherently multi-modal +nature of video data with sound and visual information has not been leveraged +extensively for the few-shot video classification task. Therefore, we introduce +a unified audio-visual few-shot video classification benchmark on three +datasets, i.e. the VGGSound-FSL, UCF-FSL, ActivityNet-FSL datasets, where we +adapt and compare ten methods. In addition, we propose AV-DIFF, a +text-to-feature diffusion framework, which first fuses the temporal and +audio-visual features via cross-modal attention and then generates multi-modal +features for the novel classes. We show that AV-DIFF obtains state-of-the-art +performance on our proposed benchmark for audio-visual (generalised) few-shot +learning. Our benchmark paves the way for effective audio-visual classification +when only limited labeled data is available. Code and data are available at +https://github.com/ExplainableML/AVDIFF-GFSL. + +
+
+ comment: DAGM GCPR 2023 +
+
+
+
+
+ + ☆ CenTime: Event-Conditional Modelling of Censoring in Survival Analysis + + +
+ Survival analysis is a valuable tool for estimating the time until specific +events, such as death or cancer recurrence, based on baseline observations. +This is particularly useful in healthcare to prognostically predict clinically +important events based on patient data. However, existing approaches often have +limitations; some focus only on ranking patients by survivability, neglecting +to estimate the actual event time, while others treat the problem as a +classification task, ignoring the inherent time-ordered structure of the +events. Furthermore, the effective utilization of censored samples - training +data points where the exact event time is unknown - is essential for improving +the predictive accuracy of the model. In this paper, we introduce CenTime, a +novel approach to survival analysis that directly estimates the time to event. +Our method features an innovative event-conditional censoring mechanism that +performs robustly even when uncensored data is scarce. We demonstrate that our +approach forms a consistent estimator for the event model parameters, even in +the absence of uncensored data. Furthermore, CenTime is easily integrated with +deep learning models with no restrictions on batch size or the number of +uncensored samples. We compare our approach with standard survival analysis +methods, including the Cox proportional-hazard model and DeepHit. Our results +indicate that CenTime offers state-of-the-art performance in predicting +time-to-death while maintaining comparable ranking performance. Our +implementation is publicly available at +https://github.com/ahmedhshahin/CenTime. + +
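The abstract does not spell out the event-conditional censoring mechanism, so the following is only a generic sketch of how right-censored samples typically enter a parametric time-to-event likelihood: observed events contribute the density at their event time, while censored samples contribute the survival probability beyond their censoring time. The log-normal model and the function name `censored_nll` are assumptions for illustration, not the paper's formulation.

```python
import torch

def censored_nll(mu, log_sigma, times, event_observed):
    """Negative log-likelihood for a log-normal time-to-event model.

    mu, log_sigma : predicted parameters of log T ~ Normal(mu, sigma), shape (N,)
    times         : observed event or censoring times, shape (N,)
    event_observed: 1 if the event time is known, 0 if right-censored, shape (N,)
    """
    sigma = log_sigma.exp()
    dist = torch.distributions.Normal(mu, sigma)
    log_t = times.clamp_min(1e-8).log()
    # Uncensored samples: density of the event time (change of variables T = exp(X)).
    log_pdf = dist.log_prob(log_t) - log_t
    # Censored samples: probability that the event happens after the censoring time.
    log_surv = torch.log1p(-dist.cdf(log_t) + 1e-12)
    ll = event_observed * log_pdf + (1.0 - event_observed) * log_surv
    return -ll.mean()

# Toy usage: two observed events and two right-censored samples.
mu, log_sigma = torch.zeros(4), torch.zeros(4)
times = torch.tensor([0.5, 1.0, 2.0, 3.0])
observed = torch.tensor([1.0, 1.0, 0.0, 0.0])
print(censored_nll(mu, log_sigma, times, observed).item())
```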
+
+
+
+
+ + ☆ Cross-Task Attention Network: Improving Multi-Task Learning for Medical + Imaging Applications + + +
+ Multi-task learning (MTL) is a powerful approach in deep learning that +leverages the information from multiple tasks during training to improve model +performance. In medical imaging, MTL has shown great potential to solve various +tasks. However, existing MTL architectures in medical imaging are limited in +sharing information across tasks, reducing the potential performance +improvements of MTL. In this study, we introduce a novel attention-based MTL +framework to better leverage inter-task interactions for various tasks from +pixel-level to image-level predictions. Specifically, we propose a Cross-Task +Attention Network (CTAN) which utilizes cross-task attention mechanisms to +incorporate information by interacting across tasks. We validated CTAN on four +medical imaging datasets that span different domains and tasks including: +radiation treatment planning prediction using planning CT images of two +different target cancers (Prostate, OpenKBP); pigmented skin lesion +segmentation and diagnosis using dermatoscopic images (HAM10000); and COVID-19 +diagnosis and severity prediction using chest CT scans (STOIC). Our study +demonstrates the effectiveness of CTAN in improving the accuracy of medical +imaging tasks. Compared to standard single-task learning (STL), CTAN +demonstrated a 4.67% improvement in performance and outperformed both widely +used MTL baselines: hard parameter sharing (HPS) with an average performance +improvement of 3.22%; and multi-task attention network (MTAN) with a relative +decrease of 5.38%. These findings highlight the significance of our proposed +MTL framework in solving medical imaging tasks and its potential to improve +their accuracy across domains. + +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation SC + + +
+ High Dynamic Range (HDR) content creation has become an important topic for +modern media and entertainment sectors, gaming and Augmented/Virtual Reality +industries. Many methods have been proposed to recreate the HDR counterparts of +input Low Dynamic Range (LDR) images/videos given a single exposure or +multi-exposure LDRs. The state-of-the-art methods focus primarily on the +preservation of the reconstruction's structural similarity and the pixel-wise +accuracy. However, these conventional approaches do not emphasize preserving +the artistic intent of the images in terms of human visual perception, which is +an essential element in media, entertainment and gaming. In this paper, we +attempt to study and fill this gap. We propose an architecture called +ArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR +features as input. Experimental results show that ArtHDR-Net can achieve +state-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean +opinion score index) while reaching competitive performance in terms of PSNR +and SSIM. + +
+
+ comment: Accepted in Asia Pacific Signal and Information Processing + Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan +
+
+
+
+
+ + ☆ T2IW: Joint Text to Image & Watermark Generation + + +
+ Recent developments in text-conditioned image generative models have +revolutionized the production of realistic results. Unfortunately, this has +also led to an increase in privacy violations and the spread of false +information, which requires the need for traceability, privacy protection, and +other security measures. However, existing text-to-image paradigms lack the +technical capabilities to link traceable messages with image generation. In +this study, we introduce a novel task for the joint generation of text to image +and watermark (T2IW). This T2IW scheme ensures minimal damage to image quality +when generating a compound image by forcing the semantic feature and the +watermark signal to be compatible in pixels. Additionally, by utilizing +principles from Shannon information theory and non-cooperative game theory, we +are able to separate the revealed image and the revealed watermark from the +compound image. Furthermore, we strengthen the watermark robustness of our +approach by subjecting the compound image to various post-processing attacks, +with minimal pixel distortion observed in the revealed watermark. Extensive +experiments have demonstrated remarkable achievements in image quality, +watermark invisibility, and watermark robustness, supported by our proposed set +of evaluation metrics. + +
+
+
+
+
+ + ☆ AnthroNet: Conditional Generation of Humans via Anthropometrics + + +
+ We present a novel human body model formulated by an extensive set of +anthropocentric measurements, which is capable of generating a wide range of +human body shapes and poses. The proposed model enables direct modeling of +specific human identities through a deep generative architecture, which can +produce humans in any arbitrary pose. It is the first of its kind to have been +trained end-to-end using only synthetically generated data, which not only +provides highly accurate human mesh representations but also allows for precise +anthropometry of the body. Moreover, using a highly diverse animation library, +we articulated our synthetic humans' body and hands to maximize the diversity +of the learnable priors for model training. Our model was trained on a dataset +of $100k$ procedurally-generated posed human meshes and their corresponding +anthropometric measurements. Our synthetic data generator can be used to +generate millions of unique human identities and poses for non-commercial +academic research purposes. + +
+
+ comment: AnthroNet's Unity data generator source code is available at: + https://unity-technologies.github.io/AnthroNet/ +
+
+
+
+
+ + ☆ Panoramas from Photons ICCV 2023 + + +
+ Scene reconstruction in the presence of high-speed motion and low +illumination is important in many applications such as augmented and virtual +reality, drone navigation, and autonomous robotics. Traditional motion +estimation techniques fail in such conditions, suffering from too much blur in +the presence of high-speed motion and strong noise in low-light conditions. +Single-photon cameras have recently emerged as a promising technology capable +of capturing hundreds of thousands of photon frames per second thanks to their +high speed and extreme sensitivity. Unfortunately, traditional computer vision +techniques are not well suited for dealing with the binary-valued photon data +captured by these cameras because these are corrupted by extreme Poisson noise. +Here we present a method capable of estimating extreme scene motion under +challenging conditions, such as low light or high dynamic range, from a +sequence of high-speed image frames such as those captured by a single-photon +camera. Our method relies on iteratively improving a motion estimate by +grouping and aggregating frames after-the-fact, in a stratified manner. We +demonstrate the creation of high-quality panoramas under fast motion and +extremely low light, and super-resolution results using a custom single-photon +camera prototype. For code and supplemental material see our +$\href{https://wisionlab.com/project/panoramas-from-photons/}{\text{project +webpage}}$. + +
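To make the frame-aggregation idea concrete, here is a minimal sketch of the standard flux estimate for binary single-photon frames: average many binary frames and invert the Bernoulli detection model. The stratified, motion-compensated grouping described in the paper is not reproduced; the sketch assumes static pixels and negligible dark counts.

```python
import numpy as np

def flux_from_binary_frames(frames: np.ndarray) -> np.ndarray:
    """Maximum-likelihood photon flux from a stack of binary frames.

    frames: (T, H, W) array of 0/1 photon detections. Under the common model
    P(detection) = 1 - exp(-flux), averaging frames and inverting gives
    flux = -log(1 - mean).
    """
    p = frames.mean(axis=0)
    p = np.clip(p, 0.0, 1.0 - 1e-6)  # avoid log(0) at saturated pixels
    return -np.log1p(-p)

# Toy usage: 1000 binary frames of a 2x2 scene with known flux.
rng = np.random.default_rng(0)
true_flux = np.array([[0.1, 0.5], [1.0, 2.0]])
frames = rng.random((1000, 2, 2)) < (1 - np.exp(-true_flux))
print(flux_from_binary_frames(frames.astype(np.float32)))
```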
+
+ comment: Proc. ICCV 2023 +
+
+
+
+
+ + ☆ SimNP: Learning Self-Similarity Priors Between Neural Points ICCV 2023 + + +
+ Existing neural field representations for 3D object reconstruction either (1) +utilize object-level representations, but suffer from low-quality details due +to conditioning on a global latent code, or (2) are able to perfectly +reconstruct the observations, but fail to utilize object-level prior knowledge +to infer unobserved regions. We present SimNP, a method to learn category-level +self-similarities, which combines the advantages of both worlds by connecting +neural point radiance fields with a category-level self-similarity +representation. Our contribution is two-fold. (1) We design the first neural +point representation on a category level by utilizing the concept of coherent +point clouds. The resulting neural point radiance fields store a high level of +detail for locally supported object regions. (2) We learn how information is +shared between neural points in an unconstrained and unsupervised fashion, +which allows to derive unobserved regions of an object during the +reconstruction process from given observations. We show that SimNP is able to +outperform previous methods in reconstructing symmetric unseen object regions, +surpassing methods that build upon category-level or pixel-aligned radiance +fields, while providing semantic correspondences between instances + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ FisheyePP4AV: A privacy-preserving method for autonomous vehicles on + fisheye camera images + + +
+ In many parts of the world, the use of vast amounts of data collected on
+public roadways for autonomous driving has increased. There is an urgent need
+for effective solutions to detect and anonymize pedestrian faces and nearby car
+license plates in real road-driving scenarios. As more data is collected,
+privacy concerns grow, including but not limited to pedestrian faces and
+surrounding vehicle license plates. Normal and fisheye cameras are the two
+camera types typically mounted on collection vehicles. Because of their complex
+distortion models, fisheye camera images are deformed compared to regular
+images, which causes many deep learning based computer vision models to perform
+poorly. In this work, we pay particular attention to protecting privacy in
+fisheye camera images taken by driverless vehicles while adhering to the
+relevant regulations. First, we propose a framework for extracting face and
+plate identification knowledge from several teacher models. Second, we
+transform both the image and the label from a regular image to fisheye-like
+data using a varied and realistic fisheye transformation. Finally, we evaluate
+on the open-source PP4AV dataset. The experimental findings demonstrate that
+our model outperforms baseline methods when trained on data from autonomous
+vehicles, even when the data are softly labeled. The implementation code is
+available on our GitHub:
+https://github.com/khaclinh/FisheyePP4AV.
+
+
+
+
+
+ + ☆ Deep Learning Safety Concerns in Automated Driving Perception + + +
+ Recent advances in the field of deep learning and impressive performance of +deep neural networks (DNNs) for perception have resulted in an increased demand +for their use in automated driving (AD) systems. The safety of such systems is +of utmost importance and thus requires to consider the unique properties of +DNNs. + In order to achieve safety of AD systems with DNN-based perception components +in a systematic and comprehensive approach, so-called safety concerns have been +introduced as a suitable structuring element. On the one hand, the concept of +safety concerns is -- by design -- well aligned to existing standards relevant +for safety of AD systems such as ISO 21448 (SOTIF). On the other hand, it has +already inspired several academic publications and upcoming standards on AI +safety such as ISO PAS 8800. + While the concept of safety concerns has been previously introduced, this +paper extends and refines it, leveraging feedback from various domain and +safety experts in the field. In particular, this paper introduces an additional +categorization for a better understanding as well as enabling cross-functional +teams to jointly address the concerns. + +
+
+
+
+
+ + ☆ $L_{2,1}$-Norm Regularized Quaternion Matrix Completion Using Sparse + Representation and Quaternion QR Decomposition + + +
+ Color image completion is a challenging problem in computer vision, but +recent research has shown that quaternion representations of color images +perform well in many areas. These representations consider the entire color +image and effectively utilize coupling information between the three color +channels. Consequently, low-rank quaternion matrix completion (LRQMC) +algorithms have gained significant attention. We propose a method based on +quaternion Qatar Riyal decomposition (QQR) and quaternion $L_{2,1}$-norm called +QLNM-QQR. This new approach reduces computational complexity by avoiding the +need to calculate the QSVD of large quaternion matrices. We also present two +improvements to the QLNM-QQR method: an enhanced version called IRQLNM-QQR that +uses iteratively reweighted quaternion $L_{2,1}$-norm minimization and a method +called QLNM-QQR-SR that integrates sparse regularization. Our experiments on +natural color images and color medical images show that IRQLNM-QQR outperforms +QLNM-QQR and that the proposed QLNM-QQR-SR method is superior to several +state-of-the-art methods. + +
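The quaternion machinery is beyond a short snippet, but the $L_{2,1}$ norm itself is simple: the sum of the Euclidean norms of the matrix rows. Below is a minimal real-valued sketch; the quaternion version applies the same idea to quaternion-valued rows.

```python
import numpy as np

def l21_norm(X: np.ndarray) -> float:
    """L_{2,1} norm: sum over rows of the row-wise Euclidean (L2) norm."""
    return float(np.sum(np.linalg.norm(X, axis=1)))

X = np.array([[3.0, 4.0],   # row norm 5
              [0.0, 2.0]])  # row norm 2
print(l21_norm(X))  # 7.0
```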
+
+
+
+
+ + ☆ dacl1k: Real-World Bridge Damage Dataset Putting Open-Source Data to the + Test + + +
+ Recognising reinforced concrete defects (RCDs) is a crucial element for
+determining the structural integrity, traffic safety and durability of bridges.
+However, most of the existing datasets in the RCD domain are derived from a
+small number of bridges acquired in specific camera poses, lighting conditions
+and with fixed hardware. These limitations question the usability of models
+trained on such open-source data in real-world scenarios. We address this
+problem by testing such models on our "dacl1k" dataset, a highly diverse RCD
+dataset for multi-label classification based on building inspections and
+comprising 1,474 images. To this end, we trained the models on different
+combinations of open-source data (meta datasets), which were subsequently
+evaluated both extrinsically and intrinsically. During extrinsic evaluation, we
+report metrics on dacl1k and the meta datasets. The performance analysis on
+dacl1k shows practical usability of the meta data, where the best model reaches
+an Exact Match Ratio of 32%. Additionally, we conduct an intrinsic evaluation
+by clustering the bottleneck features of the best model from the extrinsic
+evaluation in order to find out whether the model has learned to distinguish
+the datasets or the classes (RCDs), the latter being the desired outcome. The
+dacl1k dataset and our trained models will be made publicly available, enabling
+researchers and practitioners to put their models to the real-world test.
+
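For reference, the Exact Match Ratio reported above is simply the fraction of samples whose entire predicted multi-label vector equals the ground truth. A minimal sketch with made-up labels:

```python
import numpy as np

def exact_match_ratio(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Fraction of samples whose full multi-label vector is predicted exactly.

    y_true, y_pred: binary arrays of shape (n_samples, n_labels).
    """
    return float(np.mean(np.all(y_true == y_pred, axis=1)))

y_true = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0]])
y_pred = np.array([[1, 0, 1], [0, 0, 0], [1, 1, 0]])
print(exact_match_ratio(y_true, y_pred))  # 2 of 3 samples match -> 0.666...
```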
+
+
+
+
+ + ☆ M(otion)-mode Based Prediction of Ejection Fraction using + Echocardiograms + + +
+ Early detection of cardiac dysfunction through routine screening is vital for +diagnosing cardiovascular diseases. An important metric of cardiac function is +the left ventricular ejection fraction (EF), where lower EF is associated with +cardiomyopathy. Echocardiography is a popular diagnostic tool in cardiology, +with ultrasound being a low-cost, real-time, and non-ionizing technology. +However, human assessment of echocardiograms for calculating EF is +time-consuming and expertise-demanding, raising the need for an automated +approach. In this work, we propose using the M(otion)-mode of echocardiograms +for estimating the EF and classifying cardiomyopathy. We generate multiple +artificial M-mode images from a single echocardiogram and combine them using +off-the-shelf model architectures. Additionally, we extend contrastive learning +(CL) to cardiac imaging to learn meaningful representations from exploiting +structures in unlabeled data allowing the model to achieve high accuracy, even +with limited annotations. Our experiments show that the supervised setting +converges with only ten modes and is comparable to the baseline method while +bypassing its cumbersome training process and being computationally much more +efficient. Furthermore, CL using M-mode images is helpful for limited data +scenarios, such as having labels for only 200 patients, which is common in +medical applications. + +
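One plausible reading of "generating multiple artificial M-mode images from a single echocardiogram" is to sample pixel values along a chosen scan line in every frame and stack them over time; the paper's exact line-selection scheme may differ. A minimal sketch with a toy clip, where the endpoints and `n_points` are arbitrary choices:

```python
import numpy as np

def artificial_m_mode(video: np.ndarray, r0, c0, r1, c1, n_points=128) -> np.ndarray:
    """Build one artificial M-mode image from a B-mode echo clip.

    video: (T, H, W) grayscale clip.
    (r0, c0) -> (r1, c1): endpoints of the scan line in pixel coordinates.
    Returns an (n_points, T) image: depth along the line vs. time.
    """
    rows = np.linspace(r0, r1, n_points).round().astype(int)
    cols = np.linspace(c0, c1, n_points).round().astype(int)
    return video[:, rows, cols].T  # (T, n_points) -> (n_points, T)

video = np.random.rand(32, 112, 112)              # toy clip
mmode = artificial_m_mode(video, 10, 56, 100, 56)  # vertical scan line
print(mmode.shape)                                # (128, 32)
```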
+
+ comment: Accepted at GCPR 2023 +
+
+
+
+
+ + ☆ PBP: Path-based Trajectory Prediction for Autonomous Driving + + +
+ Trajectory prediction plays a crucial role in the autonomous driving stack by +enabling autonomous vehicles to anticipate the motion of surrounding agents. +Goal-based prediction models have gained traction in recent years for +addressing the multimodal nature of future trajectories. Goal-based prediction +models simplify multimodal prediction by first predicting 2D goal locations of +agents and then predicting trajectories conditioned on each goal. However, a +single 2D goal location serves as a weak inductive bias for predicting the +whole trajectory, often leading to poor map compliance, i.e., part of the +trajectory going off-road or breaking traffic rules. In this paper, we improve +upon goal-based prediction by proposing the Path-based prediction (PBP) +approach. PBP predicts a discrete probability distribution over reference paths +in the HD map using the path features and predicts trajectories in the +path-relative Frenet frame. We applied the PBP trajectory decoder on top of the +HiVT scene encoder and report results on the Argoverse dataset. Our experiments +show that PBP achieves competitive performance on the standard trajectory +prediction metrics, while significantly outperforming state-of-the-art +baselines in terms of map compliance. + +
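To make the path-relative representation concrete, here is a minimal sketch of converting a 2D point into Frenet coordinates (arc length s along a reference polyline, signed lateral offset d). This only illustrates the coordinate transform the abstract refers to, not the PBP model itself.

```python
import numpy as np

def to_frenet(point: np.ndarray, path: np.ndarray):
    """Convert a 2D point into (s, d) coordinates relative to a polyline path.

    point: (2,) array. path: (N, 2) array of ordered waypoints.
    Returns the arc length s along the path and the signed lateral offset d.
    """
    seg_starts, seg_ends = path[:-1], path[1:]
    seg_vecs = seg_ends - seg_starts
    seg_lens = np.linalg.norm(seg_vecs, axis=1)
    # Project the point onto every segment, clamping to the segment extent.
    t = np.einsum("ij,ij->i", point - seg_starts, seg_vecs) / (seg_lens ** 2 + 1e-12)
    t = np.clip(t, 0.0, 1.0)
    proj = seg_starts + t[:, None] * seg_vecs
    dists = np.linalg.norm(point - proj, axis=1)
    i = int(np.argmin(dists))                       # closest segment
    s = float(np.sum(seg_lens[:i]) + t[i] * seg_lens[i])
    # Sign of d from the 2D cross product of segment direction and offset vector.
    off = point - proj[i]
    cross = seg_vecs[i, 0] * off[1] - seg_vecs[i, 1] * off[0]
    d = float(np.sign(cross) * dists[i])
    return s, d

path = np.array([[0.0, 0.0], [10.0, 0.0], [10.0, 10.0]])
print(to_frenet(np.array([4.0, 2.0]), path))        # approximately (4.0, 2.0)
```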
+
+
+
+
+ + ☆ Label-efficient Contrastive Learning-based model for nuclei detection + and classification in 3D Cardiovascular Immunofluorescent Images MICCAI + + +
+ Recently, deep learning-based methods have achieved promising performance in
+nuclei detection and classification applications. However, training deep
+learning-based methods requires a large amount of pixel-wise annotated data,
+which is time-consuming and labor-intensive, especially in 3D images. An
+alternative approach is to adapt weak-annotation methods, such as labeling each
+nucleus with a point, but this method does not extend from 2D histopathology
+images (for which it was originally developed) to 3D immunofluorescent images.
+The reason is that 3D images contain multiple channels (z-axis) for nuclei and
+different markers separately, which makes training using point annotations
+difficult. To address this challenge, we propose the Label-efficient
+Contrastive learning-based (LECL) model to detect and classify various types of
+nuclei in 3D immunofluorescent images. Previous methods use Maximum Intensity
+Projection (MIP) to convert immunofluorescent images with multiple slices to 2D
+images, which can cause signals from different z-stacks to falsely appear
+associated with each other. To overcome this, we devised an Extended Maximum
+Intensity Projection (EMIP) approach that addresses the issues that arise when
+using MIP. Furthermore, we employ a Supervised Contrastive Learning (SCL)
+approach for weakly supervised settings. We conducted experiments on
+cardiovascular datasets and found that our proposed framework is effective and
+efficient in detecting and classifying various types of nuclei in 3D
+immunofluorescent images.
+
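For context, the standard Maximum Intensity Projection criticized above is simply a maximum over the z-axis; the paper's EMIP variant (not reproduced here) is designed to avoid the false associations this can create.

```python
import numpy as np

def max_intensity_projection(volume: np.ndarray, axis: int = 0) -> np.ndarray:
    """Standard MIP: collapse a 3D stack (Z, H, W) to 2D by taking the max over Z."""
    return volume.max(axis=axis)

stack = np.random.rand(20, 64, 64)   # toy 3D immunofluorescent volume
mip = max_intensity_projection(stack)
print(mip.shape)  # (64, 64)
```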
+
+ comment: 11 pages, 5 figures, MICCAI Workshop Conference 2023 +
+
+
+
+
+ + ☆ ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D + Object Detection in Autonomous Vehicles + + +
+ Thanks to the complementary nature of millimeter wave radar and camera, deep +learning-based radar-camera 3D object detection methods may reliably produce +accurate detections even in low-visibility conditions. This makes them +preferable to use in autonomous vehicles' perception systems, especially as the +combined cost of both sensors is cheaper than the cost of a lidar. Recent +radar-camera methods commonly perform feature-level fusion which often involves +projecting the radar points onto the same plane as the image features and +fusing the extracted features from both modalities. While performing fusion on +the image plane is generally simpler and faster, projecting radar points onto +the image plane flattens the depth dimension of the point cloud which might +lead to information loss and makes extracting the spatial features of the point +cloud harder. We proposed ClusterFusion, an architecture that leverages the +local spatial features of the radar point cloud by clustering the point cloud +and performing feature extraction directly on the point cloud clusters before +projecting the features onto the image plane. ClusterFusion achieved the +state-of-the-art performance among all radar-monocular camera methods on the +test slice of the nuScenes dataset with 48.7% nuScenes detection score (NDS). +We also investigated the performance of different radar feature extraction +strategies on point cloud clusters: a handcrafted strategy, a learning-based +strategy, and a combination of both, and found that the handcrafted strategy +yielded the best performance. The main goal of this work is to explore the use +of radar's local spatial and point-wise features by extracting them directly +from radar point cloud clusters for a radar-monocular camera 3D object +detection method that performs cross-modal feature fusion on the image plane. + +
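A minimal sketch of the clustering step described above: group radar points with DBSCAN and compute simple handcrafted per-cluster features (centroid, extent, size). The actual ClusterFusion feature extractors, the learned strategy, and the image-plane fusion are not reproduced; the `eps` and `min_samples` values below are arbitrary.

```python
import numpy as np
from sklearn.cluster import DBSCAN

def cluster_radar_points(points: np.ndarray, eps: float = 1.5, min_samples: int = 2):
    """Cluster radar points and return per-cluster summary features.

    points: (N, D) array whose first three columns are 3D positions.
    Returns a dict: cluster label -> (centroid, extent, point count).
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points[:, :3])
    feats = {}
    for lbl in set(labels):
        if lbl == -1:          # noise points
            continue
        cluster = points[labels == lbl]
        feats[lbl] = (
            cluster[:, :3].mean(axis=0),                               # centroid
            cluster[:, :3].max(axis=0) - cluster[:, :3].min(axis=0),   # extent
            len(cluster),                                              # size
        )
    return feats

pts = np.array([[0, 0, 0], [0.5, 0, 0], [10, 10, 0], [10.2, 10.1, 0]], dtype=float)
print(cluster_radar_points(pts))
```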
+
+ comment: Submitted to IEEE Access +
+
+
+
+
+ + ☆ Phasic Content Fusing Diffusion Model with Directional Distribution + Consistency for Few-Shot Model Adaption ICCV 2023 + + +
+ Training a generative model with limited number of samples is a challenging +task. Current methods primarily rely on few-shot model adaption to train the +network. However, in scenarios where data is extremely limited (less than 10), +the generative network tends to overfit and suffers from content degradation. +To address these problems, we propose a novel phasic content fusing few-shot +diffusion model with directional distribution consistency loss, which targets +different learning objectives at distinct training stages of the diffusion +model. Specifically, we design a phasic training strategy with phasic content +fusion to help our model learn content and style information when t is large, +and learn local details of target domain when t is small, leading to an +improvement in the capture of content, style and local details. Furthermore, we +introduce a novel directional distribution consistency loss that ensures the +consistency between the generated and source distributions more efficiently and +stably than the prior methods, preventing our model from overfitting. Finally, +we propose a cross-domain structure guidance strategy that enhances structure +consistency during domain adaptation. Theoretical analysis, qualitative and +quantitative experiments demonstrate the superiority of our approach in +few-shot generative model adaption tasks compared to state-of-the-art methods. +The source code is available at: +https://github.com/sjtuplayer/few-shot-diffusion. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Interpretable Visual Question Answering via Reasoning Supervision + + +
+ Transformer-based architectures have recently demonstrated remarkable +performance in the Visual Question Answering (VQA) task. However, such models +are likely to disregard crucial visual cues and often rely on multimodal +shortcuts and inherent biases of the language modality to predict the correct +answer, a phenomenon commonly referred to as lack of visual grounding. In this +work, we alleviate this shortcoming through a novel architecture for visual +question answering that leverages common sense reasoning as a supervisory +signal. Reasoning supervision takes the form of a textual justification of the +correct answer, with such annotations being already available on large-scale +Visual Common Sense Reasoning (VCR) datasets. The model's visual attention is +guided toward important elements of the scene through a similarity loss that +aligns the learned attention distributions guided by the question and the +correct reasoning. We demonstrate both quantitatively and qualitatively that +the proposed approach can boost the model's visual perception capability and +lead to performance increase, without requiring training on explicit grounding +annotations. + +
+
+
+
+
+ + ☆ A boundary-aware point clustering approach in Euclidean and embedding + spaces for roof plane segmentation + + +
+ Roof plane segmentation from airborne LiDAR point clouds is an important
+technology for 3D building model reconstruction. One of the key issues of plane
+segmentation is how to design powerful features that can exactly distinguish
+adjacent planar patches. The quality of point features directly determines the
+accuracy of roof plane segmentation. Most existing approaches use handcrafted
+features to extract roof planes. However, the discriminative ability of these
+features is relatively low, especially in boundary areas. To solve this
+problem, we propose a boundary-aware point clustering approach in Euclidean and
+embedding spaces constructed by a multi-task deep network for roof plane
+segmentation. We design a three-branch network to predict semantic labels,
+point offsets and extract deep embedding features. In the first branch, we
+classify the input data as non-roof, boundary and plane points. In the second
+branch, we predict point offsets for shifting each point toward its respective
+instance center. In the third branch, we constrain that points of the same
+plane instance should have similar embeddings. We aim to ensure that points of
+the same plane instance are as close as possible in both Euclidean and
+embedding spaces. However, although the deep network has strong feature
+representation ability, it is still hard to accurately distinguish points near
+plane instance boundaries. Therefore, we first group plane points into many
+clusters in the two spaces, and then we assign the remaining boundary points to
+their closest clusters to generate the final complete roof planes. In this way,
+we can effectively reduce the influence of unreliable boundary points. In
+addition, we construct a synthetic dataset and a real dataset to train and
+evaluate our approach. The experimental results show that the proposed approach
+significantly outperforms the existing state-of-the-art approaches.
+
+
+
+
+
+ + ☆ DiffDefense: Defending against Adversarial Attacks via Diffusion Models + + +
+ This paper presents a novel reconstruction method that leverages Diffusion +Models to protect machine learning classifiers against adversarial attacks, all +without requiring any modifications to the classifiers themselves. The +susceptibility of machine learning models to minor input perturbations renders +them vulnerable to adversarial attacks. While diffusion-based methods are +typically disregarded for adversarial defense due to their slow reverse +process, this paper demonstrates that our proposed method offers robustness +against adversarial threats while preserving clean accuracy, speed, and +plug-and-play compatibility. Code at: +https://github.com/HondamunigePrasannaSilva/DiffDefence. + +
+
+ comment: Paper published at ICIAP23 +
+
+
+
+
+ + ☆ Efficient Adaptive Human-Object Interaction Detection with + Concept-guided Memory + + +
+ Human Object Interaction (HOI) detection aims to localize and infer the +relationships between a human and an object. Arguably, training supervised +models for this task from scratch presents challenges due to the performance +drop over rare classes and the high computational cost and time required to +handle long-tailed distributions of HOIs in complex HOI scenes in realistic +settings. This observation motivates us to design an HOI detector that can be +trained even with long-tailed labeled data and can leverage existing knowledge +from pre-trained models. Inspired by the powerful generalization ability of the +large Vision-Language Models (VLM) on classification and retrieval tasks, we +propose an efficient Adaptive HOI Detector with Concept-guided Memory (ADA-CM). +ADA-CM has two operating modes. The first mode makes it tunable without +learning new parameters in a training-free paradigm. Its second mode +incorporates an instance-aware adapter mechanism that can further efficiently +boost performance if updating a lightweight set of parameters can be afforded. +Our proposed method achieves competitive results with state-of-the-art on the +HICO-DET and V-COCO datasets with much less training time. Code can be found at +https://github.com/ltttpku/ADA-CM. + +
+
+
+
+
+ + ☆ MS-UNet-v2: Adaptive Denoising Method and Training Strategy for Medical + Image Segmentation with Small Training Data + + +
+ Models based on U-like structures have improved the performance of medical
+image segmentation. However, the single-layer decoder structure of U-Net is too
+"thin" to exploit enough information, resulting in large semantic differences
+between the encoder and decoder parts. Things get worse if the amount of
+training data is not sufficiently large, which is common in medical image
+processing tasks, where annotated data are more difficult to obtain than in
+other tasks. Based on this observation, we propose a novel U-Net model named
+MS-UNet for the medical image segmentation task in this study. Instead of the
+single-layer U-Net decoder structure used in Swin-UNet and TransUnet, we
+specifically design a multi-scale nested decoder based on the Swin Transformer
+for U-Net. The proposed multi-scale nested decoder structure allows the feature
+mapping between the decoder and encoder to be semantically closer, thus
+enabling the network to learn more detailed features. In addition, we propose a
+novel edge loss and a plug-and-play fine-tuning Denoising module, which not
+only effectively improve the segmentation performance of MS-UNet, but can also
+be applied to other models individually. Experimental results show that MS-UNet
+effectively improves network performance with more efficient feature learning,
+especially in the extreme case of a small amount of training data, and that the
+proposed Edge loss and Denoising module significantly enhance its segmentation
+performance.
+
+
+
+
+
+ + ☆ Dataset Generation and Bonobo Classification from Weakly Labelled Videos + + +
+ This paper presents a bonobo detection and classification pipeline built from
+commonly used machine learning methods. Such an application is motivated by the
+need to test bonobos in their enclosure using touch screen devices without
+human assistance. This work introduces a newly acquired dataset based on bonobo
+recordings generated semi-automatically. The recordings are weakly labelled and
+fed to a macaque detector in order to spatially detect the individual present
+in the video. Handcrafted features coupled with different classification
+algorithms and deep-learning methods using a ResNet architecture are
+investigated for bonobo identification. Performance is compared in terms of
+classification accuracy on the splits of the database using different data
+separation methods. We demonstrate the importance of data preparation and how a
+wrong data separation can lead to misleadingly good results. Finally, after a
+meaningful separation of the data, the best classification performance is
+obtained using a fine-tuned ResNet model and reaches 75% accuracy.
+
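A standard remedy for the data-separation pitfall highlighted above is to split by individual, so that recordings of the same animal never appear in both the training and the test set. Below is a minimal sketch using scikit-learn's GroupShuffleSplit, intended as an illustration rather than the paper's exact protocol; the clip and individual labels are invented.

```python
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

# Toy data: 8 video clips, each belonging to one of three individuals.
X = np.arange(8).reshape(-1, 1)
y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
individual = np.array(["A", "A", "A", "B", "B", "C", "C", "C"])

# Split so that no individual appears in both train and test, which prevents
# identity leakage from inflating the reported accuracy.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.33, random_state=0)
train_idx, test_idx = next(splitter.split(X, y, groups=individual))
print(sorted(set(individual[train_idx])), sorted(set(individual[test_idx])))
```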
+
+ comment: IntelliSys 2023 paper +
+
+
+
+
+ + ☆ Prompt-based Context- and Domain-aware Pretraining for Vision and + Language Navigation + + +
+ With strong representation capabilities, pretrained vision-language models +are widely used in vision and language navigation (VLN). However, most of them +are trained on web-crawled general-purpose datasets, which incurs a +considerable domain gap when used for VLN tasks. Another challenge for VLN is +how the agent understands the contextual relations between actions on a +trajectory and performs cross-modal alignment sequentially. In this paper, we +propose a novel Prompt-bAsed coNtext- and Domain-Aware (PANDA) pretraining +framework to address these problems. It performs prompting in two stages. In +the domain-aware stage, we apply a low-cost prompt tuning paradigm to learn +soft visual prompts from an in-domain dataset for equipping the pretrained +models with object-level and scene-level cross-modal alignment in VLN tasks. +Furthermore, in the context-aware stage, we design a set of hard context +prompts to capture the sequence-level semantics and instill both out-of-context +and contextual knowledge in the instruction into cross-modal representations. +They enable further tuning of the pretrained models via contrastive learning. +Experimental results on both R2R and REVERIE show the superiority of PANDA +compared to previous state-of-the-art methods. + +
+
+
+
+
+ + ☆ Towards Comparable Knowledge Distillation in Semantic Image Segmentation ECML + + +
+ Knowledge Distillation (KD) is one proposed solution to large model sizes and +slow inference speed in semantic segmentation. In our research we identify 25 +proposed distillation loss terms from 14 publications in the last 4 years. +Unfortunately, a comparison of terms based on published results is often +impossible, because of differences in training configurations. A good +illustration of this problem is the comparison of two publications from 2022. +Using the same models and dataset, Structural and Statistical Texture +Distillation (SSTKD) reports an increase of student mIoU of 4.54 and a final +performance of 29.19, while Adaptive Perspective Distillation (APD) only +improves student performance by 2.06 percentage points, but achieves a final +performance of 39.25. The reason for such extreme differences is often a +suboptimal choice of hyperparameters and a resulting underperformance of the +student model used as reference point. In our work, we reveal problems of +insufficient hyperparameter tuning by showing that distillation improvements of +two widely accepted frameworks, SKD and IFVD, vanish when hyperparameters are +optimized sufficiently. To improve comparability of future research in the +field, we establish a solid baseline for three datasets and two student models +and provide extensive information on hyperparameter tuning. We find that only +two out of eight techniques can compete with our simple baseline on the ADE20K +dataset. + +
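For readers comparing the surveyed loss terms, the common reference point is the vanilla response-based distillation loss: a temperature-scaled KL divergence between teacher and student logits. A minimal sketch follows; the 19-class shape is only an example and none of the 25 surveyed terms is reproduced here.

```python
import torch
import torch.nn.functional as F

def kd_loss(student_logits, teacher_logits, T: float = 4.0):
    """Classic response-based distillation loss (temperature-scaled KL divergence).

    Works for per-pixel logits too, as long as the class dimension is last.
    Scaled by T^2 as is conventional.
    """
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    p_teacher = F.softmax(teacher_logits / T, dim=-1)
    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * (T * T)

student = torch.randn(2, 19)   # e.g. 19 classes as in Cityscapes
teacher = torch.randn(2, 19)
print(kd_loss(student, teacher).item())
```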
+
+ comment: Accepted by the ECML PKDD 2023 workshop track: Simplification, + Compression, Efficiency, and Frugality for Artificial Intelligence (SCEFA). + This preprint has not undergone peer review or any post-submission + improvements or corrections +
+
+
+
+
+ + ☆ Anatomy-informed Data Augmentation for Enhanced Prostate Cancer + Detection MICCAI 2023 + + +
+ Data augmentation (DA) is a key factor in medical image analysis, such as in +prostate cancer (PCa) detection on magnetic resonance images. State-of-the-art +computer-aided diagnosis systems still rely on simplistic spatial +transformations to preserve the pathological label post transformation. +However, such augmentations do not substantially increase the organ as well as +tumor shape variability in the training set, limiting the model's ability to +generalize to unseen cases with more diverse localized soft-tissue +deformations. We propose a new anatomy-informed transformation that leverages +information from adjacent organs to simulate typical physiological deformations +of the prostate and generates unique lesion shapes without altering their +label. Due to its lightweight computational requirements, it can be easily +integrated into common DA frameworks. We demonstrate the effectiveness of our +augmentation on a dataset of 774 biopsy-confirmed examinations, by evaluating a +state-of-the-art method for PCa detection with different augmentation settings. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ☆ Spiking Structured State Space Model for Monaural Speech Enhancement + + +
+ Speech enhancement seeks to extract clean speech from noisy signals. +Traditional deep learning methods face two challenges: efficiently using +information in long speech sequences and high computational costs. To address +these, we introduce the Spiking Structured State Space Model (Spiking-S4). This +approach merges the energy efficiency of Spiking Neural Networks (SNN) with the +long-range sequence modeling capabilities of Structured State Space Models +(S4), offering a compelling solution. Evaluation on the DNS Challenge and +VoiceBank+Demand Datasets confirms that Spiking-S4 rivals existing Artificial +Neural Network (ANN) methods but with fewer computational resources, as +evidenced by reduced parameters and Floating Point Operations (FLOPs). + +
+
+
+
+
+ + ☆ Context-Aware 3D Object Localization from Single Calibrated Images: A + Study of Basketballs + + +
+ Accurately localizing objects in three dimensions (3D) is crucial for various +computer vision applications, such as robotics, autonomous driving, and +augmented reality. This task finds another important application in sports +analytics and, in this work, we present a novel method for 3D basketball +localization from a single calibrated image. Our approach predicts the object's +height in pixels in image space by estimating its projection onto the ground +plane within the image, leveraging the image itself and the object's location +as inputs. The 3D coordinates of the ball are then reconstructed by exploiting +the known projection matrix. Extensive experiments on the public DeepSport +dataset, which provides ground truth annotations for 3D ball location alongside +camera calibration information for each image, demonstrate the effectiveness of +our method, offering substantial accuracy improvements compared to recent work. +Our work opens up new possibilities for enhanced ball tracking and +understanding, advancing computer vision in diverse domains. The source code of +this work is made publicly available at +\url{https://github.com/gabriel-vanzandycke/deepsport}. + +
+
+ comment: 5 pages, 4 figures, MMSports'23, in proceedings of the 6th + International Workshop on Multimedia Content Analysis in Sports (MMSports + '23), October 29, 2023, Ottawa, ON, Canada +
+
+
+
+
+ + ☆ Chasing Consistency in Text-to-3D Generation from a Single Image + + +
+ Text-to-3D generation from a single-view image is a popular but challenging +task in 3D vision. Although numerous methods have been proposed, existing works +still suffer from the inconsistency issues, including 1) semantic +inconsistency, 2) geometric inconsistency, and 3) saturation inconsistency, +resulting in distorted, overfitted, and over-saturated generations. In light of +the above issues, we present Consist3D, a three-stage framework Chasing for +semantic-, geometric-, and saturation-Consistent Text-to-3D generation from a +single image, in which the first two stages aim to learn parameterized +consistency tokens, and the last stage is for optimization. Specifically, the +semantic encoding stage learns a token independent of views and estimations, +promoting semantic consistency and robustness. Meanwhile, the geometric +encoding stage learns another token with comprehensive geometry and +reconstruction constraints under novel-view estimations, reducing overfitting +and encouraging geometric consistency. Finally, the optimization stage benefits +from the semantic and geometric tokens, allowing a low classifier-free guidance +scale and therefore preventing oversaturation. Experimental results demonstrate +that Consist3D produces more consistent, faithful, and photo-realistic 3D +assets compared to previous state-of-the-art methods. Furthermore, Consist3D +also allows background and object editing through text prompts. + +
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ☆ Enhancing Sample Utilization through Sample Adaptive Augmentation in + Semi-Supervised Learning ICCV + + +
+ In semi-supervised learning, unlabeled samples can be utilized through
+augmentation and consistency regularization. However, we observed that certain
+samples, even after undergoing strong augmentation, are still correctly
+classified with high confidence, resulting in a loss close to zero. This
+indicates that these samples have already been learned well and do not provide
+any additional optimization benefit to the model. We refer to these samples as
+``naive samples''. Unfortunately, existing SSL models overlook the
+characteristics of naive samples and simply apply the same learning strategy to
+all samples. To further optimize the SSL model, we emphasize the importance of
+giving attention to naive samples and augmenting them in a more diverse manner.
+Sample adaptive augmentation (SAA) is proposed for this purpose and consists of
+two modules: 1) a sample selection module; 2) a sample augmentation module.
+Specifically, the sample selection module picks out naive samples based on
+historical training information at each epoch, and these naive samples are then
+augmented in a more diverse manner in the sample augmentation module. Thanks to
+the extreme ease of implementation of the above modules, SAA is advantageous
+for being simple and lightweight. We add SAA on top of FixMatch and FlexMatch
+respectively, and experiments demonstrate that SAA can significantly improve
+the models. For example, SAA helped improve the accuracy of FixMatch from
+92.50% to 94.76% and that of FlexMatch from 95.01% to 95.31% on CIFAR-10 with
+40 labels.
+
+
+ comment: Accepted as International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Spatial encoding of BOLD fMRI time series for categorizing static images + across visual datasets: A pilot study on human vision + + +
+ Functional MRI (fMRI) is widely used to examine brain functionality by
+detecting alterations in oxygenated blood flow that arise with brain activity.
+In this study, complexity-specific image categorization across different visual
+datasets is performed using fMRI time series (TS) to understand differences in
+neuronal activities related to vision. The publicly available BOLD5000 dataset
+is used for this purpose, containing fMRI scans acquired while viewing 5254
+images of diverse categories, drawn from three standard computer vision
+datasets: COCO, ImageNet and SUN. To understand vision, it is important to
+study how the brain functions while looking at different images. To achieve
+this, spatial encoding of the fMRI BOLD TS is performed using the classical
+Gramian Angular Field (GAF) and Markov Transition Field (MTF) to obtain 2D BOLD
+TS, representing images of COCO, ImageNet and SUN. For classification,
+individual GAF and MTF features are fed into a regular CNN. Subsequently, a
+parallel CNN model is employed that uses the combined 2D features for
+classifying images across COCO, ImageNet and SUN. The results of the 2D CNN
+models are also compared with 1D LSTM and Bi-LSTM models that utilize the raw
+fMRI BOLD signal for classification. It is seen that the parallel CNN model
+outperforms the other network models with an improvement of 7% for multi-class
+classification. Clinical relevance: the results of this analysis establish a
+baseline for studying how differently the human brain functions while looking
+at images of diverse complexities.
+
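A minimal NumPy sketch of the summation Gramian Angular Field used above to turn a 1D BOLD time series into a 2D image: rescale the series to [-1, 1], map values to angles, and form the pairwise cosine-sum matrix. The MTF encoding and the CNN models are omitted, and the toy sine series is only a stand-in for a real BOLD signal.

```python
import numpy as np

def gramian_angular_field(ts: np.ndarray) -> np.ndarray:
    """Summation GAF: rescale to [-1, 1], set phi = arccos(x), return cos(phi_i + phi_j)."""
    x = np.asarray(ts, dtype=float)
    x = 2 * (x - x.min()) / (x.max() - x.min() + 1e-12) - 1.0  # rescale to [-1, 1]
    x = np.clip(x, -1.0, 1.0)
    phi = np.arccos(x)
    return np.cos(phi[:, None] + phi[None, :])

bold_ts = np.sin(np.linspace(0, 4 * np.pi, 64))  # toy stand-in for a BOLD time series
gaf = gramian_angular_field(bold_ts)
print(gaf.shape)  # (64, 64) image ready for a CNN
```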
+
+ comment: This paper is accepted for publication in IEEE Region 10 Technical + conference, TENCON 2023, to be held in Chiang Mai, Thailand from 31 October - + 3 November, 2023 +
+
+
+
+
+ + ☆ DropPos: Pre-Training Vision Transformers by Reconstructing Dropped + Positions + + +
+ As it is empirically observed that Vision Transformers (ViTs) are quite +insensitive to the order of input tokens, the need for an appropriate +self-supervised pretext task that enhances the location awareness of ViTs is +becoming evident. To address this, we present DropPos, a novel pretext task +designed to reconstruct Dropped Positions. The formulation of DropPos is +simple: we first drop a large random subset of positional embeddings and then +the model classifies the actual position for each non-overlapping patch among +all possible positions solely based on their visual appearance. To avoid +trivial solutions, we increase the difficulty of this task by keeping only a +subset of patches visible. Additionally, considering there may be different +patches with similar visual appearances, we propose position smoothing and +attentive reconstruction strategies to relax this classification problem, since +it is not necessary to reconstruct their exact positions in these cases. +Empirical evaluations of DropPos show strong capabilities. DropPos outperforms +supervised pre-training and achieves competitive results compared with +state-of-the-art self-supervised alternatives on a wide range of downstream +benchmarks. This suggests that explicitly encouraging spatial reasoning +abilities, as DropPos does, indeed contributes to the improved location +awareness of ViTs. The code is publicly available at +https://github.com/Haochen-Wang409/DropPos. + +
+
+
+
+
+ + ☆ Toward High Quality Facial Representation Learning ACM MM 2023 + + +
+ Face analysis tasks have a wide range of applications, but the universal +facial representation has only been explored in a few works. In this paper, we +explore high-performance pre-training methods to boost the face analysis tasks +such as face alignment and face parsing. We propose a self-supervised +pre-training framework, called \textbf{\it Mask Contrastive Face (MCF)}, with +mask image modeling and a contrastive strategy specially adjusted for face +domain tasks. To improve the facial representation quality, we use feature map +of a pre-trained visual backbone as a supervision item and use a partially +pre-trained decoder for mask image modeling. To handle the face identity during +the pre-training stage, we further use random masks to build contrastive +learning pairs. We conduct the pre-training on the LAION-FACE-cropped dataset, +a variants of LAION-FACE 20M, which contains more than 20 million face images +from Internet websites. For efficiency pre-training, we explore our framework +pre-training performance on a small part of LAION-FACE-cropped and verify the +superiority with different pre-training settings. Our model pre-trained with +the full pre-training dataset outperforms the state-of-the-art methods on +multiple downstream tasks. Our model achieves 0.932 NME$_{diag}$ for AFLW-19 +face alignment and 93.96 F1 score for LaPa face parsing. Code is available at +https://github.com/nomewang/MCF. + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ Sparse Federated Training of Object Detection in the Internet of + Vehicles + + +
+ As an essential component of the Intelligent Transportation System (ITS), the
+Internet of Vehicles (IoV) plays a vital role in alleviating traffic issues.
+Object detection is one of the key technologies in the IoV, and it has been
+widely used to provide traffic management services by analyzing timely and
+sensitive vehicle-related information. However, current object detection
+methods are mostly based on centralized deep training, that is, the sensitive
+data obtained by edge devices need to be uploaded to the server, which raises
+privacy concerns. To mitigate such privacy leakage, we first propose a
+federated learning-based framework, where well-trained local models are shared
+with the central server. However, since edge devices usually have limited
+computing power, plus a strict requirement of low latency in IoVs, we further
+propose a sparse training process on edge devices, which can effectively
+lighten the model and ensure its training efficiency on edge devices, thereby
+reducing communication overheads. In addition, due to the diverse computing
+capabilities and dynamic environment, different sparsity rates are applied to
+different edge devices. To further guarantee the performance, we propose
+FedWeg, an improved aggregation scheme based on FedAvg, whose aggregation
+weights are designed according to the inverse ratio of the sparsity rates.
+Experiments on a real-life dataset using YOLO show that the proposed scheme can
+achieve the required object detection rate while saving considerable
+communication costs.
+
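The abstract only states that FedWeg weights the FedAvg aggregation by the inverse ratio of the clients' sparsity rates, so the following is purely an illustrative sketch: each client's parameters are weighted proportionally to 1/sparsity and the weights are normalized. Reading "sparsity rate" as the pruned fraction is an assumption; the paper's exact definition may differ.

```python
import torch

def aggregate_inverse_sparsity(client_states, sparsity_rates):
    """Average client model states with weights proportional to 1/sparsity.

    client_states : list of state_dicts (same keys and shapes) from the clients.
    sparsity_rates: list of floats in (0, 1], assumed to be the pruned fraction.
    """
    weights = torch.tensor([1.0 / s for s in sparsity_rates])
    weights = weights / weights.sum()
    agg = {}
    for key in client_states[0]:
        stacked = torch.stack([sd[key].float() for sd in client_states])
        agg[key] = (weights.view(-1, *([1] * (stacked.dim() - 1))) * stacked).sum(0)
    return agg

# Toy usage with two "clients" holding a single-parameter model each.
c1 = {"w": torch.ones(2, 2)}
c2 = {"w": torch.zeros(2, 2)}
print(aggregate_inverse_sparsity([c1, c2], sparsity_rates=[0.2, 0.8]))
```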
+
+
+
+
+ + ☆ Region Generation and Assessment Network for Occluded Person + Re-Identification + + +
+ Person Re-identification (ReID) plays a more and more crucial role in recent +years with a wide range of applications. Existing ReID methods are suffering +from the challenges of misalignment and occlusions, which degrade the +performance dramatically. Most methods tackle such challenges by utilizing +external tools to locate body parts or exploiting matching strategies. +Nevertheless, the inevitable domain gap between the datasets utilized for +external tools and the ReID datasets and the complicated matching process make +these methods unreliable and sensitive to noises. In this paper, we propose a +Region Generation and Assessment Network (RGANet) to effectively and +efficiently detect the human body regions and highlight the important regions. +In the proposed RGANet, we first devise a Region Generation Module (RGM) which +utilizes the pre-trained CLIP to locate the human body regions using semantic +prototypes extracted from text descriptions. Learnable prompt is designed to +eliminate domain gap between CLIP datasets and ReID datasets. Then, to measure +the importance of each generated region, we introduce a Region Assessment +Module (RAM) that assigns confidence scores to different regions and reduces +the negative impact of the occlusion regions by lower scores. The RAM consists +of a discrimination-aware indicator and an invariance-aware indicator, where +the former indicates the capability to distinguish from different identities +and the latter represents consistency among the images of the same class of +human body regions. Extensive experimental results for six widely-used +benchmarks including three tasks (occluded, partial, and holistic) demonstrate +the superiority of RGANet against state-of-the-art methods. + +
+
+
+
+
+ + ☆ Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance + Fields using Geometry-Guided Text-to-Image Diffusion Model + + +
+ Recent advances in diffusion models such as ControlNet have enabled +geometrically controllable, high-fidelity text-to-image generation. However, +none of them addresses the question of adding such controllability to +text-to-3D generation. In response, we propose Text2Control3D, a controllable +text-to-3D avatar generation method whose facial expression is controllable +given a monocular video casually captured with hand-held camera. Our main +strategy is to construct the 3D avatar in Neural Radiance Fields (NeRF) +optimized with a set of controlled viewpoint-aware images that we generate from +ControlNet, whose condition input is the depth map extracted from the input +video. When generating the viewpoint-aware images, we utilize cross-reference +attention to inject well-controlled, referential facial expression and +appearance via cross attention. We also conduct low-pass filtering of Gaussian +latent of the diffusion model in order to ameliorate the viewpoint-agnostic +texture problem we observed from our empirical analysis, where the +viewpoint-aware images contain identical textures on identical pixel positions +that are incomprehensible in 3D. Finally, to train NeRF with the images that +are viewpoint-aware yet are not strictly consistent in geometry, our approach +considers per-image geometric variation as a view of deformation from a shared +3D canonical space. Consequently, we construct the 3D avatar in a canonical +space of deformable NeRF by learning a set of per-image deformation via +deformation field table. We demonstrate the empirical results and discuss the +effectiveness of our method. + +
+
+ comment: Project page: https://text2control3d.github.io/ +
+
+
+
+
+ + ☆ Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation + + +
+ Inspired by the remarkable success of Latent Diffusion Models (LDMs) for +image synthesis, we study LDM for text-to-video generation, which is a +formidable challenge due to the computational and memory constraints during +both model training and inference. A single LDM is usually only capable of +generating a very limited number of video frames. Some existing works focus on +separate prediction models for generating more video frames, which suffer from +additional training cost and frame-level jittering, however. In this paper, we +propose a framework called "Reuse and Diffuse" dubbed $\textit{VidRD}$ to +produce more frames following the frames already generated by an LDM. +Conditioned on an initial video clip with a small number of frames, additional +frames are iteratively generated by reusing the original latent features and +following the previous diffusion process. Besides, for the autoencoder used for +translation between pixel space and latent space, we inject temporal layers +into its decoder and fine-tune these layers for higher temporal consistency. We +also propose a set of strategies for composing video-text data that involve +diverse content from multiple existing datasets including video datasets for +action recognition and image-text datasets. Extensive experiments show that our +method achieves good results in both quantitative and qualitative evaluations. +Our project page is available +$\href{https://anonymous0x233.github.io/ReuseAndDiffuse/}{here}$. + +
+
+
+
+
+ + ☆ Trash to Treasure: Low-Light Object Detection via + Decomposition-and-Aggregation + + +
+ Object detection in low-light scenarios has attracted much attention in the
+past few years. A mainstream and representative scheme introduces enhancers as
+pre-processing for regular detectors. However, because of the disparity in task
+objectives between the enhancer and the detector, this paradigm cannot reach
+its full potential. In this work, we try to unlock the potential of the
+enhancer + detector combination. Different from existing works, we extend
+illumination-based enhancers (either newly designed or existing ones) into a
+scene decomposition module, whose removed illumination is exploited as
+auxiliary information in the detector for extracting detection-friendly
+features. A semantic aggregation module is further established for integrating
+multi-scale scene-related semantic information in the context space. In effect,
+our scheme transforms the "trash" (i.e., the illumination ignored by the
+detector) into "treasure" for the detector. Extensive experiments demonstrate
+our superiority over other state-of-the-art methods. The code will be released
+upon acceptance.
+
+
+
+
+
+
+ + ☆ Zero-Shot Scene Graph Generation via Triplet Calibration and Reduction + + +
+ Scene Graph Generation (SGG) plays a pivotal role in downstream +vision-language tasks. Existing SGG methods typically suffer from poor +compositional generalizations on unseen triplets. They are generally trained on +incompletely annotated scene graphs that contain dominant triplets and tend to +bias toward these seen triplets during inference. To address this issue, we +propose a Triplet Calibration and Reduction (T-CAR) framework in this paper. In +our framework, a triplet calibration loss is first presented to regularize the +representations of diverse triplets and to simultaneously excavate the unseen +triplets in incompletely annotated training scene graphs. Moreover, the unseen +space of scene graphs is usually several times larger than the seen space since +it contains a huge number of unrealistic compositions. Thus, we propose an +unseen space reduction loss to shift the attention of excavation to reasonable +unseen compositions to facilitate the model training. Finally, we propose a +contextual encoder to improve the compositional generalizations of unseen +triplets by explicitly modeling the relative spatial relations between subjects +and objects. Extensive experiments show that our approach achieves consistent +improvements for zero-shot SGG over state-of-the-art methods. The code is +available at https://github.com/jkli1998/T-CAR. + +
+
+ comment: Accepted by TOMM 2023
+
+
+
+
+
+ + ☆ YOLO series target detection algorithms for underwater environments + + +
+ The You Only Look Once (YOLO) algorithm is a representative target detection
+algorithm that emerged in 2016. It is known for its balance of computing speed
+and accuracy, and now plays an important role in various fields of human
+production and life. However, there are still many limitations in the
+application of the YOLO algorithm in underwater environments due to problems
+such as dim light and turbid water. Given the limited land area resources, the
+ocean holds great potential for future human development. In this paper,
+starting from the actual needs of marine engineering applications, and taking
+underwater structural health monitoring (SHM) and underwater biological
+detection as examples, we propose improved methods for the application of
+underwater YOLO algorithms and point out the problems that still exist.
+
+
+
+
+
+
+ + ☆ Feature Enhancer Segmentation Network (FES-Net) for Vessel Segmentation + + +
+ Diseases such as diabetic retinopathy and age-related macular degeneration +pose a significant risk to vision, highlighting the importance of precise +segmentation of retinal vessels for the tracking and diagnosis of progression. +However, existing vessel segmentation methods that heavily rely on +encoder-decoder structures struggle to capture contextual information about +retinal vessel configurations, leading to challenges in reconciling semantic +disparities between encoder and decoder features. To address this, we propose a +novel feature enhancement segmentation network (FES-Net) that achieves accurate +pixel-wise segmentation without requiring additional image enhancement steps. +FES-Net directly processes the input image and utilizes four prompt +convolutional blocks (PCBs) during downsampling, complemented by a shallow +upsampling approach to generate a binary mask for each class. We evaluate the +performance of FES-Net on four publicly available state-of-the-art datasets: +DRIVE, STARE, CHASE, and HRF. The evaluation results clearly demonstrate the +superior performance of FES-Net compared to other competitive approaches +documented in the existing literature. + +
+
+
+
+
+ + ☆ A Robust Negative Learning Approach to Partial Domain Adaptation Using + Source Prototypes + + +
+ This work proposes a robust Partial Domain Adaptation (PDA) framework that +mitigates the negative transfer problem by incorporating a robust +target-supervision strategy. It leverages ensemble learning and includes +diverse, complementary label feedback, alleviating the effect of incorrect +feedback and promoting pseudo-label refinement. Rather than relying exclusively +on first-order moments for distribution alignment, our approach offers explicit +objectives to optimize intra-class compactness and inter-class separation with +the inferred source prototypes and highly-confident target samples in a +domain-invariant fashion. Notably, we ensure source data privacy by eliminating +the need to access the source data during the adaptation phase through a priori +inference of source prototypes. We conducted a series of comprehensive +experiments, including an ablation analysis, covering a range of partial domain +adaptation tasks. Comprehensive evaluations on benchmark datasets corroborate +our framework's enhanced robustness and generalization, demonstrating its +superiority over existing state-of-the-art PDA approaches. + +
+
+
+
+
+ + ☆ Efficient Single Object Detection on Image Patches with Early Exit + Enhanced High-Precision CNNs + + +
+ This paper proposes a novel approach for detecting objects using mobile +robots in the context of the RoboCup Standard Platform League, with a primary +focus on detecting the ball. The challenge lies in detecting a dynamic object +in varying lighting conditions and blurred images caused by fast movements. To +address this challenge, the paper presents a convolutional neural network +architecture designed specifically for computationally constrained robotic +platforms. The proposed CNN is trained to achieve high precision classification +of single objects in image patches and to determine their precise spatial +positions. The paper further integrates Early Exits into the existing +high-precision CNN architecture to reduce the computational cost of easily +rejectable cases in the background class. The training process involves a +composite loss function based on confidence and positional losses with dynamic +weighting and data augmentation. The proposed approach achieves a precision of +100% on the validation dataset and a recall of almost 87%, while maintaining an +execution time of around 170 $\mu$s per hypotheses. By combining the proposed +approach with an Early Exit, a runtime optimization of more than 28%, on +average, can be achieved compared to the original CNN. Overall, this paper +provides an efficient solution for an enhanced detection of objects, especially +the ball, in computationally constrained robotic platforms. + +
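+ A toy early-exit patch classifier in the spirit described above (hypothetical
+layer sizes, threshold and two-class setup, not the paper's network): a cheap
+head after the first stage rejects confidently background patches before the
+more expensive stage runs.
+```python
+import torch
+import torch.nn as nn
+
+class EarlyExitPatchClassifier(nn.Module):
+    """Toy single-patch classifier with an early exit after the first stage."""
+    def __init__(self, exit_threshold: float = 0.9):
+        super().__init__()
+        self.exit_threshold = exit_threshold
+        self.stage1 = nn.Sequential(nn.Conv2d(1, 8, 3, padding=1), nn.ReLU(),
+                                    nn.MaxPool2d(2))
+        self.early_head = nn.Sequential(nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+                                        nn.Linear(8, 2))            # background / ball
+        self.stage2 = nn.Sequential(nn.Conv2d(8, 16, 3, padding=1), nn.ReLU(),
+                                    nn.AdaptiveAvgPool2d(1), nn.Flatten())
+        self.final_head = nn.Linear(16, 2 + 2)                      # class logits + (x, y)
+
+    def forward(self, patch):
+        # patch: (1, 1, H, W) -- a single grayscale image patch
+        feat = self.stage1(patch)
+        early_logits = self.early_head(feat)
+        p_background = torch.softmax(early_logits, dim=1)[0, 0]
+        if p_background > self.exit_threshold:                      # cheap rejection path
+            return early_logits, None
+        out = self.final_head(self.stage2(feat))
+        return out[:, :2], out[:, 2:]                               # class logits, position
+
+model = EarlyExitPatchClassifier()
+logits, position = model(torch.randn(1, 1, 32, 32))
+```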
+
+
+
+
+ + ☆ BroadCAM: Outcome-agnostic Class Activation Mapping for Small-scale + Weakly Supervised Applications + + +
+ Class activation mapping (CAM), a visualization technique for interpreting
+deep learning models, is now commonly used for weakly supervised semantic
+segmentation (WSSS) and object localization (WSOL). It is a weighted
+aggregation of the feature maps that activates the ones with high class
+relevance. Current CAM methods compute the weights from training outcomes, such
+as predicted scores (forward information), gradients (backward information),
+etc. However, with small-scale data, unstable training may lead to less
+effective model outcomes and unreliable weights, finally resulting in incorrect
+activations and noisy CAM seeds. In this paper, we propose an outcome-agnostic
+CAM approach, called BroadCAM, for small-scale weakly supervised applications.
+Since the broad learning system (BLS) is independent of the model learning,
+BroadCAM can avoid having its weights affected by unreliable model outcomes
+when data are small-scale. Evaluated on VOC2012 (natural images) and BCSS-WSSS
+(medical images) for WSSS and on OpenImages30k for WSOL, BroadCAM demonstrates
+superior performance over existing CAM methods with small-scale data (less than
+5\%) across different CNN architectures. It also achieves SOTA performance with
+large-scale training data. Extensive qualitative comparisons are conducted to
+demonstrate how BroadCAM activates the feature maps with high class relevance
+and generates reliable CAMs with small-scale training data.
+
+
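+ For context, a generic CAM is simply a weighted aggregation of feature maps;
+in the sketch below the per-channel weights are a placeholder for whatever
+source supplies them (a classifier head in classic CAM, a broad learning system
+in BroadCAM, which is not reproduced here).
+```python
+import numpy as np
+
+def class_activation_map(feature_maps: np.ndarray, weights: np.ndarray) -> np.ndarray:
+    """Generic CAM: weighted sum of feature maps, ReLU, then normalization.
+
+    feature_maps: (C, H, W) activations from the last conv layer.
+    weights:      (C,) per-channel class-relevance weights.
+    """
+    cam = np.tensordot(weights, feature_maps, axes=([0], [0]))   # (H, W)
+    cam = np.maximum(cam, 0.0)                                   # keep positive evidence
+    return cam / (cam.max() + 1e-8)                              # scale to [0, 1]
+
+fmap = np.random.rand(512, 14, 14).astype(np.float32)
+w = np.random.rand(512).astype(np.float32)
+print(class_activation_map(fmap, w).shape)                       # (14, 14)
+```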
+
+
+
+
+ + ☆ Dynamic Frame Interpolation in Wavelet Domain + + +
+ Video frame interpolation is an important low-level vision task, which can +increase frame rate for more fluent visual experience. Existing methods have +achieved great success by employing advanced motion models and synthesis +networks. However, the spatial redundancy when synthesizing the target frame +has not been fully explored, that can result in lots of inefficient +computation. On the other hand, the computation compression degree in frame +interpolation is highly dependent on both texture distribution and scene +motion, which demands to understand the spatial-temporal information of each +input frame pair for a better compression degree selection. In this work, we +propose a novel two-stage frame interpolation framework termed WaveletVFI to +address above problems. It first estimates intermediate optical flow with a +lightweight motion perception network, and then a wavelet synthesis network +uses flow aligned context features to predict multi-scale wavelet coefficients +with sparse convolution for efficient target frame reconstruction, where the +sparse valid masks that control computation in each scale are determined by a +crucial threshold ratio. Instead of setting a fixed value like previous +methods, we find that embedding a classifier in the motion perception network +to learn a dynamic threshold for each sample can achieve more computation +reduction with almost no loss of accuracy. On the common high resolution and +animation frame interpolation benchmarks, proposed WaveletVFI can reduce +computation up to 40% while maintaining similar accuracy, making it perform +more efficiently against other state-of-the-arts. Code is available at +https://github.com/ltkong218/WaveletVFI. + +
+
+ comment: Accepted by IEEE TIP +
+
+
+
+
+ + ☆ Towards Robust Natural-Looking Mammography Lesion Synthesis on + Ipsilateral Dual-Views Breast Cancer Analysis + + +
+ In recent years, many mammographic image analysis methods have been
+introduced to improve cancer classification tasks. Two major issues in
+mammogram classification are leveraging multi-view mammographic information and
+handling class imbalance. For the first problem, many multi-view methods have
+been released that concatenate features of two or more views for the training
+and inference stages. However, most existing multi-view methods are not
+explainable in terms of feature fusion and treat all views equally for
+diagnosis. Our work proposes a simple but novel method for enhancing the
+examined view (main view) by leveraging low-level feature information from the
+auxiliary view (ipsilateral view) before learning the high-level features that
+contain the cancerous cues. For the second issue, we also propose a simple but
+novel malignant mammogram synthesis framework for upsampling minority-class
+samples. Our easy-to-implement, training-free framework eliminates the current
+limitations of the CutMix algorithm, namely unreliable synthesized images with
+randomly pasted patches, hard-contour problems, and domain shift problems. Our
+results on the VinDr-Mammo and CMMD datasets show the effectiveness of our two
+new frameworks for both multi-view training and synthesizing mammographic
+images, outperforming previous conventional methods in our experimental
+settings.
+
+
+
+
+
+
+ + ☆ Stroke-based Neural Painting and Stylization with Dynamically Predicted + Painting Region ACM MM 2023 + + +
+ Stroke-based rendering aims to recreate an image with a set of strokes. Most +existing methods render complex images using an uniform-block-dividing +strategy, which leads to boundary inconsistency artifacts. To solve the +problem, we propose Compositional Neural Painter, a novel stroke-based +rendering framework which dynamically predicts the next painting region based +on the current canvas, instead of dividing the image plane uniformly into +painting regions. We start from an empty canvas and divide the painting process +into several steps. At each step, a compositor network trained with a phasic RL +strategy first predicts the next painting region, then a painter network +trained with a WGAN discriminator predicts stroke parameters, and a stroke +renderer paints the strokes onto the painting region of the current canvas. +Moreover, we extend our method to stroke-based style transfer with a novel +differentiable distance transform loss, which helps preserve the structure of +the input image during stroke-based stylization. Extensive experiments show our +model outperforms the existing models in both stroke-based neural painting and +stroke-based stylization. Code is available at +https://github.com/sjtuplayer/Compositional_Neural_Painter + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ Instance Segmentation of Dislocations in TEM Images + + +
+ Quantitative Transmission Electron Microscopy (TEM) during in-situ straining +experiment is able to reveal the motion of dislocations -- linear defects in +the crystal lattice of metals. In the domain of materials science, the +knowledge about the location and movement of dislocations is important for +creating novel materials with superior properties. A long-standing problem, +however, is to identify the position and extract the shape of dislocations, +which would ultimately help to create a digital twin of such materials. In this +work, we quantitatively compare state-of-the-art instance segmentation methods, +including Mask R-CNN and YOLOv8. The dislocation masks as the results of the +instance segmentation are converted to mathematical lines, enabling +quantitative analysis of dislocation length and geometry -- important +information for the domain scientist, which we then propose to include as a +novel length-aware quality metric for estimating the network performance. Our +segmentation pipeline shows a high accuracy suitable for all domain-specific, +further post-processing. Additionally, our physics-based metric turns out to +perform much more consistently than typically used pixel-wise metrics. + +
+
+
+
+
+ + ☆ Evaluating Deep Learning-based Melanoma Classification using + Immunohistochemistry and Routine Histology: A Three Center Study + + +
+ Pathologists routinely use immunohistochemical (IHC)-stained tissue slides +against MelanA in addition to hematoxylin and eosin (H&E)-stained slides to +improve their accuracy in diagnosing melanomas. The use of diagnostic Deep +Learning (DL)-based support systems for automated examination of tissue +morphology and cellular composition has been well studied in standard +H&E-stained tissue slides. In contrast, there are few studies that analyze IHC +slides using DL. Therefore, we investigated the separate and joint performance +of ResNets trained on MelanA and corresponding H&E-stained slides. The MelanA +classifier achieved an area under receiver operating characteristics curve +(AUROC) of 0.82 and 0.74 on out of distribution (OOD)-datasets, similar to the +H&E-based benchmark classification of 0.81 and 0.75, respectively. A combined +classifier using MelanA and H&E achieved AUROCs of 0.85 and 0.81 on the OOD +datasets. DL MelanA-based assistance systems show the same performance as the +benchmark H&E classification and may be improved by multi stain classification +to assist pathologists in their clinical routine. + +
+
+
+
+
+ + ☆ SAM3D: Segment Anything Model in Volumetric Medical Images + + +
+ Image segmentation is a critical task in medical image analysis, providing
+valuable information that helps to make an accurate diagnosis. In recent years,
+deep learning-based automatic image segmentation methods have achieved
+outstanding results in medical images. In this paper, inspired by the Segment
+Anything Model (SAM), a foundation model that has received much attention for
+its impressive accuracy and powerful generalization ability in 2D still image
+segmentation, we propose SAM3D, which targets 3D volumetric medical images and
+utilizes the pre-trained features from the SAM encoder to capture meaningful
+representations of input images. Different from other existing SAM-based
+volumetric segmentation methods that perform segmentation by dividing the
+volume into a set of 2D slices, our model takes the whole 3D volume as input
+and processes it simply and effectively, avoiding the need to train a
+significant number of parameters. Extensive experiments are conducted on
+multiple medical image datasets to demonstrate that our network attains
+competitive results compared with other state-of-the-art methods in 3D medical
+segmentation tasks while being significantly more efficient in terms of
+parameters.
+
+
+
+
+
+
+ + ☆ DetermiNet: A Large-Scale Diagnostic Dataset for Complex + Visually-Grounded Referencing using Determiners + + +
+ State-of-the-art visual grounding models can achieve high detection accuracy, +but they are not designed to distinguish between all objects versus only +certain objects of interest. In natural language, in order to specify a +particular object or set of objects of interest, humans use determiners such as +"my", "either" and "those". Determiners, as an important word class, are a type +of schema in natural language about the reference or quantity of the noun. +Existing grounded referencing datasets place much less emphasis on determiners, +compared to other word classes such as nouns, verbs and adjectives. This makes +it difficult to develop models that understand the full variety and complexity +of object referencing. Thus, we have developed and released the DetermiNet +dataset , which comprises 250,000 synthetically generated images and captions +based on 25 determiners. The task is to predict bounding boxes to identify +objects of interest, constrained by the semantics of the given determiner. We +find that current state-of-the-art visual grounding models do not perform well +on the dataset, highlighting the limitations of existing models on reference +and quantification tasks. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ TSI-Net: A Timing Sequence Image Segmentation Network for Intracranial + Artery Segmentation in Digital Subtraction Angiography + + +
+ Cerebrovascular disease is one of the major diseases facing the world today.
+Automatic segmentation of the intracranial artery (IA) in digital subtraction
+angiography (DSA) sequences is an important step in the diagnosis of
+vascular-related diseases and in guiding neurointerventional procedures.
+However, according to the imaging principle of DSA technology, a single image
+can only show the part of the IA filled with contrast medium. Therefore, 2D DSA
+segmentation methods are unable to capture the complete IA information needed
+for the treatment of cerebrovascular diseases. We propose a U-shaped timing
+sequence image segmentation network, called TSI-Net, which incorporates a
+bi-directional ConvGRU module (BCM) in the encoder. The network can take
+variable-length DSA sequences as input, retain past and future information, and
+segment them into 2D images. In addition, we introduce a sensitive detail
+branch (SDB) at the end for supervising fine vessels. Evaluated on the DSA
+sequence dataset DIAS, the method performs significantly better than
+state-of-the-art networks from recent years. In particular, it achieves a Sen
+evaluation metric of 0.797, which is a 3% improvement over other methods.
+
+
+
+
+
+
+ + ☆ Temporal Collection and Distribution for Referring Video Object + Segmentation ICCV 2023 + + +
+ Referring video object segmentation aims to segment a referent throughout a +video sequence according to a natural language expression. It requires aligning +the natural language expression with the objects' motions and their dynamic +associations at the global video level but segmenting objects at the frame +level. To achieve this goal, we propose to simultaneously maintain a global +referent token and a sequence of object queries, where the former is +responsible for capturing video-level referent according to the language +expression, while the latter serves to better locate and segment objects with +each frame. Furthermore, to explicitly capture object motions and +spatial-temporal cross-modal reasoning over objects, we propose a novel +temporal collection-distribution mechanism for interacting between the global +referent token and object queries. Specifically, the temporal collection +mechanism collects global information for the referent token from object +queries to the temporal motions to the language expression. In turn, the +temporal distribution first distributes the referent token to the referent +sequence across all frames and then performs efficient cross-frame reasoning +between the referent sequence and object queries in every frame. Experimental +results show that our method outperforms state-of-the-art methods on all +benchmarks consistently and significantly. + +
+
+ comment: Accepted by ICCV 2023; Project page: + https://toneyaya.github.io/tempcd/ +
+
+
+
+
+ + ☆ Perceptual Quality Assessment of 360$^\circ$ Images Based on Generative + Scanpath Representation + + +
+ Despite substantial efforts dedicated to the design of heuristic models for +omnidirectional (i.e., 360$^\circ$) image quality assessment (OIQA), a +conspicuous gap remains due to the lack of consideration for the diversity of +viewing behaviors that leads to the varying perceptual quality of 360$^\circ$ +images. Two critical aspects underline this oversight: the neglect of viewing +conditions that significantly sway user gaze patterns and the overreliance on a +single viewport sequence from the 360$^\circ$ image for quality inference. To +address these issues, we introduce a unique generative scanpath representation +(GSR) for effective quality inference of 360$^\circ$ images, which aggregates +varied perceptual experiences of multi-hypothesis users under a predefined +viewing condition. More specifically, given a viewing condition characterized +by the starting point of viewing and exploration time, a set of scanpaths +consisting of dynamic visual fixations can be produced using an apt scanpath +generator. Following this vein, we use the scanpaths to convert the 360$^\circ$ +image into the unique GSR, which provides a global overview of gazed-focused +contents derived from scanpaths. As such, the quality inference of the +360$^\circ$ image is swiftly transformed to that of GSR. We then propose an +efficient OIQA computational framework by learning the quality maps of GSR. +Comprehensive experimental results validate that the predictions of the +proposed framework are highly consistent with human perception in the +spatiotemporal domain, especially in the challenging context of locally +distorted 360$^\circ$ images under varied viewing conditions. The code will be +released at https://github.com/xiangjieSui/GSR + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Fast FixMatch: Faster Semi-Supervised Learning with Curriculum Batch + Size + + +
+ Advances in Semi-Supervised Learning (SSL) have almost entirely closed the
+gap between SSL and Supervised Learning at a fraction of the number of labels.
+However, recent performance improvements have often come \textit{at the cost of
+significantly increased training computation}. To address this, we propose
+Curriculum Batch Size (CBS), \textit{an unlabeled batch size curriculum which
+exploits the natural training dynamics of deep neural networks.} A small
+unlabeled batch size is used at the beginning of training and is gradually
+increased towards the end of training. A fixed curriculum is used regardless of
+dataset, model or number of epochs, and reduced training computation is
+demonstrated in all settings. We apply CBS, strong labeled augmentation, and
+Curriculum Pseudo Labeling (CPL) \citep{FlexMatch} to FixMatch \citep{FixMatch}
+and term the new SSL algorithm Fast FixMatch. We perform an ablation study to
+show that strong labeled augmentation and/or CPL do not significantly reduce
+training computation on their own, but, in synergy with CBS, they achieve
+optimal performance. Fast FixMatch also achieves substantially higher data
+utilization compared to previous state-of-the-art. Fast FixMatch achieves
+between $2.1\times$ - $3.4\times$ reduced training computation on CIFAR-10 with
+all but 40, 250 and 4000 labels removed, compared to vanilla FixMatch, while
+attaining the same cited state-of-the-art error rate \citep{FixMatch}. Similar
+results are achieved for CIFAR-100, SVHN and STL-10. Finally, Fast FixMatch
+achieves between $2.6\times$ - $3.3\times$ reduced training computation in
+federated SSL tasks and online/streaming learning SSL tasks, which further
+demonstrates the generalizability of Fast FixMatch to different scenarios and
+tasks.
+
+
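+ A minimal sketch of an unlabeled batch-size curriculum (the linear ramp and
+the batch-size bounds are assumptions for illustration; the exact schedule used
+by Fast FixMatch may differ):
+```python
+def curriculum_batch_size(step: int, total_steps: int,
+                          min_bs: int = 16, max_bs: int = 448) -> int:
+    """Unlabeled batch size that ramps linearly from min_bs to max_bs."""
+    frac = min(step / max(total_steps - 1, 1), 1.0)
+    return int(round(min_bs + frac * (max_bs - min_bs)))
+
+# print a few points of the schedule over a hypothetical 1M-step run
+for s in (0, 250_000, 500_000, 999_999):
+    print(s, curriculum_batch_size(s, total_steps=1_000_000))
+```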
+
+
+
+
+ + ☆ Cross-Image Context Matters for Bongard Problems + + +
+ Current machine learning methods struggle to solve Bongard problems, which +are a type of IQ test that requires deriving an abstract "concept" from a set +of positive and negative "support" images, and then classifying whether or not +a new query image depicts the key concept. On Bongard-HOI, a benchmark for +natural-image Bongard problems, existing methods have only reached 66% accuracy +(where chance is 50%). Low accuracy is often attributed to neural nets' lack of +ability to find human-like symbolic rules. In this work, we point out that many +existing methods are forfeiting accuracy due to a much simpler problem: they do +not incorporate information contained in the support set as a whole, and rely +instead on information extracted from individual supports. This is a critical +issue, because unlike in few-shot learning tasks concerning object +classification, the "key concept" in a typical Bongard problem can only be +distinguished using multiple positives and multiple negatives. We explore a +variety of simple methods to take this cross-image context into account, and +demonstrate substantial gains over prior methods, leading to new +state-of-the-art performance on Bongard-LOGO (75.3%) and Bongard-HOI (72.45%) +and strong performance on the original Bongard problem set (60.84%). + +
+
+ comment: Main paper: 7 pages, Appendix: 10 pages, 30 figures. Code: + https://github.com/nraghuraman/bongard-context +
+
+
+
+
+ + ☆ Autoregressive Omni-Aware Outpainting for Open-Vocabulary 360-Degree + Image Generation + + +
+ A 360-degree (omni-directional) image provides an all-encompassing spherical
+view of a scene. Recently, there has been increasing interest in synthesising
+360-degree images from conventional narrow field of view (NFoV) images captured
+by digital cameras and smartphones, to provide immersive experiences in various
+scenarios such as virtual reality. Yet, existing methods typically fall short
+in synthesizing intricate visual details or ensuring that the generated images
+align consistently with user-provided prompts. In this study, an autoregressive
+omni-aware generative network (AOG-Net) is proposed for 360-degree image
+generation by out-painting an incomplete 360-degree image progressively with
+NFoV and text guidance, jointly or individually. This autoregressive scheme not
+only allows for deriving finer-grained and text-consistent patterns by
+dynamically generating and adjusting the process but also offers users greater
+flexibility to edit their conditions throughout the generation process. A
+global-local conditioning mechanism is devised to comprehensively formulate the
+outpainting guidance in each autoregressive step. Text guidance, omni-visual
+cues, NFoV inputs and omni-geometry are encoded and further formulated with
+cross-attention based transformers into a global stream and a local stream that
+condition a generative backbone model. As AOG-Net is compatible with leveraging
+large-scale models for the conditional encoder and the generative prior, it
+enables the generation to use extensive open-vocabulary text guidance.
+Comprehensive experiments on two commonly used 360-degree image datasets for
+both indoor and outdoor settings demonstrate the state-of-the-art performance
+of our proposed method. Our code will be made publicly available.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ SyncDreamer: Generating Multiview-consistent Images from a Single-view + Image + + +
+ In this paper, we present a novel diffusion model called SyncDreamer that
+generates multiview-consistent images from a single-view image. Using
+pretrained large-scale 2D diffusion models, the recent work Zero123
+demonstrates the ability to generate plausible novel views from a single-view
+image of an object. However, maintaining consistency in geometry and colors for
+the generated images remains a challenge. To address this issue, we propose a
+synchronized multiview diffusion model that models the joint probability
+distribution of multiview images, enabling the generation of
+multiview-consistent images in a single reverse process. SyncDreamer
+synchronizes the intermediate states of all the generated images at every step
+of the reverse process through a 3D-aware feature attention mechanism that
+correlates the corresponding features across different views. Experiments show
+that SyncDreamer generates images with high consistency across different views,
+thus making it well-suited for various 3D generation tasks such as novel-view
+synthesis, text-to-3D, and image-to-3D.
+
+
+
+ comment: Project page: https://liuyuan-pal.github.io/SyncDreamer/ +
+
+
+
+
+ + ☆ Multi-Modality Guidance Network For Missing Modality Inference + + +
+ Multimodal models have gained significant success in recent years. Standard +multimodal approaches often assume unchanged modalities from training stage to +inference stage. In practice, however, many scenarios fail to satisfy such +assumptions with missing modalities during inference, leading to limitations on +where multimodal models can be applied. While existing methods mitigate the +problem through reconstructing the missing modalities, it increases unnecessary +computational cost, which could be just as critical, especially for large, +deployed systems. To solve the problem from both sides, we propose a novel +guidance network that promotes knowledge sharing during training, taking +advantage of the multimodal representations to train better single-modality +models for inference. Real-life experiment in violence detection shows that our +proposed framework trains single-modality models that significantly outperform +its traditionally trained counterparts while maintaining the same inference +cost. + +
+
+
+
+
+ + ☆ Underwater Image Enhancement by Transformer-based Diffusion Model with + Non-uniform Sampling for Skip Strategy + + +
+ In this paper, we present an approach to image enhancement with a diffusion
+model in underwater scenes. Our method adapts conditional denoising diffusion
+probabilistic models to generate the corresponding enhanced images by using the
+underwater images and Gaussian noise as the inputs. Additionally, in order to
+improve the efficiency of the reverse process in the diffusion model, we adopt
+two different strategies. First, we propose a lightweight transformer-based
+denoising network, which effectively reduces the time of each network forward
+pass per iteration. Second, we introduce a skip sampling strategy to reduce the
+number of iterations. Building on the skip sampling strategy, we propose two
+different non-uniform sampling methods for the sequence of time steps, namely
+piecewise sampling and searching with an evolutionary algorithm. Both are
+effective and can further improve performance with the same number of steps
+compared to the previous uniform sampling. Finally, we conduct a comparative
+evaluation on widely used underwater enhancement datasets between recent
+state-of-the-art methods and the proposed approach. The experimental results
+show that our approach achieves both competitive performance and high
+efficiency. Our code is available at
+\href{https://github.com/piggy2009/DM_underwater}{\color{blue}{https://github.com/piggy2009/DM\_underwater}}.
+
+
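+ As an illustration of non-uniform skip sampling, the sketch below builds a
+piecewise timestep schedule that spends more of a fixed sampling budget on the
+noisier part of the chain (the split point and fractions are assumptions, not
+the paper's tuned schedule):
+```python
+import numpy as np
+
+def piecewise_skip_schedule(total_steps: int = 1000, num_steps: int = 50,
+                            late_fraction: float = 0.7) -> np.ndarray:
+    """Piecewise non-uniform sub-sampling of diffusion timesteps.
+
+    Allocates `late_fraction` of the budget to the noisier half of the chain
+    and the rest to the cleaner half; returns timesteps in descending order
+    for the reverse process.
+    """
+    n_late = int(round(num_steps * late_fraction))
+    n_early = num_steps - n_late
+    late = np.linspace(total_steps - 1, total_steps // 2, n_late, dtype=int)
+    early = np.linspace(total_steps // 2 - 1, 0, n_early, dtype=int)
+    return np.unique(np.concatenate([late, early]))[::-1]
+
+steps = piecewise_skip_schedule()
+print(len(steps), steps[:5], steps[-5:])
+```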
+
+
+
+
+ + ☆ Punctate White Matter Lesion Segmentation in Preterm Infants Powered by + Counterfactually Generative Learning MICCAI + + +
+ Accurate segmentation of punctate white matter lesions (PWMLs) are +fundamental for the timely diagnosis and treatment of related developmental +disorders. Automated PWMLs segmentation from infant brain MR images is +challenging, considering that the lesions are typically small and low-contrast, +and the number of lesions may dramatically change across subjects. Existing +learning-based methods directly apply general network architectures to this +challenging task, which may fail to capture detailed positional information of +PWMLs, potentially leading to severe under-segmentations. In this paper, we +propose to leverage the idea of counterfactual reasoning coupled with the +auxiliary task of brain tissue segmentation to learn fine-grained positional +and morphological representations of PWMLs for accurate localization and +segmentation. A simple and easy-to-implement deep-learning framework (i.e., +DeepPWML) is accordingly designed. It combines the lesion counterfactual map +with the tissue probability map to train a lightweight PWML segmentation +network, demonstrating state-of-the-art performance on a real-clinical dataset +of infant T1w MR images. The code is available at +\href{https://github.com/ladderlab-xjtu/DeepPWML}{https://github.com/ladderlab-xjtu/DeepPWML}. + +
+
+ comment: 10 pages, 3 figures, Medical Image Computing and Computer Assisted + Intervention(MICCAI) +
+
+
+
+
+ + ♻ ☆ Non-inferiority of Deep Learning Acute Ischemic Stroke Segmentation on + Non-Contrast CT Compared to Expert Neuroradiologists + + +
+ To determine if a convolutional neural network (CNN) deep learning model can +accurately segment acute ischemic changes on non-contrast CT compared to +neuroradiologists. Non-contrast CT (NCCT) examinations from 232 acute ischemic +stroke patients who were enrolled in the DEFUSE 3 trial were included in this +study. Three experienced neuroradiologists independently segmented hypodensity +that reflected the ischemic core on each scan. The neuroradiologist with the +most experience (expert A) served as the ground truth for deep learning model +training. Two additional neuroradiologists (experts B and C) segmentations were +used for data testing. The 232 studies were randomly split into training and +test sets. The training set was further randomly divided into 5 folds with +training and validation sets. A 3-dimensional CNN architecture was trained and +optimized to predict the segmentations of expert A from NCCT. The performance +of the model was assessed using a set of volume, overlap, and distance metrics +using non-inferiority thresholds of 20%, 3ml, and 3mm. The optimized model +trained on expert A was compared to test experts B and C. We used a one-sided +Wilcoxon signed-rank test to test for the non-inferiority of the model-expert +compared to the inter-expert agreement. The final model performance for the +ischemic core segmentation task reached a performance of 0.46+-0.09 Surface +Dice at Tolerance 5mm and 0.47+-0.13 Dice when trained on expert A. Compared to +the two test neuroradiologists the model-expert agreement was non-inferior to +the inter-expert agreement, p < 0.05. The CNN accurately delineates the +hypodense ischemic core on NCCT in acute ischemic stroke patients with an +accuracy comparable to neuroradiologists. + +
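+ A generic sketch of a one-sided Wilcoxon signed-rank non-inferiority check on
+paired per-case agreement scores (the margin and the toy data below are
+placeholders, not the study's exact statistical protocol):
+```python
+import numpy as np
+from scipy.stats import wilcoxon
+
+def non_inferiority_wilcoxon(model_expert: np.ndarray,
+                             inter_expert: np.ndarray,
+                             margin: float = 0.0) -> float:
+    """One-sided Wilcoxon signed-rank test on paired agreement scores.
+
+    Tests whether per-case model-vs-expert agreement (e.g. Dice) is not worse
+    than inter-expert agreement by more than `margin`; returns the p-value of
+    H1: model_expert - inter_expert + margin > 0.
+    """
+    diff = model_expert - inter_expert + margin
+    _, p_value = wilcoxon(diff, alternative="greater")
+    return p_value
+
+rng = np.random.default_rng(0)
+a = rng.uniform(0.3, 0.7, size=50)          # model-vs-expert Dice per case
+b = a - rng.normal(0.0, 0.02, size=50)      # inter-expert Dice per case
+print(non_inferiority_wilcoxon(a, b, margin=0.05))
+```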
+
+
+
+
+ + ♻ ☆ USE-Evaluator: Performance Metrics for Medical Image Segmentation Models + with Uncertain, Small or Empty Reference Annotations + + +
+ Performance metrics for medical image segmentation models are used to measure
+the agreement between the reference annotation and the predicted segmentation.
+Usually, overlap metrics such as the Dice score are used to evaluate the
+performance of these models so that results are comparable. However, there is a
+mismatch between the distribution of cases and the difficulty level of
+segmentation tasks in public data sets compared to clinical practice. Common
+metrics fail to measure the impact of this mismatch, especially for clinical
+data sets that include low-signal pathologies, a difficult segmentation task,
+and uncertain, small, or empty reference annotations. This limitation may lead
+machine learning practitioners to design and optimize models ineffectively.
+Dimensions of evaluating clinical value include consideration of the
+uncertainty of reference annotations, independence from reference annotation
+volume size, and evaluation of the classification of empty reference
+annotations. We study how uncertain, small, and empty reference annotations
+influence the value of metrics for medical image segmentation on an in-house
+data set, regardless of the model. We examine the behavior of metrics on the
+predictions of a standard deep learning framework in order to identify metrics
+with clinical value. We compare against a public benchmark data set (BraTS
+2019) with a high-signal pathology and certain, larger, and no empty reference
+annotations. We show machine learning practitioners how uncertain, small, or
+empty reference annotations require a rethinking of evaluation and optimization
+procedures. The evaluation code was released to encourage further analysis of
+this topic.
+https://github.com/SophieOstmeier/UncertainSmallEmpty.git
+
+
+
+ comment: 16 pages, 10 figures, Published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Domain Generalization for Mammographic Image Analysis with Contrastive + Learning + + +
+ Deep learning techniques have been shown to effectively address several image
+analysis tasks in computer-aided diagnosis schemes for mammography. Training an
+efficacious deep learning model requires large data with diverse styles and
+qualities. The diversity of data often comes from the use of various scanners
+from different vendors. In practice, however, it is impractical to collect a
+sufficient amount of diverse data for training. To this end, a novel
+contrastive learning scheme is developed to equip deep learning models with
+better style generalization capability. Specifically, a multi-style and
+multi-view unsupervised self-learning scheme is carried out to seek robust
+feature embeddings against style diversity as a pretrained model. Afterward,
+the pretrained network is further fine-tuned for the downstream tasks, e.g.,
+mass detection, matching, BI-RADS rating, and breast density classification.
+The proposed method has been evaluated extensively and rigorously with
+mammograms from various vendor style domains and several public datasets. The
+experimental results suggest that the proposed domain generalization method can
+effectively improve the performance of four mammographic image tasks on data
+from both seen and unseen domains, and outperforms many state-of-the-art (SOTA)
+generalization methods.
+
+
+
+ comment: arXiv admin note: text overlap with arXiv:2111.10827 +
+
+
+
+
+ + ♻ ☆ PGFed: Personalize Each Client's Global Objective for Federated Learning ICCV 2023 + + +
+ Personalized federated learning has received an upsurge of attention due to +the mediocre performance of conventional federated learning (FL) over +heterogeneous data. Unlike conventional FL which trains a single global +consensus model, personalized FL allows different models for different clients. +However, existing personalized FL algorithms only implicitly transfer the +collaborative knowledge across the federation by embedding the knowledge into +the aggregated model or regularization. We observed that this implicit +knowledge transfer fails to maximize the potential of each client's empirical +risk toward other clients. Based on our observation, in this work, we propose +Personalized Global Federated Learning (PGFed), a novel personalized FL +framework that enables each client to personalize its own global objective by +explicitly and adaptively aggregating the empirical risks of itself and other +clients. To avoid massive (O(N^2)) communication overhead and potential privacy +leakage while achieving this, each client's risk is estimated through a +first-order approximation for other clients' adaptive risk aggregation. On top +of PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently +utilize clients' empirical risks. Our extensive experiments on four datasets +under different federated settings show consistent improvements of PGFed over +previous state-of-the-art methods. The code is publicly available at +https://github.com/ljaiverson/pgfed. + +
+
+ comment: ICCV 2023 oral +
+
+
+
+
+ + ♻ ☆ Stain-invariant self supervised learning for histopathology image + analysis + + +
+ We present a self-supervised algorithm for several classification tasks
+within hematoxylin and eosin (H&E) stained images of breast cancer. Our method
+is robust to stain variations inherent to the histology image acquisition
+process, which has limited the applicability of automated analysis tools. We
+address this problem by imposing constraints on a learnt latent space which
+leverages stain normalization techniques during training. At every iteration,
+we select an image as a normalization target and generate a version of every
+image in the batch normalized to that target. We minimize the distance between
+the embeddings that correspond to the same image under different staining
+variations while maximizing the distance to other samples. We show that our
+method not only improves robustness to stain variations across multi-center
+data, but also classification performance through extensive experiments on
+various normalization targets and methods. Our method achieves state-of-the-art
+performance on several publicly available breast cancer datasets ranging from
+tumor classification (CAMELYON17) and subtyping (BRACS) to HER2 status
+classification and treatment response prediction.
+
+
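+ A generic contrastive sketch of the stain-invariance objective described
+above: embeddings of the same image under two staining variants are pulled
+together while other samples in the batch act as negatives (an InfoNCE-style
+stand-in with an assumed temperature, not the paper's exact loss):
+```python
+import torch
+import torch.nn.functional as F
+
+def stain_invariance_loss(z_orig: torch.Tensor, z_norm: torch.Tensor,
+                          temperature: float = 0.1) -> torch.Tensor:
+    """InfoNCE-style objective pairing each image with its stain-normalized view.
+
+    z_orig, z_norm: (B, D) embeddings of the same batch under two staining
+    variants; matching rows are positives, all other rows are negatives.
+    """
+    z1 = F.normalize(z_orig, dim=1)
+    z2 = F.normalize(z_norm, dim=1)
+    logits = z1 @ z2.t() / temperature                 # (B, B) cosine similarities
+    targets = torch.arange(z1.shape[0], device=z1.device)
+    return F.cross_entropy(logits, targets)
+
+loss = stain_invariance_loss(torch.randn(8, 128), torch.randn(8, 128))
+print(float(loss))
+```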
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video +data. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard, +despite the availability of advanced (neural) compression approaches. +Transmitting videos in the face of dynamic network bandwidth conditions +requires video codecs to adapt to vastly different compression strengths. Rate +control modules augment the codec's compression such that bandwidth constraints +are satisfied and video distortion is minimized. While, both standard video +codes and their rate control modules are developed to minimize video distortion +w.r.t. human quality assessment, preserving the downstream performance of deep +vision models is not considered. In this paper, we present the first end-to-end +learnable deep video codec control considering both bandwidth constraints and +downstream vision performance, while not breaking existing standardization. We +demonstrate for two common vision tasks (semantic segmentation and optical flow +estimation) and on two different datasets that our deep codec control better +preserves downstream performance than using 2-pass average bit rate control +while meeting dynamic bandwidth constraints and adhering to standardizations. + +
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Automotive Object Detection via Learning Sparse Events by Spiking + Neurons + + +
+ Event-based sensors, distinguished by their high temporal resolution of 1 +{\mu}s and a dynamic range of 120 dB, stand out as ideal tools for deployment +in fast-paced settings like vehicles and drones. Traditional object detection +techniques that utilize Artificial Neural Networks (ANNs) face challenges due +to the sparse and asynchronous nature of the events these sensors capture. In +contrast, Spiking Neural Networks (SNNs) offer a promising alternative, +providing a temporal representation that is inherently aligned with event-based +data. This paper explores the unique membrane potential dynamics of SNNs and +their ability to modulate sparse events. We introduce an innovative +spike-triggered adaptive threshold mechanism designed for stable training. +Building on these insights, we present a specialized spiking feature pyramid +network (SpikeFPN) optimized for automotive event based object detection. +Comprehensive evaluations demonstrate that SpikeFPN surpasses both traditional +SNNs and advanced ANNs enhanced with attention mechanisms. Evidently, SpikeFPN +achieves a mean Average Precision (mAP) of 0.477 on the GEN1 Automotive +Detection (GAD) benchmark dataset, marking a significant increase of 9.7% over +the previous best SNN. Moreover, the efficient design of SpikeFPN ensures +robust performance while optimizing computational resources, attributed to its +innate sparse computation capabilities. + +
+
+
+
+
+ + ♻ ☆ Adaptive Similarity Bootstrapping for Self-Distillation based + Representation Learning ICCV 2023 + + +
+ Most self-supervised methods for representation learning leverage a +cross-view consistency objective i.e., they maximize the representation +similarity of a given image's augmented views. Recent work NNCLR goes beyond +the cross-view paradigm and uses positive pairs from different images obtained +via nearest neighbor bootstrapping in a contrastive setting. We empirically +show that as opposed to the contrastive learning setting which relies on +negative samples, incorporating nearest neighbor bootstrapping in a +self-distillation scheme can lead to a performance drop or even collapse. We +scrutinize the reason for this unexpected behavior and provide a solution. We +propose to adaptively bootstrap neighbors based on the estimated quality of the +latent space. We report consistent improvements compared to the naive +bootstrapping approach and the original baselines. Our approach leads to +performance improvements for various self-distillation method/backbone +combinations and standard downstream tasks. Our code is publicly available at +https://github.com/tileb1/AdaSim. + +
+
+ comment: ICCV 2023. * denotes equal contribution +
+
+
+
+
+ + ♻ ☆ Generative-based Fusion Mechanism for Multi-Modal Tracking + + +
+ Generative models (GMs) have received increasing research interest for their +remarkable capacity to achieve comprehensive understanding. However, their +potential application in the domain of multi-modal tracking has remained +relatively unexplored. In this context, we seek to uncover the potential of +harnessing generative techniques to address the critical challenge, information +fusion, in multi-modal tracking. In this paper, we delve into two prominent GM +techniques, namely, Conditional Generative Adversarial Networks (CGANs) and +Diffusion Models (DMs). Different from the standard fusion process where the +features from each modality are directly fed into the fusion block, we +condition these multi-modal features with random noise in the GM framework, +effectively transforming the original training samples into harder instances. +This design excels at extracting discriminative clues from the features, +enhancing the ultimate tracking performance. To quantitatively gauge the +effectiveness of our approach, we conduct extensive experiments across two +multi-modal tracking tasks, three baseline methods, and three challenging +benchmarks. The experimental results demonstrate that the proposed +generative-based fusion mechanism achieves state-of-the-art performance, +setting new records on LasHeR and RGBD1K. + +
+
+ comment: 10 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Revisiting Hidden Representations in Transfer Learning for Medical + Imaging + + +
+ While a key component to the success of deep learning is the availability of +massive amounts of training data, medical image datasets are often limited in +diversity and size. Transfer learning has the potential to bridge the gap +between related yet different domains. For medical applications, however, it +remains unclear whether it is more beneficial to pre-train on natural or +medical images. We aim to shed light on this problem by comparing +initialization on ImageNet and RadImageNet on seven medical classification +tasks. Our work includes a replication study, which yields results contrary to +previously published findings. In our experiments, ResNet50 models pre-trained +on ImageNet tend to outperform those trained on RadImageNet. To gain further +insights, we investigate the learned representations using Canonical +Correlation Analysis (CCA) and compare the predictions of the different models. +Our results indicate that, contrary to intuition, ImageNet and RadImageNet may +converge to distinct intermediate representations, which appear to diverge +further during fine-tuning. Despite these distinct representations, the +predictions of the models remain similar. Our findings show that the similarity +between networks before and after fine-tuning does not correlate with +performance gains, suggesting that the advantages of transfer learning might +not solely originate from the reuse of features in the early layers of a +convolutional neural network. + +
+
+ comment: Submitted to TMLR +
+
+
+
+
+ + ♻ ☆ DreamEditor: Text-Driven 3D Scene Editing with Neural Fields SIGGRAPH + + +
+ Neural fields have achieved impressive advancements in view synthesis and +scene reconstruction. However, editing these neural fields remains challenging +due to the implicit encoding of geometry and texture information. In this +paper, we propose DreamEditor, a novel framework that enables users to perform +controlled editing of neural fields using text prompts. By representing scenes +as mesh-based neural fields, DreamEditor allows localized editing within +specific regions. DreamEditor utilizes the text encoder of a pretrained +text-to-Image diffusion model to automatically identify the regions to be +edited based on the semantics of the text prompts. Subsequently, DreamEditor +optimizes the editing region and aligns its geometry and texture with the text +prompts through score distillation sampling [29]. Extensive experiments have +demonstrated that DreamEditor can accurately edit neural fields of real-world +scenes according to the given text prompts while ensuring consistency in +irrelevant areas. DreamEditor generates highly realistic textures and geometry, +significantly surpassing previous works in both quantitative and qualitative +evaluations. + +
+
+ comment: Accepted by SIGGRAPH Asia 2023 +
+
+
+
+
+ + ♻ ☆ LDMRes-Net: Enabling Efficient Medical Image Segmentation on IoT and + Edge Platforms + + +
+ In this study, we propose LDMRes-Net, a lightweight dual-multiscale residual +block-based computational neural network tailored for medical image +segmentation on IoT and edge platforms. Conventional U-Net-based models face +challenges in meeting the speed and efficiency demands of real-time clinical +applications, such as disease monitoring, radiation therapy, and image-guided +surgery. LDMRes-Net overcomes these limitations with its remarkably low number +of learnable parameters (0.072M), making it highly suitable for +resource-constrained devices. The model's key innovation lies in its dual +multi-residual block architecture, which enables the extraction of refined +features on multiple scales, enhancing overall segmentation performance. To +further optimize efficiency, the number of filters is carefully selected to +prevent overlap, reduce training time, and improve computational efficiency. +The study includes comprehensive evaluations, focusing on segmentation of the +retinal image of vessels and hard exudates crucial for the diagnosis and +treatment of ophthalmology. The results demonstrate the robustness, +generalizability, and high segmentation accuracy of LDMRes-Net, positioning it +as an efficient tool for accurate and rapid medical image segmentation in +diverse clinical applications, particularly on IoT and edge platforms. Such +advances hold significant promise for improving healthcare outcomes and +enabling real-time medical image analysis in resource-limited settings. + +
+
+
+
+
+ + ♻ ☆ Improving Visual Quality and Transferability of Adversarial Attacks on + Face Recognition Simultaneously with Adversarial Restoration + + +
+ Adversarial face examples possess two critical properties: Visual Quality and +Transferability. However, existing approaches rarely address these properties +simultaneously, leading to subpar results. To address this issue, we propose a +novel adversarial attack technique known as Adversarial Restoration +(AdvRestore), which enhances both visual quality and transferability of +adversarial face examples by leveraging a face restoration prior. In our +approach, we initially train a Restoration Latent Diffusion Model (RLDM) +designed for face restoration. Subsequently, we employ the inference process of +RLDM to generate adversarial face examples. The adversarial perturbations are +applied to the intermediate features of RLDM. Additionally, by treating RLDM +face restoration as a sibling task, the transferability of the generated +adversarial face examples is further improved. Our experimental results +validate the effectiveness of the proposed attack method. + +
+
+
+
+
+ + ♻ ☆ A Majority Invariant Approach to Patch Robustness Certification for Deep + Learning Models + + +
+ Patch robustness certification ensures no patch within a given bound on a +sample can manipulate a deep learning model to predict a different label. +However, existing techniques cannot certify samples that cannot meet their +strict bars at the classifier or patch region levels. This paper proposes +MajorCert. MajorCert firstly finds all possible label sets manipulatable by the +same patch region on the same sample across the underlying classifiers, then +enumerates their combinations element-wise, and finally checks whether the +majority invariant of all these combinations is intact to certify samples. + +
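+ A simplified reading of the majority-invariant check described above: for
+each classifier, take the set of labels any patch in the region could induce,
+enumerate one label per classifier, and certify only if the true label wins a
+strict majority in every combination (illustrative only, ties treated as
+failures):
+```python
+from collections import Counter
+from itertools import product
+
+def is_certified(label_sets, true_label):
+    """Return True iff every element-wise combination keeps a strict majority."""
+    for combo in product(*label_sets):
+        counts = Counter(combo)
+        top_label, top_count = counts.most_common(1)[0]
+        runner_up = counts.most_common(2)[1][1] if len(counts) > 1 else 0
+        if top_label != true_label or top_count == runner_up:
+            return False
+    return True
+
+# three classifiers; in the second call two of them can be flipped to label 2
+print(is_certified([{0}, {0, 2}, {0}], true_label=0))      # True
+print(is_certified([{0, 2}, {0, 2}, {0}], true_label=0))   # False
+```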
+
+ comment: 5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track +
+
+
+
+
+ + ♻ ☆ Revisiting Token Pruning for Object Detection and Instance Segmentation + + +
+ Vision Transformers (ViTs) have shown impressive performance in computer +vision, but their high computational cost, quadratic in the number of tokens, +limits their adoption in computation-constrained applications. However, this +large number of tokens may not be necessary, as not all tokens are equally +important. In this paper, we investigate token pruning to accelerate inference +for object detection and instance segmentation, extending prior works from +image classification. Through extensive experiments, we offer four insights for +dense tasks: (i) tokens should not be completely pruned and discarded, but +rather preserved in the feature maps for later use. (ii) reactivating +previously pruned tokens can further enhance model performance. (iii) a dynamic +pruning rate based on images is better than a fixed pruning rate. (iv) a +lightweight, 2-layer MLP can effectively prune tokens, achieving accuracy +comparable with complex gating networks with a simpler design. We evaluate the +impact of these design choices on COCO dataset and present a method integrating +these insights that outperforms prior art token pruning models, significantly +reducing performance drop from ~1.5 mAP to ~0.3 mAP for both boxes and masks. +Compared to the dense counterpart that uses all tokens, our method achieves up +to 34% faster inference speed for the whole network and 46% for the backbone. + +
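+ Insight (iv) above can be pictured with a small sketch: a 2-layer MLP scores
+tokens and the top fraction is kept, while the kept indices remain available so
+pruned tokens could be preserved in the feature map or reactivated later
+(hypothetical dimensions and keep ratio, not the paper's exact module):
+```python
+import torch
+import torch.nn as nn
+
+class TokenPruner(nn.Module):
+    """Lightweight 2-layer MLP that scores tokens and keeps the top-k fraction."""
+    def __init__(self, dim: int, hidden: int = 64):
+        super().__init__()
+        self.score = nn.Sequential(nn.Linear(dim, hidden), nn.GELU(),
+                                   nn.Linear(hidden, 1))
+
+    def forward(self, tokens: torch.Tensor, keep_ratio: float = 0.7):
+        # tokens: (B, N, D)
+        scores = self.score(tokens).squeeze(-1)                 # (B, N)
+        k = max(1, int(tokens.shape[1] * keep_ratio))
+        keep_idx = scores.topk(k, dim=1).indices                # (B, k)
+        batch = torch.arange(tokens.shape[0]).unsqueeze(1)      # (B, 1)
+        kept = tokens[batch, keep_idx]                          # (B, k, D)
+        return kept, keep_idx, scores
+
+pruner = TokenPruner(dim=256)
+kept, idx, s = pruner(torch.randn(2, 196, 256), keep_ratio=0.5)
+print(kept.shape)                                              # torch.Size([2, 98, 256])
+```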
+
+
+
+
+ + ♻ ☆ Cross-Consistent Deep Unfolding Network for Adaptive All-In-One Video + Restoration + + +
+ Existing Video Restoration (VR) methods typically require a separate model to
+be deployed for each type of adverse weather in order to remove the
+corresponding degradations, lacking the capability to process degradations
+adaptively. This limitation amplifies the complexity and deployment costs in
+practical applications. To overcome this deficiency, in this paper, we propose
+a Cross-consistent Deep Unfolding Network (CDUN) for All-In-One VR, which for
+the first time enables a single model to remove diverse degradations.
+Specifically, the proposed CDUN implements a novel iterative optimization
+framework, capable of restoring frames corrupted by the corresponding
+degradations according to degradation features given in advance. To empower
+the framework to eliminate diverse degradations, we devise a Sequence-wise
+Adaptive Degradation Estimator (SADE) to estimate degradation features for the
+input corrupted video. By orchestrating these two cascading procedures, CDUN
+achieves adaptive processing of diverse degradations. In addition, we introduce
+a window-based inter-frame fusion strategy to utilize information from more
+adjacent frames. This strategy involves the progressive stacking of temporal
+windows in multiple iterations, effectively enlarging the temporal receptive
+field and enabling each frame's restoration to leverage information from
+distant frames. Extensive experiments demonstrate that the proposed method
+achieves state-of-the-art performance in All-In-One VR.
+
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ FCNet: A Convolutional Neural Network for Arbitrary-Length Exposure + Estimation + + +
+ The photographs captured by digital cameras usually suffer from over- or
+under-exposure problems. For image exposure enhancement, the tasks of
+Single-Exposure Correction (SEC) and Multi-Exposure Fusion (MEF) are widely
+studied in the image processing community. However, current SEC or MEF methods
+are developed under different motivations and thus ignore the internal
+correlation between SEC and MEF, making it difficult to process
+arbitrary-length sequences with improper exposures. Besides, the MEF methods
+usually fail at estimating the exposure of a sequence containing only
+under-exposed or over-exposed images. To alleviate these problems, in this
+paper, we develop a novel Fusion-Correction Network (FCNet) to tackle an
+arbitrary-length (including one) image sequence with improper exposures. This
+is achieved by fusing and correcting an image sequence via Laplacian Pyramid
+(LP) image decomposition. In each LP level, the low-frequency base component of
+the input image sequence is fed into a Fusion block and a Correction block
+sequentially for consecutive exposure estimation, implemented by alternating
+exposure fusion and correction. The exposure-corrected image in the current LP
+level is upsampled and fused with the high-frequency detail components of the
+input image sequence in the next LP level, to output the base component for the
+Fusion and Correction blocks in the next LP level. Experiments on the benchmark
+dataset demonstrate that our FCNet is effective on arbitrary-length exposure
+estimation, including both SEC and MEF. The code is publicly released at
+https://github.com/NKUJinLiang/FCNet.
+
+
+
+
+
+ + ♻ ☆ ISLE: A Framework for Image Level Semantic Segmentation Ensemble + + +
+ One key bottleneck of employing state-of-the-art semantic segmentation
+networks in the real world is the availability of training labels. Conventional
+semantic segmentation networks require massive pixel-wise annotated labels to
+reach state-of-the-art prediction quality. Hence, several works focus on
+semantic segmentation networks trained with only image-level annotations.
+However, when scrutinizing the results of the state of the art in more detail,
+we notice that although they are remarkably close to each other in average
+prediction quality, different approaches perform better in different classes
+while providing low quality in others. To address this problem, we propose a
+novel framework, ISLE, which employs an ensemble of the "pseudo-labels" for a
+given set of different semantic segmentation techniques on a class-wise level.
+Pseudo-labels are the pixel-wise predictions of the image-level semantic
+segmentation frameworks used to train the final segmentation model. Our
+pseudo-labels seamlessly combine the strong points of multiple segmentation
+approaches to reach superior prediction quality. We reach up to 2.4%
+improvement over ISLE's individual components. An exhaustive analysis was
+performed to demonstrate ISLE's effectiveness over state-of-the-art frameworks
+for image-level semantic segmentation.
+
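+ A minimal sketch of a class-wise pseudo-label ensemble in the spirit described
+above, assuming per-method pseudo-label maps and per-method, per-class quality
+scores (e.g., validation IoU); the exact merging rule used by ISLE is not
+spelled out in the abstract:
+
+import numpy as np
+
+def classwise_ensemble(pseudo_labels, class_scores):
+    """pseudo_labels: dict method -> (H, W) integer label map.
+    class_scores:  dict method -> dict class_id -> quality score (e.g., val IoU).
+    Each pixel takes the prediction of the method that is strongest for the
+    class it predicts there."""
+    methods = list(pseudo_labels)
+    maps = np.stack([pseudo_labels[m] for m in methods])          # (M, H, W)
+    scores = np.stack([np.vectorize(class_scores[m].get)(pseudo_labels[m], 0.0)
+                       for m in methods])                         # (M, H, W)
+    best = scores.argmax(axis=0)                                  # (H, W)
+    return np.take_along_axis(maps, best[None], axis=0)[0]
+
+# Hypothetical toy example with two methods and classes {0: background, 1, 2}.
+a = np.array([[1, 1], [0, 2]])
+b = np.array([[2, 1], [0, 2]])
+merged = classwise_ensemble({"A": a, "B": b},
+                            {"A": {0: 0.9, 1: 0.7, 2: 0.3},
+                             "B": {0: 0.9, 1: 0.5, 2: 0.8}})
+print(merged)  # pixel (0, 0): method B's class 2 (0.8) beats method A's class 1 (0.7)
+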
+
+ comment: Accepted for Publication at the International Symposium on Visual + Computing (ISVC), October 2023, Lake Tahoe, NV, USA +
+
+
+
+
+ + ♻ ☆ ReFit: A Framework for Refinement of Weakly Supervised Semantic + Segmentation using Object Border Fitting for Medical Images + + +
+ Weakly Supervised Semantic Segmentation (WSSS) relying only on image-level
+supervision is a promising approach to deal with the annotation needs of
+segmentation networks, especially for generating a large number of pixel-wise
+masks in a given dataset. However, most state-of-the-art image-level WSSS
+techniques lack an understanding of the geometric features embedded in the
+images, since the network cannot derive any object boundary information from
+image-level labels alone. We define a boundary here as the line separating an
+object and its background, or two different objects. To address this drawback,
+we propose ReFit, a novel framework that deploys state-of-the-art class
+activation maps combined with various post-processing techniques in order to
+achieve fine-grained, higher-accuracy segmentation masks. To achieve this, we
+investigate a state-of-the-art unsupervised segmentation network that can be
+used to construct a boundary map, which enables ReFit to predict object
+locations with sharper boundaries. By applying our method to WSSS predictions,
+we achieved up to 10% improvement over the current state-of-the-art WSSS
+methods for medical imaging. The framework is open-source, to ensure that our
+results are reproducible, and accessible online at
+https://github.com/bharathprabakaran/ReFit.
+
+
+ comment: Accepted for Publication at the International Symposium on Visual + Computing (ISVC), October 2023, Lake Tahoe, NV, USA +
+
+
+
+
+ + ♻ ☆ Blended-NeRF: Zero-Shot Object Generation and Blending in Existing + Neural Radiance Fields + + +
+ Editing a local region or a specific object in a 3D scene represented by a +NeRF or consistently blending a new realistic object into the scene is +challenging, mainly due to the implicit nature of the scene representation. We +present Blended-NeRF, a robust and flexible framework for editing a specific +region of interest in an existing NeRF scene, based on text prompts, along with +a 3D ROI box. Our method leverages a pretrained language-image model to steer +the synthesis towards a user-provided text prompt, along with a 3D MLP model +initialized on an existing NeRF scene to generate the object and blend it into +a specified region in the original scene. We allow local editing by localizing +a 3D ROI box in the input scene, and blend the content synthesized inside the +ROI with the existing scene using a novel volumetric blending technique. To +obtain natural looking and view-consistent results, we leverage existing and +new geometric priors and 3D augmentations for improving the visual fidelity of +the final result. We test our framework both qualitatively and quantitatively +on a variety of real 3D scenes and text prompts, demonstrating realistic +multi-view consistent results with much flexibility and diversity compared to +the baselines. Finally, we show the applicability of our framework for several +3D editing applications, including adding new objects to a scene, +removing/replacing/altering existing objects, and texture conversion. + +
+
+ comment: 16 pages, 14 figures. Project page: + https://www.vision.huji.ac.il/blended-nerf/ +
+
+
+
+
+ + ♻ ☆ Rad-ReStruct: A Novel VQA Benchmark and Method for Structured Radiology + Reporting MICCAI 2023 + + +
+ Radiology reporting is a crucial part of the communication between
+radiologists and other medical professionals, but it can be time-consuming and
+error-prone. One approach to alleviate this is structured reporting, which
+saves time and enables a more accurate evaluation than free-text reports.
+However, there is limited research on automating structured reporting, and no
+public benchmark is available for evaluating and comparing different methods.
+To close this gap, we introduce Rad-ReStruct, a new benchmark dataset that
+provides fine-grained, hierarchically ordered annotations in the form of
+structured reports for X-Ray images. We model the structured reporting task as
+hierarchical visual question answering (VQA) and propose hi-VQA, a novel method
+that considers prior context in the form of previously asked questions and
+answers for populating a structured radiology report. Our experiments show that
+hi-VQA achieves competitive performance to the state-of-the-art on the medical
+VQA benchmark VQARad, while performing best among methods without
+domain-specific vision-language pretraining, and provides a strong baseline on
+Rad-ReStruct. Our work represents a significant step towards the automated
+population of structured radiology reports and provides a valuable first
+benchmark for future research in this area. Our dataset and code are available
+at https://github.com/ChantalMP/Rad-ReStruct.
+
+
+ comment: accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ MSViT: Dynamic Mixed-Scale Tokenization for Vision Transformers ICCV + + +
+ The input tokens to Vision Transformers carry little semantic meaning as they +are defined as regular equal-sized patches of the input image, regardless of +its content. However, processing uniform background areas of an image should +not necessitate as much compute as dense, cluttered areas. To address this +issue, we propose a dynamic mixed-scale tokenization scheme for ViT, MSViT. Our +method introduces a conditional gating mechanism that selects the optimal token +scale for every image region, such that the number of tokens is dynamically +determined per input. In addition, to enhance the conditional behavior of the +gate during training, we introduce a novel generalization of the batch-shaping +loss. We show that our gating module is able to learn meaningful semantics +despite operating locally at the coarse patch-level. The proposed gating module +is lightweight, agnostic to the choice of transformer backbone, and trained +within a few epochs with little training overhead. Furthermore, in contrast to +token pruning, MSViT does not lose information about the input, thus can be +readily applied for dense tasks. We validate MSViT on the tasks of +classification and segmentation where it leads to improved accuracy-complexity +trade-off. + +
+
+ comment: ICCV Workshops 2023; Code for the Generalized Batch-Shaping loss is + available at https://github.com/Qualcomm-AI-research/batchshaping +
+
+
+
+
+ + ♻ ☆ Enhancement of Novel View Synthesis Using Omnidirectional Image + Completion + + +
+ In this study, we present a method for synthesizing novel views from a single
+360-degree RGB-D image based on the neural radiance field (NeRF). Prior
+studies relied on the neighborhood interpolation capability of multi-layer
+perceptrons to complete missing regions caused by occlusion and zooming, which
+leads to artifacts. In the method proposed in this study, the input image is
+reprojected to 360-degree RGB images at other camera positions, the missing
+regions of the reprojected images are completed by a 2D image generative model,
+and the completed images are utilized to train the NeRF. Because multiple
+completed images contain inconsistencies in 3D, we introduce a method to learn
+the NeRF model using a subset of completed images that cover the target scene
+with less overlap of completed regions. The selection of such a subset of
+images can be formulated as a maximum weight independent set problem, which is
+solved through simulated annealing. Experiments demonstrated that the
+proposed method can synthesize plausible novel views while preserving the
+features of the scene for both artificial and real-world data.
+
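+ A minimal sketch of the subset-selection step: treat each completed image as a
+weighted node (e.g., weight = how much of the target scene it covers), connect
+two nodes whenever their completed regions overlap too much, and approximately
+solve the maximum weight independent set by simulated annealing. The weights,
+overlap test, move set, and cooling schedule below are illustrative, not the
+paper's exact choices:
+
+import math
+import random
+
+def max_weight_independent_set(weights, edges, steps=20000, t0=1.0, t1=1e-3, seed=0):
+    """weights: list of node weights; edges: set of frozenset({u, v}) conflicts."""
+    rng = random.Random(seed)
+    n = len(weights)
+    adj = [set() for _ in range(n)]
+    for e in edges:
+        u, v = tuple(e)
+        adj[u].add(v)
+        adj[v].add(u)
+
+    selected = [False] * n
+    value = 0.0
+    best, best_value = selected[:], value
+    for step in range(steps):
+        t = t0 * (t1 / t0) ** (step / steps)       # geometric cooling
+        i = rng.randrange(n)
+        if selected[i]:
+            delta = -weights[i]                    # propose dropping node i
+        elif any(selected[j] for j in adj[i]):
+            continue                               # adding i would break independence
+        else:
+            delta = weights[i]                     # propose adding node i
+        if delta >= 0 or rng.random() < math.exp(delta / t):
+            selected[i] = not selected[i]
+            value += delta
+            if value > best_value:
+                best, best_value = selected[:], value
+    return [i for i in range(n) if best[i]], best_value
+
+# Hypothetical 4-image example: images 0-1 and 2-3 overlap heavily.
+print(max_weight_independent_set([3.0, 2.0, 2.5, 1.0],
+                                 {frozenset({0, 1}), frozenset({2, 3})}))
+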
+
+ comment: 20 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ DiFaReli: Diffusion Face Relighting ICCV 2023 + + +
+ We present a novel approach to single-view face relighting in the wild. +Handling non-diffuse effects, such as global illumination or cast shadows, has +long been a challenge in face relighting. Prior work often assumes Lambertian +surfaces, simplified lighting models or involves estimating 3D shape, albedo, +or a shadow map. This estimation, however, is error-prone and requires many +training examples with lighting ground truth to generalize well. Our work +bypasses the need for accurate estimation of intrinsic components and can be +trained solely on 2D images without any light stage data, multi-view images, or +lighting ground truth. Our key idea is to leverage a conditional diffusion +implicit model (DDIM) for decoding a disentangled light encoding along with +other encodings related to 3D shape and facial identity inferred from +off-the-shelf estimators. We also propose a novel conditioning technique that +eases the modeling of the complex interaction between light and geometry by +using a rendered shading reference to spatially modulate the DDIM. We achieve +state-of-the-art performance on standard benchmark Multi-PIE and can +photorealistically relight in-the-wild images. Please visit our page: +https://diffusion-face-relighting.github.io + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Exact Diffusion Inversion via Bi-directional Integration Approximation + + +
+ Recently, various methods have been proposed to address the inconsistency
+issue of DDIM inversion to enable image editing, such as EDICT [36] and
+Null-text inversion [22]. However, the above methods introduce considerable
+computational overhead. In this paper, we propose a new technique, named
+\emph{bi-directional integration approximation} (BDIA), to perform exact
+diffusion inversion with negligible computational overhead. Suppose we would
+like to estimate the next diffusion state $\boldsymbol{z}_{i-1}$ at timestep
+$t_i$ with the historical information $(i,\boldsymbol{z}_i)$ and
+$(i+1,\boldsymbol{z}_{i+1})$. We first obtain the estimated Gaussian noise
+$\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i)$, and then apply the DDIM
+update procedure twice for approximating the ODE integration over the next
+time-slot $[t_i, t_{i-1}]$ in the forward manner and the previous time-slot
+$[t_i, t_{i+1}]$ in the backward manner. The DDIM step for the previous
+time-slot is used to refine the integration approximation made earlier when
+computing $\boldsymbol{z}_i$. A nice property of BDIA-DDIM is that the update
+expression for $\boldsymbol{z}_{i-1}$ is a linear combination of
+$(\boldsymbol{z}_{i+1}, \boldsymbol{z}_i,
+\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i))$. This allows for exact
+backward computation of $\boldsymbol{z}_{i+1}$ given $(\boldsymbol{z}_i,
+\boldsymbol{z}_{i-1})$, thus leading to exact diffusion inversion. It is
+demonstrated with experiments that (round-trip) BDIA-DDIM is particularly
+effective for image editing. Our experiments further show that BDIA-DDIM
+produces markedly better image sampling qualities than DDIM for text-to-image
+generation.
+ BDIA can also be applied to improve the performance of other ODE solvers in
+addition to DDIM. In our work, it is found that applying BDIA to the EDM
+sampling procedure produces new SOTA performance over CIFAR10.
+
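+ The exact invertibility claimed above follows directly from the stated
+linear-combination property. Writing the BDIA-DDIM update in the generic form
+$\boldsymbol{z}_{i-1} = a_i\,\boldsymbol{z}_{i+1} + b_i\,\boldsymbol{z}_i +
+c_i\,\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i)$, where the coefficients
+$(a_i, b_i, c_i)$ are fixed by the DDIM noise schedule (their exact values are
+not given in the abstract), the earlier state can be recovered exactly as
+$\boldsymbol{z}_{i+1} = \big(\boldsymbol{z}_{i-1} - b_i\,\boldsymbol{z}_i -
+c_i\,\hat{\boldsymbol{\epsilon}}(\boldsymbol{z}_i,i)\big)/a_i$, which is what
+enables exact (up to floating-point error) diffusion inversion.
+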
+
+ comment: arXiv admin note: text overlap with arXiv:2304.11328 +
+
+
+
+
+ + ♻ ☆ Self-Reference Deep Adaptive Curve Estimation for Low-Light Image + Enhancement + + +
+ In this paper, we propose a 2-stage low-light image enhancement method called +Self-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage, +we present an intuitive, lightweight, fast, and unsupervised luminance +enhancement algorithm. The algorithm is based on a novel low-light enhancement +curve that can be used to locally boost image brightness. We also propose a new +loss function with a simplified physical model designed to preserve natural +images' color, structure, and fidelity. We use a vanilla CNN to map each pixel +through deep Adaptive Adjustment Curves (AAC) while preserving the local image +structure. Secondly, we introduce the corresponding denoising scheme to remove +the latent noise in the darkness. We approximately model the noise in the dark +and deploy a Denoising-Net to estimate and remove the noise after the first +stage. Exhaustive qualitative and quantitative analysis shows that our method +outperforms existing state-of-the-art algorithms on multiple real-world +datasets. + +
+
+
+
+
+ + ♻ ☆ Light Field Depth Estimation via Stitched Epipolar Plane Images + + +
+ Depth estimation is a fundamental problem in light field processing. +Epipolar-plane image (EPI)-based methods often encounter challenges such as low +accuracy in slope computation due to discretization errors and limited angular +resolution. Besides, existing methods perform well in most regions but struggle +to produce sharp edges in occluded regions and resolve ambiguities in +texture-less regions. To address these issues, we propose the concept of +stitched-EPI (SEPI) to enhance slope computation. SEPI achieves this by +shifting and concatenating lines from different EPIs that correspond to the +same 3D point. Moreover, we introduce the half-SEPI algorithm, which focuses +exclusively on the non-occluded portion of lines to handle occlusion. +Additionally, we present a depth propagation strategy aimed at improving depth +estimation in texture-less regions. This strategy involves determining the +depth of such regions by progressing from the edges towards the interior, +prioritizing accurate regions over coarse regions. Through extensive +experimental evaluations and ablation studies, we validate the effectiveness of +our proposed method. The results demonstrate its superior ability to generate +more accurate and robust depth maps across all regions compared to +state-of-the-art methods. The source code will be publicly available at +https://github.com/PingZhou-LF/Light-Field-Depth-Estimation-Based-on-Stitched-EPIs. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ Layout and Task Aware Instruction Prompt for Zero-shot Document Image + Question Answering + + +
+ Layout-aware pre-trained models have achieved significant progress on document
+image question answering. They introduce extra learnable modules into existing
+language models to capture layout information within document images from text
+bounding box coordinates obtained by OCR tools. However, the extra modules
+necessitate pre-training on extensive document images. This prevents these
+methods from directly utilizing off-the-shelf instruction-tuning language
+foundation models, which have recently shown promising potential in zero-shot
+learning. Instead, in this paper, we find that instruction-tuning language
+models like Claude and ChatGPT can understand layout by spaces and line breaks.
+Based on this observation, we propose the LAyout and Task aware Instruction
+Prompt (LATIN-Prompt), which consists of layout-aware document content and
+task-aware instruction. Specifically, the former uses appropriate spaces and
+line breaks to recover the layout information among text segments obtained by
+OCR tools, and the latter ensures that generated answers adhere to formatting
+requirements. Moreover, we propose the LAyout and Task aware Instruction Tuning
+(LATIN-Tuning) to improve the performance of small instruction-tuning models
+like Alpaca. Experimental results show that LATIN-Prompt enables the zero-shot
+performance of Claude and ChatGPT to be comparable to the fine-tuning
+performance of SOTAs on document image question answering, and LATIN-Tuning
+enhances the zero-shot performance of Alpaca significantly. For example,
+LATIN-Prompt improves the performance of Claude and ChatGPT on DocVQA by 263%
+and 20% respectively. LATIN-Tuning improves the performance of Alpaca on DocVQA
+by 87.7%. Quantitative and qualitative analyses demonstrate the effectiveness
+of LATIN-Prompt and LATIN-Tuning. We provide the code in the supplementary
+material and will release it to facilitate future research.
+
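+ A minimal sketch of the layout-recovery idea (spaces and line breaks derived
+from OCR boxes), assuming each OCR result is a (text, x0, y0, x1, y1) tuple in
+pixels; the line-grouping tolerance and pixels-per-character scale are
+illustrative, not the paper's exact procedure:
+
+def layout_text(ocr_boxes, line_tol=10, px_per_char=8):
+    """Render OCR boxes as plain text whose spaces and line breaks mimic the layout."""
+    # Group boxes into lines by vertical position.
+    lines = []
+    for text, x0, y0, x1, y1 in sorted(ocr_boxes, key=lambda b: (b[2], b[1])):
+        for line in lines:
+            if abs(line[0] - y0) <= line_tol:
+                line[1].append((x0, text))
+                break
+        else:
+            lines.append([y0, [(x0, text)]])
+    # Within each line, left-pad each segment according to its horizontal position.
+    rendered = []
+    for _, segs in sorted(lines, key=lambda l: l[0]):
+        out = ""
+        for x0, text in sorted(segs):
+            col = x0 // px_per_char
+            out += " " * max(1 if out else 0, col - len(out)) + text
+        rendered.append(out)
+    return "\n".join(rendered)
+
+# Hypothetical two-column header followed by a value row.
+boxes = [("Invoice No.", 40, 20, 140, 35), ("Date", 400, 22, 460, 35),
+         ("12345", 40, 60, 100, 75), ("2023-08-31", 400, 61, 500, 75)]
+print(layout_text(boxes))  # this layout-preserving text would precede the question
+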
+
+ comment: Add the LATIN-Tuning for Alapca. Code is available at + https://github.com/WenjinW/LATIN-Prompt +
+
+
+
+
+ + ♻ ☆ Multimodal Industrial Anomaly Detection via Hybrid Fusion CVPR 2023 + + +
+ 2D-based Industrial Anomaly Detection has been widely discussed, however, +multimodal industrial anomaly detection based on 3D point clouds and RGB images +still has many untouched fields. Existing multimodal industrial anomaly +detection methods directly concatenate the multimodal features, which leads to +a strong disturbance between features and harms the detection performance. In +this paper, we propose Multi-3D-Memory (M3DM), a novel multimodal anomaly +detection method with hybrid fusion scheme: firstly, we design an unsupervised +feature fusion with patch-wise contrastive learning to encourage the +interaction of different modal features; secondly, we use a decision layer +fusion with multiple memory banks to avoid loss of information and additional +novelty classifiers to make the final decision. We further propose a point +feature alignment operation to better align the point cloud and RGB features. +Extensive experiments show that our multimodal industrial anomaly detection +model outperforms the state-of-the-art (SOTA) methods on both detection and +segmentation precision on MVTec-3D AD dataset. Code is available at +https://github.com/nomewang/M3DM. + +
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Joint Super-Resolution and Inverse Tone-Mapping: A Feature Decomposition + Aggregation Network and A New Benchmark + + +
+ Joint Super-Resolution and Inverse Tone-Mapping (joint SR-ITM) aims to
+increase the resolution and dynamic range of low-resolution and standard
+dynamic range images. Recent networks mainly resort to image decomposition
+techniques with complex multi-branch architectures. However, the fixed
+decomposition techniques largely restrict their power on versatile
+images. To exploit the potential power of the decomposition mechanism, in this
+paper, we generalize it from the image domain to the broader feature domain. To
+this end, we propose a lightweight Feature Decomposition Aggregation Network
+(FDAN). In particular, we design a Feature Decomposition Block (FDB) to achieve
+learnable separation of detail and base feature maps, and develop a
+Hierarchical Feature Decomposition Group by cascading FDBs for powerful
+multi-level feature decomposition. Moreover, to better evaluate the comparison
+methods, we collect a large-scale dataset for joint SR-ITM, i.e., SRITM-4K,
+which provides versatile scenarios for robust model training and evaluation.
+Experimental results on two benchmark datasets demonstrate that our FDAN is
+efficient and outperforms state-of-the-art methods on joint SR-ITM. The code of
+our FDAN and the SRITM-4K dataset are available at
+https://github.com/CS-GangXu/FDAN.
+
+
+ comment: update the authors info and the article template +
+
+
+
+
+ + ♻ ☆ Localization using Multi-Focal Spatial Attention for Masked Face + Recognition + + +
+ Since the beginning of the worldwide COVID-19 pandemic, facial masks have been
+recommended to limit the spread of the disease. However, these masks hide
+certain facial attributes. Hence, it has become difficult for existing face
+recognition systems to perform identity verification on masked faces. In this
+context, it is necessary to develop masked Face Recognition (MFR) for
+contactless biometric recognition systems. Thus, in this paper, we propose
+Complementary Attention Learning and Multi-Focal Spatial Attention, which
+precisely remove masked regions by training complementary spatial attention to
+focus on two distinct regions: masked regions and backgrounds. In our method,
+standard spatial attention and networks focus on unmasked regions, and extract
+mask-invariant features while minimizing the loss of the conventional Face
+Recognition (FR) performance. For conventional FR, we evaluate the performance
+on the IJB-C, Age-DB, CALFW, and CPLFW datasets. We evaluate the MFR
+performance on the ICCV2021-MFR/Insightface track, and demonstrate improved
+performance on both the MFR and FR datasets. Additionally, we empirically
+verify that the spatial attention of the proposed method is more precisely
+activated in unmasked regions.
+
+
+ comment: Accepted at FG 2023 - InterID Workshop +
+
+
+
+
+ + ♻ ☆ VideoGen: A Reference-Guided Latent Diffusion Approach for High + Definition Text-to-Video Generation + + +
+ In this paper, we present VideoGen, a text-to-video generation approach,
+which can generate a high-definition video with high frame fidelity and strong
+temporal consistency using reference-guided latent diffusion. We leverage an
+off-the-shelf text-to-image generation model, e.g., Stable Diffusion, to
+generate an image with high content quality from the text prompt, as a
+reference image to guide video generation. Then, we introduce an efficient
+cascaded latent diffusion module conditioned on both the reference image and
+the text prompt, for generating latent video representations, followed by a
+flow-based temporal upsampling step to improve the temporal resolution.
+Finally, we map latent video representations into a high-definition video
+through an enhanced video decoder. During training, we use the first frame of a
+ground-truth video as the reference image for training the cascaded latent
+diffusion module. The main characteristics of our approach include: the
+reference image generated by the text-to-image model improves the visual
+fidelity; using it as the condition makes the diffusion model focus more on
+learning the video dynamics; and the video decoder is trained over unlabeled
+video data, thus benefiting from high-quality, easily available videos.
+VideoGen sets a new state-of-the-art in text-to-video generation in terms of
+both qualitative and quantitative evaluation. See
+\url{https://videogen.github.io/VideoGen/} for more samples.
+
+
+ comment: 8pages, 8figures, project page: https://videogen.github.io/VideoGen/ +
+
+
+
+
+ + ♻ ☆ RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph + Classification + + +
+ Graph classification is a crucial task in many real-world multimedia
+applications, where graphs can represent various multimedia data types such as
+images, videos, and social networks. Previous efforts have applied graph neural
+networks (GNNs) in situations where the class distribution is balanced.
+However, real-world data typically exhibit long-tailed class distributions,
+resulting in a bias towards the head classes when using GNNs and limited
+generalization ability over the tail classes. Recent approaches mainly focus on
+re-balancing different classes during model training, which fails to explicitly
+introduce new knowledge and sacrifices the performance of the head classes. To
+address these drawbacks, we propose a novel framework called Retrieval
+Augmented Hybrid Network (RAHNet) to jointly learn a robust feature extractor
+and an unbiased classifier in a decoupled manner. In the feature extractor
+training stage, we develop a graph retrieval module to search for relevant
+graphs that directly enrich the intra-class diversity for the tail classes.
+Moreover, we innovatively optimize a category-centered supervised contrastive
+loss to obtain discriminative representations, which is more suitable for
+long-tailed scenarios. In the classifier fine-tuning stage, we balance the
+classifier weights with two weight regularization techniques, i.e., Max-norm
+and weight decay. Experiments on various popular benchmarks verify the
+superiority of the proposed method against state-of-the-art approaches.
+
+
+ comment: Accepted by the ACM International Conference on Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Understanding Prompt Tuning for V-L Models Through the Lens of Neural + Collapse + + +
+ Large-scale vision-language (V-L) models have demonstrated remarkable +generalization capabilities for downstream tasks through prompt tuning. +However, the mechanisms behind the learned text representations are unknown, +limiting further generalization gains, especially under class imbalance +scenarios. Recent advances in the neural collapse (NC) phenomenon of +vision-only models suggest that the optimal representation structure is the +simplex ETF, which paves the way to study representations in V-L models. In +this paper, we make the first attempt to use NC for examining the +representations in V-L models via prompt tuning. It is found that NC optimality +of text-to-image representations shows a positive correlation with downstream +generalizability, which is more severe under class imbalance settings. To +improve the representations, we propose Neural-collapse-anchored Prompt Tuning +(NPT), a novel method that learns prompts with text and image representations +that satisfy the same simplex ETF. NPT incorporates two regularization terms: +language-modality collapse and multi-modality isomorphism; and it is compatible +with other prompt tuning methods. Extensive experiments show that NPT can +consistently help to improve existing prompt tuning techniques across 11 +datasets for both balanced and imbalanced settings. + +
+
+
+
+
+ + ♻ ☆ Unlearnable Examples Give a False Sense of Security: Piercing through + Unexploitable Data with Learnable Examples + + +
+ Safeguarding data from unauthorized exploitation is vital for privacy and
+security, especially given the recent surge of research on security breaches
+such as adversarial/membership attacks. To this end, \textit{unlearnable
+examples} (UEs) have been recently proposed as a compelling protection, by
+adding imperceptible perturbation to data so that models trained on them cannot
+classify them accurately on the original clean distribution. Unfortunately, we
+find UEs provide a false sense of security, because they cannot stop
+unauthorized users from utilizing other unprotected data to remove the
+protection, by turning unlearnable data into learnable again. Motivated by this
+observation, we formally define a new threat by introducing \textit{learnable
+unauthorized examples} (LEs), which are UEs with their protection removed. The
+core of this approach is a novel purification process that projects UEs onto
+the manifold of LEs. This is realized by a new joint-conditional diffusion
+model which denoises UEs conditioned on the pixel and perceptual similarity
+between UEs and LEs. Extensive experiments demonstrate that LE delivers
+state-of-the-art countering performance against both supervised UEs and
+unsupervised UEs in various scenarios, which is the first generalizable
+countermeasure to UEs across supervised learning and unsupervised learning. Our
+code is available at \url{https://github.com/jiangw-0/LE_JCDP}.
+
+
+ comment: Accepted in MM 2023 +
+
+
+
+
+ + ♻ ☆ Autonomous Agriculture Robot for Smart Farming + + +
+ This project aims to develop and demonstrate a ground robot with intelligence
+capable of conducting semi-autonomous farm operations for different low-height
+vegetable crops, referred to as the Agriculture Application Robot (AAR). AAR is
+a lightweight, solar-electric powered robot that uses intelligent perception
+for detecting and classifying plants and their characteristics. The system also
+has a robotic arm for the autonomous weed cutting process. The robot can
+deliver fertilizer spray, insecticide, herbicide, and other fluids to targets
+such as crops, weeds, and other pests. Besides, it provides information for
+future research into higher-level tasks such as yield estimation and crop and
+soil health monitoring. We present the design of the robot and the associated
+experiments, which show promising results in real-world environments.
+
+
+ comment: Due to author interest conflicts +
+
+
+
+
+ + ♻ ☆ DRTAM: Dual Rank-1 Tensor Attention Module + + +
+ Recently, attention mechanisms have been extensively investigated in computer
+vision, but few of them show excellent performance on both large and mobile
+networks. This paper proposes the Dual Rank-1 Tensor Attention Module (DRTAM),
+a novel residual-attention-learning-guided attention module for feed-forward
+convolutional neural networks. Given a 3D feature tensor map, DRTAM first
+generates three 2D feature descriptors along three axes. Then, using the three
+descriptors, DRTAM sequentially infers two rank-1 tensor attention maps, the
+initial attention map and the complement attention map, combines them, and
+multiplies the result with the input feature map for adaptive feature
+refinement (see Fig. 1(c)). To generate the two attention maps, DRTAM
+introduces the rank-1 tensor attention module (RTAM) and the residual
+descriptors extraction module (RDEM): RTAM divides each 2D feature descriptor
+into several chunks and generates three factor vectors of a rank-1 tensor
+attention map by employing strip pooling on each chunk, so that local and
+long-range contextual information can be captured along the three dimensions
+respectively; RDEM generates three 2D feature descriptors of the residual
+feature to produce the complement attention map, using the three factor vectors
+of the initial attention map and the three descriptors of the input feature.
+Extensive experimental results on ImageNet-1K, MS COCO and PASCAL VOC
+demonstrate that DRTAM achieves competitive performance on both large and
+mobile networks compared with other state-of-the-art attention modules.
+
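+ A minimal sketch of the rank-1 structure underlying the attention maps above:
+three axis-wise factor vectors are combined by an outer product into a full
+(C, H, W) attention tensor. The factor vectors here come from plain global
+pooling; the chunking, strip pooling, and residual-descriptor parts of RTAM and
+RDEM are omitted:
+
+import torch
+
+def rank1_attention(x):
+    """x: (B, C, H, W) feature map -> (B, C, H, W) rank-1 attention map."""
+    fc = torch.sigmoid(x.mean(dim=(2, 3)))   # channel factor vector, (B, C)
+    fh = torch.sigmoid(x.mean(dim=(1, 3)))   # height factor vector, (B, H)
+    fw = torch.sigmoid(x.mean(dim=(1, 2)))   # width factor vector, (B, W)
+    return torch.einsum("bc,bh,bw->bchw", fc, fh, fw)
+
+x = torch.randn(2, 32, 7, 7)
+refined = x * rank1_attention(x)             # adaptive feature refinement
+print(refined.shape)
+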
+
+ comment: There exist some problems in the experiments. Besides, we find that
+ the structure of DRTAM can be optimized
+
+
+
+
+
+ + ♻ ☆ EGformer: Equirectangular Geometry-biased Transformer for 360 Depth + Estimation ICCV23 + + +
+ Estimating the depths of equirectangular (i.e., 360) images (EIs) is
+challenging given the distorted 180 x 360 field-of-view, which is hard to
+address via convolutional neural networks (CNNs). Although a transformer with
+global attention achieves significant improvements over CNNs for the EI depth
+estimation task, it is computationally inefficient, which raises the need for a
+transformer with local attention. However, to apply local attention
+successfully for EIs, a specific strategy, which addresses distorted
+equirectangular geometry and the limited receptive field simultaneously, is
+required. Prior works have addressed only one of them, occasionally resulting
+in unsatisfactory depths. In this paper, we propose an equirectangular
+geometry-biased transformer termed EGformer. While limiting the computational
+cost and the number of network parameters, EGformer enables the extraction of
+equirectangular geometry-aware local attention with a large receptive field. To
+achieve this, we actively utilize the equirectangular geometry as the bias for
+the local attention instead of struggling to reduce the distortion of EIs. As
+compared to the most recent EI depth estimation studies, the proposed approach
+yields the best depth outcomes overall with the lowest computational cost and
+the fewest parameters, demonstrating the effectiveness of the proposed methods.
+
+
+ comment: 12 pages, Accepted to ICCV23, Camera ready version +
+
+
+
+
+ + ♻ ☆ Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models ICCV 2023 + + +
+ With the overwhelming trend of mask image modeling led by MAE, generative +pre-training has shown a remarkable potential to boost the performance of +fundamental models in 2D vision. However, in 3D vision, the over-reliance on +Transformer-based backbones and the unordered nature of point clouds have +restricted the further development of generative pre-training. In this paper, +we propose a novel 3D-to-2D generative pre-training method that is adaptable to +any point cloud model. We propose to generate view images from different +instructed poses via the cross-attention mechanism as the pre-training scheme. +Generating view images has more precise supervision than its point cloud +counterpart, thus assisting 3D backbones to have a finer comprehension of the +geometrical structure and stereoscopic relations of the point cloud. +Experimental results have proved the superiority of our proposed 3D-to-2D +generative pre-training over previous pre-training methods. Our method is also +effective in boosting the performance of architecture-oriented approaches, +achieving state-of-the-art performance when fine-tuning on ScanObjectNN +classification and ShapeNetPart segmentation tasks. Code is available at +https://github.com/wangzy22/TAP. + +
+
+ comment: Accepted to ICCV 2023, project page: https://tap.ivg-research.xyz +
+
+
+
+
+ + ♻ ☆ ClipSitu: Effectively Leveraging CLIP for Conditional Predictions in + Situation Recognition + + +
+ Situation Recognition is the task of generating a structured summary of what
+is happening in an image using an activity verb and the semantic roles played
+by actors and objects. In this task, the same activity verb can describe a
+diverse set of situations, and the same actor or object category can play a
+diverse set of semantic roles depending on the situation depicted in the
+image. Hence a situation recognition model needs to understand the context of
+the image and the visual-linguistic meaning of semantic roles. Therefore, we
+leverage the CLIP foundational model that has learned the context of images via
+language descriptions. We show that deeper-and-wider multi-layer perceptron
+(MLP) blocks obtain noteworthy results for the situation recognition task by
+using CLIP image and text embedding features, and even outperform the
+state-of-the-art CoFormer, a Transformer-based model, thanks to the external
+implicit visual-linguistic knowledge encapsulated by CLIP and the expressive
+power of modern MLP block designs. Motivated by this, we design a
+cross-attention-based Transformer using CLIP visual tokens that model the
+relation between textual roles and visual entities. Our cross-attention-based
+Transformer known as ClipSitu XTF outperforms existing state-of-the-art by a
+large margin of 14.1\% on semantic role labelling (value) for top-1 accuracy
+using the imSitu dataset. Similarly, our ClipSitu XTF obtains state-of-the-art
+situation localization performance. We will make the code publicly available.
+
+
+ comment: State-of-the-art results on Grounded Situation Recognition +
+
+
+
+
+ + ♻ ☆ DMKD: Improving Feature-based Knowledge Distillation for Object + Detection Via Dual Masking Augmentation + + +
+ Recent mainstream masked distillation methods function by reconstructing +selectively masked areas of a student network from the feature map of its +teacher counterpart. In these methods, the masked regions need to be properly +selected, such that reconstructed features encode sufficient discrimination and +representation capability like the teacher feature. However, previous masked +distillation methods only focus on spatial masking, making the resulting masked +areas biased towards spatial importance without encoding informative channel +clues. In this study, we devise a Dual Masked Knowledge Distillation (DMKD) +framework which can capture both spatially important and channel-wise +informative clues for comprehensive masked feature reconstruction. More +specifically, we employ dual attention mechanism for guiding the respective +masking branches, leading to reconstructed feature encoding dual significance. +Furthermore, fusing the reconstructed features is achieved by self-adjustable +weighting strategy for effective feature distillation. Our experiments on +object detection task demonstrate that the student networks achieve performance +gains of 4.1% and 4.3% with the help of our method when RetinaNet and Cascade +Mask R-CNN are respectively used as the teacher networks, while outperforming +the other state-of-the-art distillation methods. + +
+
+
+
+
+ + ♻ ☆ Bandwidth-efficient Inference for Neural Image Compression ICASSP 2024 + + +
+ With neural networks growing deeper and feature maps growing larger, limited
+communication bandwidth with external memory (or DRAM) and power constraints
+become a bottleneck in implementing network inference on mobile and edge
+devices. In this paper, we propose an end-to-end differentiable,
+bandwidth-efficient neural inference method in which activations are compressed
+with a neural data compression method. Specifically, we propose a
+transform-quantization-entropy coding pipeline for activation compression with
+symmetric exponential Golomb coding and a data-dependent Gaussian entropy model
+for arithmetic coding. Optimized with existing model quantization methods, the
+low-level task of image compression can achieve up to 19x bandwidth reduction
+with 6.21x energy saving.
+
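+ For reference, a minimal sketch of order-0 exponential-Golomb coding with a
+symmetric (signed) value mapping, which is one common way to realize the
+"symmetric exponential Golomb coding" mentioned above; the exact variant used
+in the paper may differ:
+
+def exp_golomb_encode(n):
+    """Order-0 exp-Golomb codeword of a non-negative integer n."""
+    b = bin(n + 1)[2:]
+    return "0" * (len(b) - 1) + b          # leading zeros, then binary of n+1
+
+def exp_golomb_decode(bits, pos=0):
+    """Decode one codeword starting at `pos`; returns (code number, new position)."""
+    zeros = 0
+    while bits[pos] == "0":
+        zeros += 1
+        pos += 1
+    return int(bits[pos:pos + zeros + 1], 2) - 1, pos + zeros + 1
+
+def encode_values(values):
+    """Symmetric mapping 0, 1, -1, 2, -2, ... -> code numbers 0, 1, 2, 3, 4, ..."""
+    return "".join(exp_golomb_encode(2 * abs(v) - (v > 0)) for v in values)
+
+def decode_values(bits):
+    vals, pos = [], 0
+    while pos < len(bits):
+        n, pos = exp_golomb_decode(bits, pos)
+        vals.append((n + 1) // 2 if n % 2 else -(n // 2))
+    return vals
+
+# Quantized activation residuals (hypothetical); near-zero values get short codes.
+stream = encode_values([0, 1, -1, 0, 3])
+print(stream, decode_values(stream))       # round-trips to [0, 1, -1, 0, 3]
+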
+
+ comment: 9 pages, 6 figures, submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Pixel-Aware Stable Diffusion for Realistic Image Super-resolution and + Personalized Stylization + + +
+ Realistic image super-resolution (Real-ISR) aims to reproduce perceptually
+realistic image details from a low-quality input. The commonly used adversarial
+training based Real-ISR methods often introduce unnatural visual artifacts and
+fail to generate realistic textures for natural scene images. The recently
+developed generative stable diffusion models provide a potential solution to
+Real-ISR with pre-learned strong image priors. However, the existing methods
+along this line either fail to keep faithful pixel-wise image structures or
+resort to extra skipped connections to reproduce details, which requires
+additional training in image space and limits their extension to other related
+tasks in latent space such as image stylization. In this work, we propose a
+pixel-aware stable diffusion (PASD) network to achieve robust Real-ISR as well
+as personalized stylization. Specifically, a pixel-aware cross attention module
+is introduced to enable diffusion models to perceive local image structures at
+the pixel level, while a degradation removal module is used to extract
+degradation-insensitive features to guide the diffusion process together with
+high-level image information. By simply replacing the base diffusion model with
+a personalized one, our method can generate diverse stylized images without the
+need to collect pairwise training data. PASD can be easily integrated into
+existing diffusion models such as Stable Diffusion. Experiments on Real-ISR and
+personalized stylization demonstrate the effectiveness of our proposed
+approach. The source code and models can be found at
+\url{https://github.com/yangxy/PASD}.
+
+
+
+
+
+ + ♻ ☆ Exploring the Robustness of Human Parsers Towards Common Corruptions + + +
+ Human parsing aims to segment each pixel of the human image with fine-grained
+semantic categories. However, current human parsers trained with clean data are
+easily confused by numerous image corruptions such as blur and noise. To
+improve the robustness of human parsers, in this paper, we construct three
+corruption robustness benchmarks, termed LIP-C, ATR-C, and
+Pascal-Person-Part-C, to assist us in evaluating the risk tolerance of human
+parsing models. Inspired by the data augmentation strategy, we propose a novel
+heterogeneous augmentation-enhanced mechanism to bolster robustness under
+commonly corrupted conditions. Specifically, two types of data augmentations
+from different views, i.e., image-aware augmentation and model-aware
+image-to-image transformation, are integrated in a sequential manner for
+adapting to unforeseen image corruptions. The image-aware augmentation can
+enrich the diversity of training images with the help of common image
+operations. The model-aware augmentation strategy improves the diversity of
+input data by considering the model's randomness. The proposed method is
+model-agnostic, and it can plug and play into arbitrary state-of-the-art human
+parsing frameworks. The experimental results show that the proposed method
+demonstrates good universality: it can improve the robustness of human
+parsing models and even semantic segmentation models when facing various
+common image corruptions. Meanwhile, it can still obtain comparable
+performance on clean data.
+
+
+ comment: Accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ♻ ☆ Implicit Neural Image Stitching With Enhanced and Blended Feature + Reconstruction + + +
+ Existing frameworks for image stitching often provide visually reasonable
+stitching results. However, they suffer from blurry artifacts and disparities in
+illumination, depth level, etc. Although recent learning-based stitching
+methods relax such disparities, they sacrifice image quality, failing to
+capture high-frequency details in stitched images. To address the problem, we
+propose a novel approach, implicit Neural Image Stitching (NIS), that extends
+arbitrary-scale super-resolution. Our method estimates Fourier coefficients of
+images for quality-enhancing warps. Then, the suggested model blends color
+mismatches and misalignment in the latent space and decodes the features into
+RGB values of stitched images. Our experiments show that our approach resolves
+the low-definition imaging of previous deep image stitching with favorable,
+accelerated image-enhancing methods. Our source code is available at
+https://github.com/minshu-kim/NIS.
+
+
+
+
+
+ + ♻ ☆ Learning Residual Elastic Warps for Image Stitching under Dirichlet + Boundary Condition + + +
+ Recently proposed learning-based elastic warps enable deep image stitching to
+align images exposed to large parallax errors. Despite the remarkable
+alignments, the methods struggle with occasional holes or discontinuity between
+overlapping and non-overlapping regions of a target image, as the applied
+training strategy mostly focuses on overlap region alignment. As a result, they
+require additional modules such as a seam finder and image inpainting for
+hiding discontinuity and filling holes, respectively. In this work, we suggest
+Recurrent Elastic Warps (REwarp) that address the problem with a Dirichlet
+boundary condition and boost performance by residual learning for recurrent
+misalignment correction. Specifically, REwarp predicts a homography and a
+Thin-plate Spline (TPS) under the boundary constraint for discontinuity- and
+hole-free image stitching. Our experiments show the favorable alignments and
+the competitive computational costs of REwarp compared to the existing
+stitching methods. Our source code is available at
+https://github.com/minshu-kim/REwarp.
+
+
+
+
+
+ + ♻ ☆ Internet Explorer: Targeted Representation Learning on the Open Web ICML 2023 + + +
+ Modern vision models typically rely on fine-tuning general-purpose models +pre-trained on large, static datasets. These general-purpose models only +capture the knowledge within their pre-training datasets, which are tiny, +out-of-date snapshots of the Internet -- where billions of images are uploaded +each day. We suggest an alternate approach: rather than hoping our static +datasets transfer to our desired tasks after large-scale pre-training, we +propose dynamically utilizing the Internet to quickly train a small-scale model +that does extremely well on the task at hand. Our approach, called Internet +Explorer, explores the web in a self-supervised manner to progressively find +relevant examples that improve performance on a desired target dataset. It +cycles between searching for images on the Internet with text queries, +self-supervised training on downloaded images, determining which images were +useful, and prioritizing what to search for next. We evaluate Internet Explorer +across several datasets and show that it outperforms or matches CLIP oracle +performance by using just a single GPU desktop to actively query the Internet +for 30--40 hours. Results, visualizations, and videos at +https://internet-explorer-ssl.github.io/ + +
+
+ comment: In ICML 2023. Website at https://internet-explorer-ssl.github.io/ +
+
+
+
+
+ + ♻ ☆ Domain Adaptation for Efficiently Fine-tuning Vision Transformer with + Encrypted Images + + +
+ In recent years, deep neural networks (DNNs) trained with transformed data +have been applied to various applications such as privacy-preserving learning, +access control, and adversarial defenses. However, the use of transformed data +decreases the performance of models. Accordingly, in this paper, we propose a +novel method for fine-tuning models with transformed images under the use of +the vision transformer (ViT). The proposed domain adaptation method does not +cause the accuracy degradation of models, and it is carried out on the basis of +the embedding structure of ViT. In experiments, we confirmed that the proposed +method prevents accuracy degradation even when using encrypted images with the +CIFAR-10 and CIFAR-100 datasets. + +
+
+ comment: Accepted by APSIPA 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Extending Transductive Knowledge Graph Embedding Models for Inductive + Logical Relational Inference + + +
+ Many downstream inference tasks for knowledge graphs, such as relation +prediction, have been handled successfully by knowledge graph embedding +techniques in the transductive setting. To address the inductive setting +wherein new entities are introduced into the knowledge graph at inference time, +more recent work opts for models which learn implicit representations of the +knowledge graph through a complex function of a network's subgraph structure, +often parametrized by graph neural network architectures. These come at the +cost of increased parametrization, reduced interpretability and limited +generalization to other downstream inference tasks. In this work, we bridge the +gap between traditional transductive knowledge graph embedding approaches and +more recent inductive relation prediction models by introducing a generalized +form of harmonic extension which leverages representations learned through +transductive embedding methods to infer representations of new entities +introduced at inference time as in the inductive setting. This harmonic +extension technique provides the best such approximation, can be implemented +via an efficient iterative scheme, and can be employed to answer a family of +conjunctive logical queries over the knowledge graph, further expanding the +capabilities of transductive embedding methods. In experiments on a number of +large-scale knowledge graph embedding benchmarks, we find that this approach +for extending the functionality of transductive knowledge graph embedding +models to perform knowledge graph completion and answer logical queries in the +inductive setting is competitive with--and in some scenarios +outperforms--several state-of-the-art models derived explicitly for such +inductive tasks. + +
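+ A minimal sketch of the iterative scheme in its simplest form: embeddings of
+known entities act as fixed boundary values, and each new entity's embedding is
+repeatedly replaced by the average of its neighbors' embeddings until
+convergence. Relation-specific weighting, which the generalized harmonic
+extension described above would add, is omitted here:
+
+import numpy as np
+
+def harmonic_extension(emb, known, neighbors, iters=200, tol=1e-6):
+    """emb: (n, d) embeddings (rows of known entities hold transductive embeddings).
+    known: boolean mask of entities whose embeddings stay fixed.
+    neighbors: list of neighbor index lists taken from the knowledge graph."""
+    emb = emb.copy()
+    new_entities = np.where(~known)[0]
+    for _ in range(iters):
+        delta = 0.0
+        for i in new_entities:
+            if neighbors[i]:
+                target = emb[neighbors[i]].mean(axis=0)
+                delta = max(delta, float(np.abs(target - emb[i]).max()))
+                emb[i] = target
+        if delta < tol:
+            break
+    return emb
+
+# Hypothetical: 3 known entities and 1 new entity linked to entities 0 and 2.
+emb = np.zeros((4, 2))
+emb[:3] = [[1.0, 0.0], [0.0, 1.0], [0.0, -1.0]]
+known = np.array([True, True, True, False])
+print(harmonic_extension(emb, known, neighbors=[[], [], [], [0, 2]])[3])  # [0.5 -0.5]
+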
+
+
+
+
+ + ☆ VideolandGPT: A User Study on a Conversational Recommender System RecSys2023 + + +
+ This paper investigates how large language models (LLMs) can enhance +recommender systems, with a specific focus on Conversational Recommender +Systems that leverage user preferences and personalised candidate selections +from existing ranking models. We introduce VideolandGPT, a recommender system +for a Video-on-Demand (VOD) platform, Videoland, which uses ChatGPT to select +from a predetermined set of contents, considering the additional context +indicated by users' interactions with a chat interface. We evaluate ranking +metrics, user experience, and fairness of recommendations, comparing a +personalised and a non-personalised version of the system, in a between-subject +user study. Our results indicate that the personalised version outperforms the +non-personalised in terms of accuracy and general user satisfaction, while both +versions increase the visibility of items which are not in the top of the +recommendation lists. However, both versions present inconsistent behavior in +terms of fairness, as the system may generate recommendations which are not +available on Videoland. + +
+
+ comment: Preprint for KARS2023 (5th Knowledge-aware and Conversational + Recommender Systems Workshop at RecSys2023) +
+
+
+
+
+ + ☆ Evaluating ChatGPT as a Recommender System: A Rigorous Approach + + +
+ Recent popularity surrounds large AI language models due to their impressive +natural language capabilities. They contribute significantly to +language-related tasks, including prompt-based learning, making them valuable +for various specific tasks. This approach unlocks their full potential, +enhancing precision and generalization. Research communities are actively +exploring their applications, with ChatGPT receiving recognition. Despite +extensive research on large language models, their potential in recommendation +scenarios still needs to be explored. This study aims to fill this gap by +investigating ChatGPT's capabilities as a zero-shot recommender system. Our +goals include evaluating its ability to use user preferences for +recommendations, reordering existing recommendation lists, leveraging +information from similar users, and handling cold-start situations. We assess +ChatGPT's performance through comprehensive experiments using three datasets +(MovieLens Small, Last.FM, and Facebook Book). We compare ChatGPT's performance +against standard recommendation algorithms and other large language models, +such as GPT-3.5 and PaLM-2. To measure recommendation effectiveness, we employ +widely-used evaluation metrics like Mean Average Precision (MAP), Recall, +Precision, F1, normalized Discounted Cumulative Gain (nDCG), Item Coverage, +Expected Popularity Complement (EPC), Average Coverage of Long Tail (ACLT), +Average Recommendation Popularity (ARP), and Popularity-based Ranking-based +Equal Opportunity (PopREO). Through thoroughly exploring ChatGPT's abilities in +recommender systems, our study aims to contribute to the growing body of +research on the versatility and potential applications of large language +models. Our experiment code is available on the GitHub repository: +https://github.com/sisinflab/Recommender-ChatGPT + +
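+ For concreteness, a minimal sketch of two of the ranking metrics listed above
+(average precision, whose mean over users gives MAP, and nDCG@k) as they are
+conventionally defined; the study's exact implementation and cutoffs are not
+specified here:
+
+import math
+
+def average_precision(ranked, relevant):
+    """Mean of precision@i over the positions i where a relevant item appears."""
+    hits, score = 0, 0.0
+    for i, item in enumerate(ranked, start=1):
+        if item in relevant:
+            hits += 1
+            score += hits / i
+    return score / len(relevant) if relevant else 0.0
+
+def ndcg_at_k(ranked, relevant, k):
+    """Binary-relevance nDCG at cutoff k."""
+    dcg = sum(1.0 / math.log2(i + 1)
+              for i, item in enumerate(ranked[:k], start=1) if item in relevant)
+    ideal = sum(1.0 / math.log2(i + 1) for i in range(1, min(k, len(relevant)) + 1))
+    return dcg / ideal if ideal else 0.0
+
+# Hypothetical recommendation list vs. a user's held-out relevant items.
+ranked, relevant = ["a", "b", "c", "d"], {"b", "d"}
+print(average_precision(ranked, relevant))  # (1/2 + 2/4) / 2 = 0.5
+print(ndcg_at_k(ranked, relevant, k=4))
+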
+
+
+
+
+ + ☆ Learning Compact Compositional Embeddings via Regularized Pruning for + Recommendation ICDM '23 + + +
+ Latent factor models are the dominant backbones of contemporary recommender +systems (RSs) given their performance advantages, where a unique vector +embedding with a fixed dimensionality (e.g., 128) is required to represent each +entity (commonly a user/item). Due to the large number of users and items on +e-commerce sites, the embedding table is arguably the least memory-efficient +component of RSs. For any lightweight recommender that aims to efficiently +scale with the growing size of users/items or to remain applicable in +resource-constrained settings, existing solutions either reduce the number of +embeddings needed via hashing, or sparsify the full embedding table to switch +off selected embedding dimensions. However, as hash collision arises or +embeddings become overly sparse, especially when adapting to a tighter memory +budget, those lightweight recommenders inevitably have to compromise their +accuracy. To this end, we propose a novel compact embedding framework for RSs, +namely Compositional Embedding with Regularized Pruning (CERP). Specifically, +CERP represents each entity by combining a pair of embeddings from two +independent, substantially smaller meta-embedding tables, which are then +jointly pruned via a learnable element-wise threshold. In addition, we +innovatively design a regularized pruning mechanism in CERP, such that the two +sparsified meta-embedding tables are encouraged to encode information that is +mutually complementary. Given the compatibility with agnostic latent factor +models, we pair CERP with two popular recommendation models for extensive +experiments, where results on two real-world datasets under different memory +budgets demonstrate its superiority against state-of-the-art baselines. The +codebase of CERP is available in https://github.com/xurong-liang/CERP. + +
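+ A minimal sketch of the compositional part described above, using a
+quotient-remainder split of the entity id into two small meta-embedding tables
+and a learnable element-wise soft threshold for pruning; the split rule and the
+thresholding form are illustrative assumptions, not necessarily CERP's exact
+construction:
+
+import torch
+import torch.nn as nn
+
+class CompositionalEmbedding(nn.Module):
+    """Represents up to q_size * r_size entities with two small meta-embedding tables."""
+    def __init__(self, q_size, r_size, dim):
+        super().__init__()
+        self.r_size = r_size
+        self.q_table = nn.Embedding(q_size, dim)
+        self.r_table = nn.Embedding(r_size, dim)
+        # Learnable element-wise pruning thresholds for each meta-embedding entry.
+        self.q_thresh = nn.Parameter(torch.full((q_size, dim), -4.0))
+        self.r_thresh = nn.Parameter(torch.full((r_size, dim), -4.0))
+
+    @staticmethod
+    def _soft_prune(w, s):
+        # Elements whose magnitude falls below sigmoid(s) are zeroed out.
+        return torch.sign(w) * torch.relu(w.abs() - torch.sigmoid(s))
+
+    def forward(self, entity_ids):                       # (B,) long tensor
+        q = entity_ids // self.r_size
+        r = entity_ids % self.r_size
+        eq = self._soft_prune(self.q_table(q), self.q_thresh[q])
+        er = self._soft_prune(self.r_table(r), self.r_thresh[r])
+        return eq + er                                   # combined compact embedding
+
+emb = CompositionalEmbedding(q_size=1000, r_size=1000, dim=64)  # covers 1M entities
+print(emb(torch.tensor([0, 999999])).shape)                     # torch.Size([2, 64])
+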
+
+ comment: Accepted by ICDM '23 +
+
+
+
+
+ + ☆ Behind Recommender Systems: the Geography of the ACM RecSys Community + + +
+ The amount and dissemination rate of media content accessible online is +nowadays overwhelming. Recommender Systems filter this information into +manageable streams or feeds, adapted to our personal needs or preferences. It +is of utter importance that algorithms employed to filter information do not +distort or cut out important elements from our perspectives of the world. Under +this principle, it is essential to involve diverse views and teams from the +earliest stages of their design and development. This has been highlighted, for +instance, in recent European Union regulations such as the Digital Services +Act, via the requirement of risk monitoring, including the risk of +discrimination, and the AI Act, through the requirement to involve people with +diverse backgrounds in the development of AI systems. We look into the +geographic diversity of the recommender systems research community, +specifically by analyzing the affiliation countries of the authors who +contributed to the ACM Conference on Recommender Systems (RecSys) during the +last 15 years. This study has been carried out in the framework of the +Diversity in AI - DivinAI project, whose main objective is the long-term +monitoring of diversity in AI forums through a set of indexes. + +
+
+ comment: Presented at the 6th FAccTRec Workshop: Responsible Recommendation + (FAccTRec '23), September 18, 2023, Singapore +
+
+
+
+
+ + ♻ ☆ A Decade of Scholarly Research on Open Knowledge Graphs + + +
+ The proliferation of open knowledge graphs has led to a surge in scholarly +research on the topic over the past decade. This paper presents a bibliometric +analysis of the scholarly literature on open knowledge graphs published between +2013 and 2023. The study aims to identify the trends, patterns, and impact of +research in this field, as well as the key topics and research questions that +have emerged. The work uses bibliometric techniques to analyze a sample of 4445 +scholarly articles retrieved from Scopus. The findings reveal an +ever-increasing number of publications on open knowledge graphs published every +year, particularly in developed countries (+50 per year). These outputs are +published in highly-referred scholarly journals and conferences. The study +identifies three main research themes: (1) knowledge graph construction and +enrichment, (2) evaluation and reuse, and (3) fusion of knowledge graphs into +NLP systems. Within these themes, the study identifies specific tasks that have +received considerable attention, including entity linking, knowledge graph +embedding, and graph neural networks. + +
+
+
+
+
+ + ♻ ☆ ZC3: Zero-Shot Cross-Language Code Clone Detection + + +
+ Developers introduce code clones to improve programming productivity. Many
+existing studies have achieved impressive performance in monolingual code clone
+detection. However, during software development, more and more developers write
+semantically equivalent programs in different languages to support different
+platforms and help developers translate projects from one language to another.
+Considering that collecting cross-language parallel data, especially for
+low-resource languages, is expensive and time-consuming, how to design an
+effective cross-language model that does not rely on any parallel data is a
+significant problem. In this paper, we propose a novel method named ZC3 for
+Zero-shot Cross-language Code Clone detection. ZC3 designs the contrastive
+snippet prediction to form an isomorphic representation space among different
+programming languages. Based on this, ZC3 exploits domain-aware learning and
+cycle consistency learning to further constrain the model to generate
+representations that are aligned among different languages while remaining
+discriminative for different types of clones. To evaluate our approach, we
+conduct extensive experiments on four representative cross-language clone
+detection datasets. Experimental results show that ZC3 outperforms the
+state-of-the-art baselines by 67.12%, 51.39%, 14.85%, and 53.01% on the MAP
+score on these datasets, respectively. We further investigate the
+representational distribution of different languages and discuss the
+effectiveness of our method.
+
+
+ comment: Accepted by the 38th IEEE/ACM International Conference on Automated + Software Engineering (ASE 2023) +
+
+
+
+
+ + ♻ ☆ RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph + Classification + + +
+ Graph classification is a crucial task in many real-world multimedia +applications, where graphs can represent various multimedia data types such as +images, videos, and social networks. Previous efforts have applied graph neural +networks (GNNs) in balanced situations where the class distribution is +balanced. However, real-world data typically exhibit long-tailed class +distributions, resulting in a bias towards the head classes when using GNNs and +limited generalization ability over the tail classes. Recent approaches mainly +focus on re-balancing different classes during model training, which fails to +explicitly introduce new knowledge and sacrifices the performance of the head +classes. To address these drawbacks, we propose a novel framework called +Retrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature +extractor and an unbiased classifier in a decoupled manner. In the feature +extractor training stage, we develop a graph retrieval module to search for +relevant graphs that directly enrich the intra-class diversity for the tail +classes. Moreover, we innovatively optimize a category-centered supervised +contrastive loss to obtain discriminative representations, which is more +suitable for long-tailed scenarios. In the classifier fine-tuning stage, we +balance the classifier weights with two weight regularization techniques, i.e., +Max-norm and weight decay. Experiments on various popular benchmarks verify the +superiority of the proposed method against state-of-the-art approaches. + +
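+ The classifier fine-tuning stage above mentions Max-norm and weight decay.
+Below is a hypothetical sketch of a Max-norm projection applied to a classifier
+weight matrix after each optimizer step; weight decay itself would simply be
+set on the optimizer.
+
+```python
+import torch
+
+def max_norm_(weight: torch.Tensor, max_val: float = 1.0) -> None:
+    # Re-project each class weight vector (row) onto an L2 ball of radius max_val.
+    with torch.no_grad():
+        norms = weight.norm(dim=1, keepdim=True).clamp(min=1e-12)
+        weight.mul_(norms.clamp(max=max_val) / norms)
+```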
+
+ comment: Accepted by the ACM International Conference on Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Impression-Informed Multi-Behavior Recommender System: A Hierarchical + Graph Attention Approach + + +
+ While recommender systems have significantly benefited from implicit +feedback, they have often missed the nuances of multi-behavior interactions +between users and items. Historically, these systems either amalgamated all +behaviors, such as \textit{impression} (formerly \textit{view}), +\textit{add-to-cart}, and \textit{buy}, under a singular 'interaction' label, +or prioritized only the target behavior, often the \textit{buy} action, +discarding valuable auxiliary signals. Although recent advancements tried +addressing this simplification, they primarily gravitated towards optimizing +the target behavior alone, battling with data scarcity. Additionally, they +tended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these +gaps, we introduce the \textbf{H}ierarchical \textbf{M}ulti-behavior +\textbf{G}raph Attention \textbf{N}etwork (HMGN). This pioneering framework +leverages attention mechanisms to discern information from both inter and +intra-behaviors while employing a multi-task Hierarchical Bayesian Personalized +Ranking (HBPR) for optimization. Recognizing the need for scalability, our +approach integrates a specialized multi-behavior sub-graph sampling technique. +Moreover, the adaptability of HMGN allows for the seamless inclusion of +knowledge metadata and time-series data. Empirical results attest to our +model's prowess, registering a notable performance boost of up to 64\% in +NDCG@100 metrics over conventional graph neural network methods. + +
+
+
+
+
+
+
+
+ + Machine Learning 131 + +
+
+
+ + ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large
+language models (LLMs) via ImageBind. Existing works mainly focus on language
+and image instruction tuning; in contrast, our ImageBind-LLM can respond to
+multi-modality conditions, including audio, 3D point clouds, video, and their
+embedding-space arithmetic, using only image-text alignment training. During
+training, we adopt a learnable bind network to align the embedding space
+between LLaMA and ImageBind's image encoder. Then, the image features
+transformed by the bind network are added to word tokens of all layers in
+LLaMA, which progressively injects visual instructions via an attention-free
+and zero-initialized gating mechanism. Aided by the joint embedding of
+ImageBind, the simple image-text training enables our model to exhibit superior
+multi-modality instruction-following capabilities. During inference, the
+multi-modality inputs are fed into the corresponding ImageBind encoders, and
+processed by a proposed visual cache model for further cross-modal embedding
+enhancement. The training-free cache model retrieves from three million image
+features extracted by ImageBind, which effectively mitigates the
+training-inference modality discrepancy. Notably, with our approach,
+ImageBind-LLM can respond to instructions of diverse modalities and demonstrate
+strong language generation quality. Code is released at
+https://github.com/OpenGVLab/LLaMA-Adapter.
+
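+ A minimal, hypothetical sketch of the attention-free, zero-initialized gating
+described above: a projected image feature is added to the word tokens, scaled
+by a gate that starts at zero so training begins from the unmodified language
+model. Module and parameter names are illustrative.
+
+```python
+import torch
+import torch.nn as nn
+
+class ZeroInitGate(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.gate = nn.Parameter(torch.zeros(1))  # starts closed, learned during training
+        self.proj = nn.Linear(dim, dim)           # stand-in for the bind network's output head
+
+    def forward(self, word_tokens, image_feature):
+        # word_tokens: (batch, seq, dim); image_feature: (batch, dim)
+        return word_tokens + torch.tanh(self.gate) * self.proj(image_feature).unsqueeze(1)
+```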
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ☆ DiffusionEngine: Diffusion Model is Scalable Data Engine for Object + Detection + + +
+ Data is the cornerstone of deep learning. This paper reveals that the
+recently developed Diffusion Model is a scalable data engine for object
+detection. Existing methods for scaling up detection-oriented data often
+require manual collection or generative models to obtain target images,
+followed by data augmentation and labeling to produce training pairs, which are
+costly, complex, or lacking diversity. To address these issues, we
+present DiffusionEngine (DE), a data scaling-up engine that provides
+high-quality detection-oriented training pairs in a single stage. DE consists
+of a pre-trained diffusion model and an effective Detection-Adapter,
+contributing to generating scalable, diverse and generalizable detection data
+in a plug-and-play manner. Detection-Adapter is learned to align the implicit
+semantic and location knowledge in off-the-shelf diffusion models with
+detection-aware signals to make better bounding-box predictions. Additionally,
+we contribute two datasets, i.e., COCO-DE and VOC-DE, to scale up existing
+detection benchmarks for facilitating follow-up research. Extensive experiments
+demonstrate that data scaling-up via DE can achieve significant improvements in
+diverse scenarios, such as various detection algorithms, self-supervised
+pre-training, data-sparse, label-scarce, cross-domain, and semi-supervised
+learning. For example, when using DE with a DINO-based adapter to scale up
+data, mAP is improved by 3.1% on COCO, 7.6% on VOC, and 11.5% on Clipart.
+
+
+ comment: Code and Models are publicly available. Project Page: + https://mettyz.github.io/DiffusionEngine +
+
+
+
+
+ + ☆ ArtiGrasp: Physically Plausible Synthesis of Bi-Manual Dexterous + Grasping and Articulation + + +
+ We present ArtiGrasp, a novel method to synthesize bi-manual hand-object +interactions that include grasping and articulation. This task is challenging +due to the diversity of the global wrist motions and the precise finger control +that are necessary to articulate objects. ArtiGrasp leverages reinforcement +learning and physics simulations to train a policy that controls the global and +local hand pose. Our framework unifies grasping and articulation within a +single policy guided by a single hand pose reference. Moreover, to facilitate +the training of the precise finger control required for articulation, we +present a learning curriculum with increasing difficulty. It starts with +single-hand manipulation of stationary objects and continues with multi-agent +training including both hands and non-stationary objects. To evaluate our +method, we introduce Dynamic Object Grasping and Articulation, a task that +involves bringing an object into a target articulated pose. This task requires +grasping, relocation, and articulation. We show our method's efficacy towards +this task. We further demonstrate that our method can generate motions with +noisy hand-object pose estimates from an off-the-shelf image-based regressor. + +
+
+ comment: Project page: https://eth-ait.github.io/artigrasp/ +
+
+
+
+
+ + ☆ A Function Interpretation Benchmark for Evaluating Interpretability + Methods + + +
+ Labeling neural network submodules with human-legible descriptions is useful +for many downstream tasks: such descriptions can surface failures, guide +interventions, and perhaps even explain important model behaviors. To date, +most mechanistic descriptions of trained networks have involved small models, +narrowly delimited phenomena, and large amounts of human labor. Labeling all +human-interpretable sub-computations in models of increasing size and +complexity will almost certainly require tools that can generate and validate +descriptions automatically. Recently, techniques that use learned models +in-the-loop for labeling have begun to gain traction, but methods for +evaluating their efficacy are limited and ad-hoc. How should we validate and +compare open-ended labeling tools? This paper introduces FIND (Function +INterpretation and Description), a benchmark suite for evaluating the building +blocks of automated interpretability methods. FIND contains functions that +resemble components of trained neural networks, and accompanying descriptions +of the kind we seek to generate. The functions are procedurally constructed +across textual and numeric domains, and involve a range of real-world +complexities, including noise, composition, approximation, and bias. We +evaluate new and existing methods that use language models (LMs) to produce +code-based and language descriptions of function behavior. We find that an +off-the-shelf LM augmented with only black-box access to functions can +sometimes infer their structure, acting as a scientist by forming hypotheses, +proposing experiments, and updating descriptions in light of new data. However, +LM-based descriptions tend to capture global function behavior and miss local +corruptions. These results show that FIND will be useful for characterizing the +performance of more sophisticated interpretability methods before they are +applied to real-world models. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ☆ DoLa: Decoding by Contrasting Layers Improves Factuality in Large + Language Models + + +
+ Despite their impressive capabilities, large language models (LLMs) are prone
+to hallucinations, i.e., generating content that deviates from facts seen
+during pretraining. We propose a simple decoding strategy for reducing
+hallucinations with pretrained LLMs that does not require conditioning on
+retrieved external knowledge or additional fine-tuning. Our approach obtains
+the next-token distribution by contrasting the differences in logits obtained
+from projecting the later layers versus earlier layers to the vocabulary space,
+exploiting the fact that factual knowledge in an LLM has generally been shown
+to be localized to particular transformer layers. We find that this Decoding by
+Contrasting Layers (DoLa) approach is able to better surface factual knowledge
+and reduce the generation of incorrect facts. DoLa consistently improves
+truthfulness across multiple-choice tasks and open-ended generation tasks, for
+example improving the performance of LLaMA family models on TruthfulQA by
+12-17% absolute points, demonstrating its potential in making LLMs reliably
+generate truthful facts.
+
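+ A rough sketch of the layer-contrasting idea, assuming a model that exposes
+per-layer hidden states and a shared LM head; DoLa's dynamic selection of the
+contrasted layer is omitted here.
+
+```python
+import torch.nn.functional as F
+
+def contrastive_next_token_scores(hidden_states, lm_head, early_layer=8):
+    # hidden_states: sequence of (batch, seq, dim) tensors, one per layer.
+    logp_early = F.log_softmax(lm_head(hidden_states[early_layer][:, -1]), dim=-1)
+    logp_late = F.log_softmax(lm_head(hidden_states[-1][:, -1]), dim=-1)
+    return logp_late - logp_early  # scores favouring tokens that "mature" in later layers
+```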
+
+ comment: The source code is available at https://github.com/voidism/DoLa +
+
+
+
+
+ + ☆ Better Practices for Domain Adaptation + + +
+ Distribution shifts are all too common in real-world applications of machine +learning. Domain adaptation (DA) aims to address this by providing various +frameworks for adapting models to the deployment data without using labels. +However, the domain shift scenario raises a second more subtle challenge: the +difficulty of performing hyperparameter optimisation (HPO) for these adaptation +algorithms without access to a labelled validation set. The unclear validation +protocol for DA has led to bad practices in the literature, such as performing +HPO using the target test labels when, in real-world scenarios, they are not +available. This has resulted in over-optimism about DA research progress +compared to reality. In this paper, we analyse the state of DA when using good +evaluation practice, by benchmarking a suite of candidate validation criteria +and using them to assess popular adaptation algorithms. We show that there are +challenges across all three branches of domain adaptation methodology including +Unsupervised Domain Adaptation (UDA), Source-Free Domain Adaptation (SFDA), and +Test Time Adaptation (TTA). While the results show that realistically +achievable performance is often worse than expected, they also show that using +proper validation splits is beneficial, as well as showing that some previously +unexplored validation metrics provide the best options to date. Altogether, our +improved practices covering data, training, validation and hyperparameter +optimisation form a new rigorous pipeline to improve benchmarking, and hence +research progress, within this important field going forward. + +
+
+ comment: AutoML 2023 (Best paper award) +
+
+
+
+
+ + ☆ OpinionGPT: Modelling Explicit Biases in Instruction-Tuned LLMs + + +
+ Instruction-tuned Large Language Models (LLMs) have recently showcased +remarkable ability to generate fitting responses to natural language +instructions. However, an open research question concerns the inherent biases +of trained models and their responses. For instance, if the data used to tune +an LLM is dominantly written by persons with a specific political bias, we +might expect generated answers to share this bias. Current research work seeks +to de-bias such models, or suppress potentially biased answers. With this +demonstration, we take a different view on biases in instruction-tuning: Rather +than aiming to suppress them, we aim to make them explicit and transparent. To +this end, we present OpinionGPT, a web demo in which users can ask questions +and select all biases they wish to investigate. The demo will answer this +question using a model fine-tuned on text representing each of the selected +biases, allowing side-by-side comparison. To train the underlying model, we +identified 11 different biases (political, geographic, gender, age) and derived +an instruction-tuning corpus in which each answer was written by members of one +of these demographics. This paper presents OpinionGPT, illustrates how we +trained the bias-aware model and showcases the web application (available at +https://opiniongpt.informatik.hu-berlin.de). + +
+
+ comment: 6 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ A Tutorial on the Non-Asymptotic Theory of System Identification + + +
+ This tutorial serves as an introduction to recently developed non-asymptotic +methods in the theory of -- mainly linear -- system identification. We +emphasize tools we deem particularly useful for a range of problems in this +domain, such as the covering technique, the Hanson-Wright Inequality and the +method of self-normalized martingales. We then employ these tools to give +streamlined proofs of the performance of various least-squares based estimators +for identifying the parameters in autoregressive models. We conclude by +sketching out how the ideas presented herein can be extended to certain +nonlinear identification problems. + +
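+ For concreteness, the standard least-squares estimator that such analyses
+target, for a linear autoregression $x_{t+1} = A_\star x_t + w_t$, is
+$$\hat{A} = \arg\min_{A} \sum_{t=0}^{T-1} \|x_{t+1} - A x_t\|_2^2
+          = \Big(\sum_{t=0}^{T-1} x_{t+1} x_t^\top\Big)
+            \Big(\sum_{t=0}^{T-1} x_t x_t^\top\Big)^{-1},$$
+and the non-asymptotic question is how quickly $\hat{A}$ concentrates around
+$A_\star$ as the trajectory length $T$ grows; the specific bounds are those of
+the tutorial and are not reproduced here.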
+
+
+
+
+ + ☆ CenTime: Event-Conditional Modelling of Censoring in Survival Analysis + + +
+ Survival analysis is a valuable tool for estimating the time until specific +events, such as death or cancer recurrence, based on baseline observations. +This is particularly useful in healthcare to prognostically predict clinically +important events based on patient data. However, existing approaches often have +limitations; some focus only on ranking patients by survivability, neglecting +to estimate the actual event time, while others treat the problem as a +classification task, ignoring the inherent time-ordered structure of the +events. Furthermore, the effective utilization of censored samples - training +data points where the exact event time is unknown - is essential for improving +the predictive accuracy of the model. In this paper, we introduce CenTime, a +novel approach to survival analysis that directly estimates the time to event. +Our method features an innovative event-conditional censoring mechanism that +performs robustly even when uncensored data is scarce. We demonstrate that our +approach forms a consistent estimator for the event model parameters, even in +the absence of uncensored data. Furthermore, CenTime is easily integrated with +deep learning models with no restrictions on batch size or the number of +uncensored samples. We compare our approach with standard survival analysis +methods, including the Cox proportional-hazard model and DeepHit. Our results +indicate that CenTime offers state-of-the-art performance in predicting +time-to-death while maintaining comparable ranking performance. Our +implementation is publicly available at +https://github.com/ahmedhshahin/CenTime. + +
+
+
+
+
+ + ☆ Mixtures of Gaussians are Privately Learnable with a Polynomial Number + of Samples + + +
+ We study the problem of estimating mixtures of Gaussians under the constraint +of differential privacy (DP). Our main result is that $\tilde{O}(k^2 d^4 +\log(1/\delta) / \alpha^2 \varepsilon)$ samples are sufficient to estimate a +mixture of $k$ Gaussians up to total variation distance $\alpha$ while +satisfying $(\varepsilon, \delta)$-DP. This is the first finite sample +complexity upper bound for the problem that does not make any structural +assumptions on the GMMs. + To solve the problem, we devise a new framework which may be useful for other +tasks. On a high level, we show that if a class of distributions (such as +Gaussians) is (1) list decodable and (2) admits a "locally small'' cover +[BKSW19] with respect to total variation distance, then the class of its +mixtures is privately learnable. The proof circumvents a known barrier +indicating that, unlike Gaussians, GMMs do not admit a locally small cover +[AAL21]. + +
+
+
+
+
+ + ☆ Gradient-Based Feature Learning under Structured Data + + +
+ Recent works have demonstrated that the sample complexity of gradient-based +learning of single index models, i.e. functions that depend on a 1-dimensional +projection of the input data, is governed by their information exponent. +However, these results are only concerned with isotropic data, while in +practice the input often contains additional structure which can implicitly +guide the algorithm. In this work, we investigate the effect of a spiked +covariance structure and reveal several interesting phenomena. First, we show +that in the anisotropic setting, the commonly used spherical gradient dynamics +may fail to recover the true direction, even when the spike is perfectly +aligned with the target direction. Next, we show that appropriate weight +normalization that is reminiscent of batch normalization can alleviate this +issue. Further, by exploiting the alignment between the (spiked) input +covariance and the target, we obtain improved sample complexity compared to the +isotropic case. In particular, under the spiked model with a suitably large +spike, the sample complexity of gradient-based training can be made independent +of the information exponent while also outperforming lower bounds for +rotationally invariant kernel methods. + +
+
+
+
+
+ + ☆ Early warning via transitions in latent stochastic dynamical systems + + +
+ Early warnings for dynamical transitions in complex systems or +high-dimensional observation data are essential in many real world +applications, such as gene mutation, brain diseases, natural disasters, +financial crises, and engineering reliability. To effectively extract early +warning signals, we develop a novel approach: the directed anisotropic +diffusion map that captures the latent evolutionary dynamics in low-dimensional +manifold. Applying the methodology to authentic electroencephalogram (EEG) +data, we successfully find the appropriate effective coordinates, and derive +early warning signals capable of detecting the tipping point during the state +transition. Our method bridges the latent dynamics with the original dataset. +The framework is validated to be accurate and effective through numerical +experiments, in terms of density and transition probability. It is shown that +the second coordinate holds meaningful information for critical transition in +various evaluation metrics. + +
+
+
+
+
+ + ☆ Bootstrapping Adaptive Human-Machine Interfaces with Offline + Reinforcement Learning IROS + + +
+ Adaptive interfaces can help users perform sequential decision-making tasks +like robotic teleoperation given noisy, high-dimensional command signals (e.g., +from a brain-computer interface). Recent advances in human-in-the-loop machine +learning enable such systems to improve by interacting with users, but tend to +be limited by the amount of data that they can collect from individual users in +practice. In this paper, we propose a reinforcement learning algorithm to +address this by training an interface to map raw command signals to actions +using a combination of offline pre-training and online fine-tuning. To address +the challenges posed by noisy command signals and sparse rewards, we develop a +novel method for representing and inferring the user's long-term intent for a +given trajectory. We primarily evaluate our method's ability to assist users +who can only communicate through noisy, high-dimensional input channels through +a user study in which 12 participants performed a simulated navigation task by +using their eye gaze to modulate a 128-dimensional command signal from their +webcam. The results show that our method enables successful goal navigation +more often than a baseline directional interface, by learning to denoise user +commands signals and provide shared autonomy assistance. We further evaluate on +a simulated Sawyer pushing task with eye gaze control, and the Lunar Lander +game with simulated user commands, and find that our method improves over +baseline interfaces in these domains as well. Extensive ablation experiments +with simulated user commands empirically motivate each component of our method. + +
+
+ comment: Accepted to IEEE/RSJ International Conference on Intelligent Robots + and Systems (IROS) 2023 +
+
+
+
+
+ + ☆ Cross-Task Attention Network: Improving Multi-Task Learning for Medical + Imaging Applications + + +
+ Multi-task learning (MTL) is a powerful approach in deep learning that +leverages the information from multiple tasks during training to improve model +performance. In medical imaging, MTL has shown great potential to solve various +tasks. However, existing MTL architectures in medical imaging are limited in +sharing information across tasks, reducing the potential performance +improvements of MTL. In this study, we introduce a novel attention-based MTL +framework to better leverage inter-task interactions for various tasks from +pixel-level to image-level predictions. Specifically, we propose a Cross-Task +Attention Network (CTAN) which utilizes cross-task attention mechanisms to +incorporate information by interacting across tasks. We validated CTAN on four +medical imaging datasets that span different domains and tasks including: +radiation treatment planning prediction using planning CT images of two +different target cancers (Prostate, OpenKBP); pigmented skin lesion +segmentation and diagnosis using dermatoscopic images (HAM10000); and COVID-19 +diagnosis and severity prediction using chest CT scans (STOIC). Our study +demonstrates the effectiveness of CTAN in improving the accuracy of medical +imaging tasks. Compared to standard single-task learning (STL), CTAN +demonstrated a 4.67% improvement in performance and outperformed both widely +used MTL baselines: hard parameter sharing (HPS) with an average performance +improvement of 3.22%; and multi-task attention network (MTAN) with a relative +decrease of 5.38%. These findings highlight the significance of our proposed +MTL framework in solving medical imaging tasks and its potential to improve +their accuracy across domains. + +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Learning from Demonstration via Probabilistic Diagrammatic Teaching + + +
+ Learning from Demonstration (LfD) enables robots to acquire new skills by
+imitating expert demonstrations, allowing users to communicate their
+instructions in an intuitive manner. Recent progress in LfD often relies on
+kinesthetic teaching or teleoperation as the medium for users to specify the
+demonstrations. Kinesthetic teaching requires physical handling of the robot,
+while teleoperation demands proficiency with additional hardware. This paper
+introduces an alternative paradigm for LfD called Diagrammatic Teaching.
+Diagrammatic Teaching aims to teach robots novel skills by prompting the user
+to sketch out demonstration trajectories on 2D images of the scene; these are
+then synthesised as a generative model of motion trajectories in 3D task space.
+Additionally, we present the Ray-tracing Probabilistic Trajectory Learning
+(RPTL) framework for Diagrammatic Teaching. RPTL extracts time-varying
+probability densities from the 2D sketches, applies ray-tracing to find
+corresponding regions in 3D Cartesian space, and fits a probabilistic model of
+motion trajectories to these regions. New motion trajectories, which mimic
+those sketched by the user, can then be generated from the probabilistic model.
+We empirically validate our framework both in simulation and on real robots,
+which include a fixed-base manipulator and a quadruped-mounted manipulator.
+
+
+
+
+
+ + ☆ Uncovering Drift in Textual Data: An Unsupervised Method for Detecting + and Mitigating Drift in Machine Learning Models + + +
+ Drift in machine learning refers to the phenomenon where the statistical +properties of data or context, in which the model operates, change over time +leading to a decrease in its performance. Therefore, maintaining a constant +monitoring process for machine learning model performance is crucial in order +to proactively prevent any potential performance regression. However, +supervised drift detection methods require human annotation and consequently +lead to a longer time to detect and mitigate the drift. In our proposed +unsupervised drift detection method, we follow a two step process. Our first +step involves encoding a sample of production data as the target distribution, +and the model training data as the reference distribution. In the second step, +we employ a kernel-based statistical test that utilizes the maximum mean +discrepancy (MMD) distance metric to compare the reference and target +distributions and estimate any potential drift. Our method also identifies the +subset of production data that is the root cause of the drift. The models +retrained using these identified high drift samples show improved performance +on online customer experience quality metrics. + +
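+ A self-contained sketch of the kernel statistic underlying such a test: a
+(biased) squared MMD estimate between reference and target samples with an RBF
+kernel. The drift decision additionally needs a threshold, e.g., from a
+permutation test, which is omitted here.
+
+```python
+import numpy as np
+
+def rbf_mmd2(x, y, sigma=1.0):
+    # x: (n, d) reference sample, y: (m, d) target sample.
+    def k(a, b):
+        d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
+        return np.exp(-d2 / (2.0 * sigma ** 2))
+    return k(x, x).mean() + k(y, y).mean() - 2.0 * k(x, y).mean()
+```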
+
+ comment: 8 pages, Accepted in 2023 Amazon Internal Machine Learning Conference +
+
+
+
+
+ + ☆ ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation SC + + +
+ High Dynamic Range (HDR) content creation has become an important topic for +modern media and entertainment sectors, gaming and Augmented/Virtual Reality +industries. Many methods have been proposed to recreate the HDR counterparts of +input Low Dynamic Range (LDR) images/videos given a single exposure or +multi-exposure LDRs. The state-of-the-art methods focus primarily on the +preservation of the reconstruction's structural similarity and the pixel-wise +accuracy. However, these conventional approaches do not emphasize preserving +the artistic intent of the images in terms of human visual perception, which is +an essential element in media, entertainment and gaming. In this paper, we +attempt to study and fill this gap. We propose an architecture called +ArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR +features as input. Experimental results show that ArtHDR-Net can achieve +state-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean +opinion score index) while reaching competitive performance in terms of PSNR +and SSIM. + +
+
+ comment: Accepted in Asia Pacific Signal and Information Processing + Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan +
+
+
+
+
+ + ☆ Prime and Modulate Learning: Generation of forward models with signed + back-propagation and environmental cues + + +
+ Deep neural networks employing error back-propagation for learning can suffer
+from exploding and vanishing gradient problems. Numerous solutions have been
+proposed such as normalisation techniques or limiting activation functions to
+linear rectifying units. In this work we follow a different approach which is
+particularly applicable to closed-loop learning of forward models where
+back-propagation makes exclusive use of the sign of the error signal to prime
+the learning, whilst a global relevance signal modulates the rate of learning.
+This is inspired by the interaction between local plasticity and global
+neuromodulation. For example, whilst driving on an empty road, one can allow
+for slow step-wise optimisation of actions, whereas, at a busy junction, an
+error must be corrected at once. Hence, the error is the priming signal and the
+intensity of the experience is a modulating factor in the weight change. The
+advantages of this Prime and Modulate paradigm are twofold: it is free from
+normalisation and it makes use of relevant cues from the environment to enrich
+the learning. We present a mathematical derivation of the learning rule in
+z-space and demonstrate the real-time performance with a robotic platform. The
+results show a significant improvement in the speed of convergence compared to
+that of conventional back-propagation.
+
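+ A toy sketch of the kind of update the description suggests, for a single
+linear unit: the sign of the error primes the direction of the change, while a
+global relevance signal modulates its size. The paper's actual rule is derived
+in z-space and is not reproduced here.
+
+```python
+import numpy as np
+
+def prime_and_modulate_step(w, x, error, relevance, lr=0.01):
+    # error: local error signal (only its sign is used);
+    # relevance: global modulating cue from the environment.
+    return w + lr * relevance * np.sign(error) * x
+```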
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Training Acceleration of Low-Rank Decomposed Networks using Sequential + Freezing and Rank Quantization + + +
+ Low Rank Decomposition (LRD) is a model compression technique applied to the
+weight tensors of deep learning models in order to reduce the number of
+trainable parameters and computational complexity. However, due to the high
+number of new layers added to the architecture after applying LRD, it may not
+lead to high training/inference acceleration if the decomposition ranks are not
+small enough. The issue is that using small ranks increases the risk of a
+significant accuracy drop after decomposition. In this paper, we propose two
+techniques for accelerating low rank decomposed models without requiring small
+decomposition ranks. These methods include rank optimization and sequential
+freezing of decomposed layers. We perform experiments on both convolutional and
+transformer-based models. Experiments show that these techniques can improve
+model throughput by up to 60% during training and 37% during inference when
+combined together, while preserving accuracy close to that of the original
+models.
+
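+ For reference, a minimal sketch of decomposing one dense layer into two
+low-rank factors via truncated SVD; the rank optimization and sequential
+freezing proposed above operate on layers produced this way, and the helper
+below is illustrative rather than the paper's implementation.
+
+```python
+import torch
+import torch.nn as nn
+
+def decompose_linear(layer: nn.Linear, rank: int) -> nn.Sequential:
+    # Truncated SVD of the (out, in) weight, split into two thinner linear layers.
+    U, S, Vh = torch.linalg.svd(layer.weight.data, full_matrices=False)
+    first = nn.Linear(layer.in_features, rank, bias=False)
+    second = nn.Linear(rank, layer.out_features, bias=layer.bias is not None)
+    first.weight.data = torch.diag(S[:rank].sqrt()) @ Vh[:rank]
+    second.weight.data = U[:, :rank] @ torch.diag(S[:rank].sqrt())
+    if layer.bias is not None:
+        second.bias.data = layer.bias.data.clone()
+    return nn.Sequential(first, second)
+```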
+
+
+
+
+ + ☆ Empirical Risk Minimization for Losses without Variance + + +
+ This paper considers an empirical risk minimization problem under +heavy-tailed settings, where data does not have finite variance, but only has +$p$-th moment with $p \in (1,2)$. Instead of using estimation procedure based +on truncated observed data, we choose the optimizer by minimizing the risk +value. Those risk values can be robustly estimated via using the remarkable +Catoni's method (Catoni, 2012). Thanks to the structure of Catoni-type +influence functions, we are able to establish excess risk upper bounds via +using generalized generic chaining methods. Moreover, we take computational +issues into consideration. We especially theoretically investigate two types of +optimization methods, robust gradient descent algorithm and empirical +risk-based methods. With an extensive numerical study, we find that the +optimizer based on empirical risks via Catoni-style estimation indeed shows +better performance than other baselines. It indicates that estimation directly +based on truncated data may lead to unsatisfactory results. + +
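+ A small sketch of the Catoni-style mean estimate referred to above, found by
+bisection on the monotone root equation; constants and the exact influence
+function used in the paper may differ.
+
+```python
+import numpy as np
+
+def catoni_mean(x, alpha=0.1, iters=60):
+    # Solve sum_i psi(alpha * (x_i - theta)) = 0 for theta.
+    def psi(u):
+        return np.sign(u) * np.log1p(np.abs(u) + 0.5 * u ** 2)
+    lo, hi = float(x.min()), float(x.max())
+    for _ in range(iters):
+        mid = 0.5 * (lo + hi)
+        if psi(alpha * (x - mid)).sum() > 0:
+            lo = mid  # root lies above mid (the sum decreases in theta)
+        else:
+            hi = mid
+    return 0.5 * (lo + hi)
+```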
+
+
+
+
+ + ☆ AnthroNet: Conditional Generation of Humans via Anthropometrics + + +
+ We present a novel human body model formulated by an extensive set of +anthropocentric measurements, which is capable of generating a wide range of +human body shapes and poses. The proposed model enables direct modeling of +specific human identities through a deep generative architecture, which can +produce humans in any arbitrary pose. It is the first of its kind to have been +trained end-to-end using only synthetically generated data, which not only +provides highly accurate human mesh representations but also allows for precise +anthropometry of the body. Moreover, using a highly diverse animation library, +we articulated our synthetic humans' body and hands to maximize the diversity +of the learnable priors for model training. Our model was trained on a dataset +of $100k$ procedurally-generated posed human meshes and their corresponding +anthropometric measurements. Our synthetic data generator can be used to +generate millions of unique human identities and poses for non-commercial +academic research purposes. + +
+
+ comment: AnthroNet's Unity data generator source code is available at: + https://unity-technologies.github.io/AnthroNet/ +
+
+
+
+
+ + ☆ Improved theoretical guarantee for rank aggregation via spectral method + + +
+ Given pairwise comparisons between multiple items, how to rank them so that +the ranking matches the observations? This problem, known as rank aggregation, +has found many applications in sports, recommendation systems, and other web +applications. As it is generally NP-hard to find a global ranking that +minimizes the mismatch (known as the Kemeny optimization), we focus on the +Erd\"os-R\'enyi outliers (ERO) model for this ranking problem. Here, each +pairwise comparison is a corrupted copy of the true score difference. We +investigate spectral ranking algorithms that are based on unnormalized and +normalized data matrices. The key is to understand their performance in +recovering the underlying scores of each item from the observed data. This +reduces to deriving an entry-wise perturbation error bound between the top +eigenvectors of the unnormalized/normalized data matrix and its population +counterpart. By using the leave-one-out technique, we provide a sharper +$\ell_{\infty}$-norm perturbation bound of the eigenvectors and also derive an +error bound on the maximum displacement for each item, with only $\Omega(n\log +n)$ samples. Our theoretical analysis improves upon the state-of-the-art +results in terms of sample complexity, and our numerical experiments confirm +these theoretical findings. + +
+
+ comment: 29 pages, 6 figures +
+
+
+
+
+ + ☆ Pareto Frontiers in Neural Feature Learning: Data, Compute, Width, and + Luck + + +
+ This work investigates the nuanced algorithm design choices for deep learning +in the presence of computational-statistical gaps. We begin by considering +offline sparse parity learning, a supervised classification problem which +admits a statistical query lower bound for gradient-based training of a +multilayer perceptron. This lower bound can be interpreted as a multi-resource +tradeoff frontier: successful learning can only occur if one is sufficiently +rich (large model), knowledgeable (large dataset), patient (many training +iterations), or lucky (many random guesses). We show, theoretically and +experimentally, that sparse initialization and increasing network width yield +significant improvements in sample efficiency in this setting. Here, width +plays the role of parallel search: it amplifies the probability of finding +"lottery ticket" neurons, which learn sparse features more sample-efficiently. +Finally, we show that the synthetic sparse parity task can be useful as a proxy +for real problems requiring axis-aligned feature learning. We demonstrate +improved sample efficiency on tabular classification benchmarks by using wide, +sparsely-initialized MLP models; these networks sometimes outperform tuned +random forests. + +
+
+
+
+
+ + ☆ Conformal Autoregressive Generation: Beam Search with Coverage + Guarantees + + +
+ We introduce two new extensions to the beam search algorithm based on +conformal predictions (CP) to produce sets of sequences with theoretical +coverage guarantees. The first method is very simple and proposes +dynamically-sized subsets of beam search results but, unlike typical CP +procedures, has an upper bound on the achievable guarantee depending on a +post-hoc calibration measure. Our second algorithm introduces the conformal set +prediction procedure as part of the decoding process, producing a variable beam +width which adapts to the current uncertainty. While more complex, this +procedure can achieve coverage guarantees selected a priori. We provide +marginal coverage bounds for each method, and evaluate them empirically on a +selection of tasks drawing from natural language processing and chemistry. + +
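+ A bare-bones sketch of the split-conformal calibration step both extensions
+build on: a quantile of calibration nonconformity scores that yields marginal
+(1 - alpha) coverage. How scores are derived from beam hypotheses is
+method-specific and omitted; `method="higher"` assumes NumPy >= 1.22.
+
+```python
+import numpy as np
+
+def conformal_threshold(cal_scores, alpha=0.1):
+    # Finite-sample-corrected quantile used to decide which hypotheses enter the set.
+    n = len(cal_scores)
+    q = np.ceil((n + 1) * (1 - alpha)) / n
+    return np.quantile(cal_scores, min(q, 1.0), method="higher")
+```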
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ☆ Adversarially Robust Deep Learning with Optimal-Transport-Regularized + Divergences + + +
+ We introduce the $ARMOR_D$ methods as novel approaches to enhancing the +adversarial robustness of deep learning models. These methods are based on a +new class of optimal-transport-regularized divergences, constructed via an +infimal convolution between an information divergence and an optimal-transport +(OT) cost. We use these as tools to enhance adversarial robustness by +maximizing the expected loss over a neighborhood of distributions, a technique +known as distributionally robust optimization. Viewed as a tool for +constructing adversarial samples, our method allows samples to be both +transported, according to the OT cost, and re-weighted, according to the +information divergence. We demonstrate the effectiveness of our method on +malware detection and image recognition applications and find that, to our +knowledge, it outperforms existing methods at enhancing the robustness against +adversarial attacks. $ARMOR_D$ yields the robustified accuracy of $98.29\%$ +against $FGSM$ and $98.18\%$ against $PGD^{40}$ on the MNIST dataset, reducing +the error rate by more than $19.7\%$ and $37.2\%$ respectively compared to +prior methods. Similarly, in malware detection, a discrete (binary) data +domain, $ARMOR_D$ improves the robustified accuracy under $rFGSM^{50}$ attack +compared to the previous best-performing adversarial training methods by +$37.0\%$ while lowering false negative and false positive rates by $51.1\%$ and +$57.53\%$, respectively. + +
+
+ comment: 30 pages, 5 figures +
+
+
+
+
+ + ☆ CPU frequency scheduling of real-time applications on embedded devices + with temporal encoding-based deep reinforcement learning + + +
+ Small devices are frequently used in IoT and smart-city applications to
+perform periodic dedicated tasks with soft deadlines. This work focuses on
+developing efficient power-management methods for periodic tasks on small
+devices. We first study the limitations of the existing Linux built-in methods
+used in small devices. We illustrate three typical workload/system patterns
+that are challenging to manage with Linux's built-in solutions. We develop a
+reinforcement-learning-based technique with temporal encoding to derive an
+effective DVFS governor even in the presence of the three system patterns. The
+derived governor uses only one performance counter, the same as the built-in
+Linux mechanism, and does not require an explicit task model for the workload.
+We implemented a prototype system on the Nvidia Jetson Nano Board and evaluated
+it with six applications, including two self-designed and four benchmark
+applications. Under different deadline constraints, our approach can quickly
+derive a DVFS governor that can adapt to performance requirements and
+outperform the built-in Linux approach in energy saving. On Mibench workloads,
+with performance slack ranging from 0.04 s to 0.4 s, the proposed method can
+save 3% - 11% more energy compared to Ondemand. The AudioReg and FaceReg
+applications show 5% - 14% energy-saving improvements. We have open-sourced the
+implementation of our in-kernel quantized neural network engine. The codebase
+can be found at: https://github.com/coladog/tinyagent.
+
+
+ comment: Accepted to Journal of Systems Architecture +
+
+
+
+
+ + ☆ Deep Learning Safety Concerns in Automated Driving Perception + + +
+ Recent advances in the field of deep learning and impressive performance of +deep neural networks (DNNs) for perception have resulted in an increased demand +for their use in automated driving (AD) systems. The safety of such systems is +of utmost importance and thus requires to consider the unique properties of +DNNs. + In order to achieve safety of AD systems with DNN-based perception components +in a systematic and comprehensive approach, so-called safety concerns have been +introduced as a suitable structuring element. On the one hand, the concept of +safety concerns is -- by design -- well aligned to existing standards relevant +for safety of AD systems such as ISO 21448 (SOTIF). On the other hand, it has +already inspired several academic publications and upcoming standards on AI +safety such as ISO PAS 8800. + While the concept of safety concerns has been previously introduced, this +paper extends and refines it, leveraging feedback from various domain and +safety experts in the field. In particular, this paper introduces an additional +categorization for a better understanding as well as enabling cross-functional +teams to jointly address the concerns. + +
+
+
+
+
+ + ☆ Neural lasso: a unifying approach of lasso and neural networks + + +
+ In recent years, there is a growing interest in combining techniques +attributed to the areas of Statistics and Machine Learning in order to obtain +the benefits of both approaches. In this article, the statistical technique +lasso for variable selection is represented through a neural network. It is +observed that, although both the statistical approach and its neural version +have the same objective function, they differ due to their optimization. In +particular, the neural version is usually optimized in one-step using a single +validation set, while the statistical counterpart uses a two-step optimization +based on cross-validation. The more elaborated optimization of the statistical +method results in more accurate parameter estimation, especially when the +training set is small. For this reason, a modification of the standard approach +for training neural networks, that mimics the statistical framework, is +proposed. During the development of the above modification, a new optimization +algorithm for identifying the significant variables emerged. Experimental +results, using synthetic and real data sets, show that this new optimization +algorithm achieves better performance than any of the three previous +optimization approaches. + +
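+ For orientation, a compact sketch of the shared objective under discussion,
+(1/2n)||y - Xb||^2 + lam * ||b||_1, solved here with plain proximal gradient
+(ISTA); the paper's contribution concerns how the penalty weight is tuned, not
+this solver.
+
+```python
+import numpy as np
+
+def lasso_ista(X, y, lam=0.1, iters=500):
+    n, p = X.shape
+    lr = n / np.linalg.norm(X, 2) ** 2  # safe step size from the Lipschitz constant
+    b = np.zeros(p)
+    for _ in range(iters):
+        grad = X.T @ (X @ b - y) / n
+        z = b - lr * grad
+        b = np.sign(z) * np.maximum(np.abs(z) - lr * lam, 0.0)  # soft-thresholding
+    return b
+```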
+
+
+
+
+ + ☆ M(otion)-mode Based Prediction of Ejection Fraction using + Echocardiograms + + +
+ Early detection of cardiac dysfunction through routine screening is vital for +diagnosing cardiovascular diseases. An important metric of cardiac function is +the left ventricular ejection fraction (EF), where lower EF is associated with +cardiomyopathy. Echocardiography is a popular diagnostic tool in cardiology, +with ultrasound being a low-cost, real-time, and non-ionizing technology. +However, human assessment of echocardiograms for calculating EF is +time-consuming and expertise-demanding, raising the need for an automated +approach. In this work, we propose using the M(otion)-mode of echocardiograms +for estimating the EF and classifying cardiomyopathy. We generate multiple +artificial M-mode images from a single echocardiogram and combine them using +off-the-shelf model architectures. Additionally, we extend contrastive learning +(CL) to cardiac imaging to learn meaningful representations from exploiting +structures in unlabeled data allowing the model to achieve high accuracy, even +with limited annotations. Our experiments show that the supervised setting +converges with only ten modes and is comparable to the baseline method while +bypassing its cumbersome training process and being computationally much more +efficient. Furthermore, CL using M-mode images is helpful for limited data +scenarios, such as having labels for only 200 patients, which is common in +medical applications. + +
+
+ comment: Accepted at GCPR 2023 +
+
+
+
+
+ + ☆ TSGBench: Time Series Generation Benchmark + + +
+ Synthetic Time Series Generation (TSG) is crucial in a range of applications, +including data augmentation, anomaly detection, and privacy preservation. +Although significant strides have been made in this field, existing methods +exhibit three key limitations: (1) They often benchmark against similar model +types, constraining a holistic view of performance capabilities. (2) The use of +specialized synthetic and private datasets introduces biases and hampers +generalizability. (3) Ambiguous evaluation measures, often tied to custom +networks or downstream tasks, hinder consistent and fair comparison. + To overcome these limitations, we introduce \textsf{TSGBench}, the inaugural +TSG Benchmark, designed for a unified and comprehensive assessment of TSG +methods. It comprises three modules: (1) a curated collection of publicly +available, real-world datasets tailored for TSG, together with a standardized +preprocessing pipeline; (2) a comprehensive evaluation measures suite including +vanilla measures, new distance-based assessments, and visualization tools; (3) +a pioneering generalization test rooted in Domain Adaptation (DA), compatible +with all methods. We have conducted extensive experiments across ten real-world +datasets from diverse domains, utilizing ten advanced TSG methods and twelve +evaluation measures, all gauged through \textsf{TSGBench}. The results +highlight its remarkable efficacy and consistency. More importantly, +\textsf{TSGBench} delivers a statistical breakdown of method rankings, +illuminating performance variations across different datasets and measures, and +offering nuanced insights into the effectiveness of each method. + +
+
+ comment: 14 pages, 8 figures, and 4 tables +
+
+
+
+
+ + ☆ Convergence Analysis of Decentralized ASGD + + +
+ Over the last decades, Stochastic Gradient Descent (SGD) has been intensively +studied by the Machine Learning community. Despite its versatility and +excellent performance, the optimization of large models via SGD still is a +time-consuming task. To reduce training time, it is common to distribute the +training process across multiple devices. Recently, it has been shown that the +convergence of asynchronous SGD (ASGD) will always be faster than mini-batch +SGD. However, despite these improvements in the theoretical bounds, most ASGD +convergence-rate proofs still rely on a centralized parameter server, which is +prone to become a bottleneck when scaling out the gradient computations across +many distributed processes. + In this paper, we present a novel convergence-rate analysis for decentralized +and asynchronous SGD (DASGD) which does not require partial synchronization +among nodes nor restrictive network topologies. Specifically, we provide a +bound of $\mathcal{O}(\sigma\epsilon^{-2}) + +\mathcal{O}(QS_{avg}\epsilon^{-3/2}) + \mathcal{O}(S_{avg}\epsilon^{-1})$ for +the convergence rate of DASGD, where $S_{avg}$ is the average staleness between +models, $Q$ is a constant that bounds the norm of the gradients, and $\epsilon$ +is a (small) error that is allowed within the bound. Furthermore, when +gradients are not bounded, we prove the convergence rate of DASGD to be +$\mathcal{O}(\sigma\epsilon^{-2}) + +\mathcal{O}(\sqrt{\hat{S}_{avg}\hat{S}_{max}}\epsilon^{-1})$, with +$\hat{S}_{max}$ and $\hat{S}_{avg}$ representing a loose version of the average +and maximum staleness, respectively. Our convergence proof holds for a fixed +stepsize and any non-convex, homogeneous, and L-smooth objective function. We +anticipate that our results will be of high relevance for the adoption of DASGD +by a broad community of researchers and developers. + +
+
+
+
+
+ + ☆ Medoid Silhouette clustering with automatic cluster number selection + + +
+ The evaluation of clustering results is difficult, highly dependent on the +evaluated data set and the perspective of the beholder. There are many +different clustering quality measures, which try to provide a general measure +to validate clustering results. A very popular measure is the Silhouette. We +discuss the efficient medoid-based variant of the Silhouette, perform a +theoretical analysis of its properties, provide two fast versions for the +direct optimization, and discuss the use to choose the optimal number of +clusters. We combine ideas from the original Silhouette with the well-known PAM +algorithm and its latest improvements FasterPAM. One of the versions guarantees +equal results to the original variant and provides a run speedup of $O(k^2)$. +In experiments on real data with 30000 samples and $k$=100, we observed a +10464$\times$ speedup compared to the original PAMMEDSIL algorithm. +Additionally, we provide a variant to choose the optimal number of clusters +directly. + +
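+ One common formulation of the medoid-based Silhouette, sketched below for a
+precomputed distance matrix; the paper's exact variant and its fast update
+rules are not reproduced here.
+
+```python
+import numpy as np
+
+def medoid_silhouette(dist, medoids, labels):
+    # dist: (n, n) pairwise distances; medoids: indices of the k medoids;
+    # labels[i]: position in `medoids` of point i's cluster.
+    idx = np.arange(len(labels))
+    d = dist[:, medoids]                   # (n, k) distances to all medoids
+    a = d[idx, labels]                     # distance to own medoid
+    d_other = d.copy()
+    d_other[idx, labels] = np.inf
+    b = d_other.min(axis=1)                # distance to nearest other medoid
+    # With nearest-medoid assignment this equals the 1 - d1/d2 form.
+    return float(np.mean((b - a) / np.maximum(a, b)))
+```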
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2209.12553 +
+
+
+
+
+ + ☆ Enhancing Pipeline-Based Conversational Agents with Large Language + Models + + +
+ The latest advancements in AI and deep learning have led to a breakthrough in
+large language model (LLM)-based agents such as GPT-4. However, many commercial
+conversational agent development tools are pipeline-based and have limitations
+in holding a human-like conversation. This paper investigates the capabilities
+of LLMs to enhance pipeline-based conversational agents during two phases: 1)
+in the design and development phase and 2) during operations. In 1) LLMs can
+aid in generating training data, extracting entities and synonyms,
+localization, and persona design. In 2) LLMs can assist in contextualization,
+intent classification to prevent conversational breakdown and handle
+out-of-scope questions, auto-correcting utterances, rephrasing responses,
+formulating disambiguation questions, summarization, and enabling closed
+question-answering capabilities. We conducted informal experiments with GPT-4
+in the private banking domain to demonstrate the scenarios above with a
+practical example. Companies may be hesitant to replace their pipeline-based
+agents with LLMs entirely due to privacy concerns and the need for deep
+integration within their existing ecosystems. A hybrid approach in which LLMs
+are integrated into the pipeline-based agents allows them to save time and
+costs of building and running agents by capitalizing on the capabilities of
+LLMs while retaining the integration and privacy safeguards of their existing
+systems.
+
+
+
+
+
+ + ☆ Learning continuous-valued treatment effects through representation + balancing + + +
+ Estimating the effects of treatments with an associated dose on an instance's +outcome, the "dose response", is relevant in a variety of domains, from +healthcare to business, economics, and beyond. Such effects, also known as +continuous-valued treatment effects, are typically estimated from observational +data, which may be subject to dose selection bias. This means that the +allocation of doses depends on pre-treatment covariates. Previous studies have +shown that conventional machine learning approaches fail to learn accurate +individual estimates of dose responses under the presence of dose selection +bias. In this work, we propose CBRNet, a causal machine learning approach to +estimate an individual dose response from observational data. CBRNet adopts the +Neyman-Rubin potential outcome framework and extends the concept of balanced +representation learning for overcoming selection bias to continuous-valued +treatments. Our work is the first to apply representation balancing in a +continuous-valued treatment setting. We evaluate our method on a newly proposed +benchmark. Our experiments demonstrate CBRNet's ability to accurately learn +treatment effects under selection bias and competitive performance with respect +to other state-of-the-art methods. + +
+
+ comment: 24 pages, 8 figures +
+
+
+
+
+ + ☆ A Causal Perspective on Loan Pricing: Investigating the Impacts of + Selection Bias on Identifying Bid-Response Functions + + +
+ In lending, where prices are specific to both customers and products, having
+a well-functioning personalized pricing policy in place is essential to
+effective business making. Typically, such a policy must be derived from
+observational data, which introduces several challenges. While the problem of
+``endogeneity'' is prominently studied in the established pricing literature,
+the problem of selection bias (or, more precisely, bid selection bias) is not.
+We take a step towards understanding the effects of selection bias by posing
+pricing as a problem of causal inference. Specifically, we consider the
+reaction of a customer to price as a treatment effect. In our experiments, we
+simulate varying levels of selection bias on a semi-synthetic dataset on
+mortgage loan applications in Belgium. We investigate the potential of
+parametric and nonparametric methods for the identification of individual
+bid-response functions. Our results illustrate how conventional methods such as
+logistic regression and neural networks suffer adversely from selection bias.
+In contrast, we implement state-of-the-art methods from causal machine learning
+and show their capability to overcome selection bias in pricing data.
+
+
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ☆ A Natural Gas Consumption Forecasting System for Continual Learning + Scenarios based on Hoeffding Trees with Change Point Detection Mechanism + + +
+ Forecasting natural gas consumption, considering seasonality and trends, is
+crucial in planning its supply and consumption and optimizing the cost of
+obtaining it, mainly by industrial entities. However, in times of threats to
+its supply, it is also a critical element that guarantees the supply of this
+raw material to meet individual consumers' needs, ensuring society's energy
+security. This article introduces a novel multistep-ahead forecasting approach
+for natural gas consumption that integrates change point detection for model
+collection selection and provides continual learning capabilities through data
+stream processing. The performance of the forecasting models based on the
+proposed approach is evaluated in a complex real-world use case of natural gas
+consumption forecasting. We employed Hoeffding tree predictors as forecasting
+models and the Pruned Exact Linear Time (PELT) algorithm for the change point
+detection procedure. The change point detection integration enables selecting a
+different model collection for successive time frames. Thus, three model
+collection selection procedures (with and without an error feedback loop) are
+defined and evaluated for forecasting scenarios with various densities of
+detected change points. These models were compared with change-point-agnostic
+baseline approaches. Our experiments show that fewer change points result in a
+lower forecasting error regardless of the model collection selection procedure
+employed. Also, simpler model collection selection procedures omitting
+forecasting error feedback lead to more robust forecasting models suitable for
+continual learning tasks.
+
+
+
+
+
+
+ + ☆ A State Representation for Diminishing Rewards + + +
+ A common setting in multitask reinforcement learning (RL) demands that an +agent rapidly adapt to various stationary reward functions randomly sampled +from a fixed distribution. In such situations, the successor representation +(SR) is a popular framework which supports rapid policy evaluation by +decoupling a policy's expected discounted, cumulative state occupancies from a +specific reward function. However, in the natural world, sequential tasks are +rarely independent, and instead reflect shifting priorities based on the +availability and subjective perception of rewarding stimuli. Reflecting this +disjunction, in this paper we study the phenomenon of diminishing marginal +utility and introduce a novel state representation, the $\lambda$ +representation ($\lambda$R) which, surprisingly, is required for policy +evaluation in this setting and which generalizes the SR as well as several +other state representations from the literature. We establish the $\lambda$R's +formal properties and examine its normative advantages in the context of +machine learning, as well as its usefulness for studying natural behaviors, +particularly foraging. + +
+
+
+
+
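+ For context, the successor representation (SR) that the $\lambda$R is said to
+generalize is standardly defined as
+ $$ M^\pi(s, s') = \mathbb{E}_\pi\Big[\sum_{t=0}^{\infty} \gamma^t \,
+ \mathbb{1}\{S_t = s'\} \,\Big|\, S_0 = s\Big], \qquad
+ V^\pi(s) = \sum_{s'} M^\pi(s, s')\, r(s'), $$
+which decouples discounted state occupancies from a state-dependent reward $r$;
+the precise form of the $\lambda$R under diminishing marginal utility is
+specific to the paper and not reproduced here.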
+ + ☆ Chat Failures and Troubles: Reasons and Solutions + + +
+ This paper examines some common problems in Human-Robot Interaction (HRI)
+causing failures and troubles in chat. The design decisions for a given use
+case start with selecting a suitable robot and a suitable chatting model,
+identifying common problems that cause failures, identifying potential
+solutions, and planning continuous improvement. In conclusion, it is
+recommended to use a closed-loop control algorithm that guides the use of
+pre-trained Artificial Intelligence (AI) models and provides vocabulary
+filtering, re-trains batched models on new datasets, learns online from data
+streams, and/or uses reinforcement learning to self-update the trained models
+and reduce errors.
+
+
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ A Probabilistic Semi-Supervised Approach with Triplet Markov Chains SP 2023 + + +
+ Triplet Markov chains are general generative models for sequential data which +take into account three kinds of random variables: (noisy) observations, their +associated discrete labels and latent variables which aim at strengthening the +distribution of the observations and their associated labels. However, in +practice, we do not have at our disposal all the labels associated to the +observations to estimate the parameters of such models. In this paper, we +propose a general framework based on a variational Bayesian inference to train +parameterized triplet Markov chain models in a semi-supervised context. The +generality of our approach enables us to derive semi-supervised algorithms for +a variety of generative models for sequential Bayesian classification. + +
+
+ comment: Preprint submitted to IEEE MLSP 2023 +
+
+
+
+
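+ As background, the defining assumption of a triplet Markov chain is that the
+triplet of label, auxiliary latent variable and observation,
+$t_n = (x_n, u_n, y_n)$, is jointly Markovian,
+ $$ p(t_{1:N}) = p(t_1) \prod_{n=2}^{N} p(t_n \mid t_{n-1}); $$
+the particular parameterization and the variational semi-supervised training
+objective are given in the paper.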
+ + ☆ DiffDefense: Defending against Adversarial Attacks via Diffusion Models + + +
+ This paper presents a novel reconstruction method that leverages Diffusion +Models to protect machine learning classifiers against adversarial attacks, all +without requiring any modifications to the classifiers themselves. The +susceptibility of machine learning models to minor input perturbations renders +them vulnerable to adversarial attacks. While diffusion-based methods are +typically disregarded for adversarial defense due to their slow reverse +process, this paper demonstrates that our proposed method offers robustness +against adversarial threats while preserving clean accuracy, speed, and +plug-and-play compatibility. Code at: +https://github.com/HondamunigePrasannaSilva/DiffDefence. + +
+
+ comment: Paper published at ICIAP23 +
+
+
+
+
+ + ☆ Short-Term Load Forecasting Using A Particle-Swarm Optimized Multi-Head + Attention-Augmented CNN-LSTM Network + + +
+ Short-term load forecasting is of paramount importance in the efficient
+operation and planning of power systems, given its inherent non-linear and
+dynamic nature. Recent strides in deep learning have shown promise in
+addressing this challenge. However, these methods often grapple with
+hyperparameter sensitivity, opaqueness in interpretability, and high
+computational overhead for real-time deployment. In this paper, we propose a
+novel solution that surmounts these obstacles. Our approach harnesses the power
+of the Particle-Swarm Optimization algorithm to autonomously explore and
+optimize hyperparameters, a Multi-Head Attention mechanism to discern the
+salient features crucial for accurate forecasting, and a streamlined framework
+for computational efficiency. Our method undergoes rigorous evaluation using a
+genuine electricity demand dataset. The results underscore its superiority in
+terms of accuracy, robustness, and computational efficiency. Notably, our Mean
+Absolute Percentage Error of 1.9376 marks a significant advancement over
+existing state-of-the-art approaches, heralding a new era in short-term load
+forecasting.
+
+
+
+
+
+
+ + ☆ A computationally lightweight safe learning algorithm + + +
+ Safety is an essential asset when learning control policies for physical +systems, as violating safety constraints during training can lead to expensive +hardware damage. In response to this need, the field of safe learning has +emerged with algorithms that can provide probabilistic safety guarantees +without knowledge of the underlying system dynamics. Those algorithms often +rely on Gaussian process inference. Unfortunately, Gaussian process inference +scales cubically with the number of data points, limiting applicability to +high-dimensional and embedded systems. In this paper, we propose a safe +learning algorithm that provides probabilistic safety guarantees but leverages +the Nadaraya-Watson estimator instead of Gaussian processes. For the +Nadaraya-Watson estimator, we can reach logarithmic scaling with the number of +data points. We provide theoretical guarantees for the estimates, embed them +into a safe learning algorithm, and show numerical experiments on a simulated +seven-degrees-of-freedom robot manipulator. + +
+
+ comment: Accepted final version to appear in: Proc. of the IEEE Conference on + Decision and Control +
+
+
+
+
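+ The Nadaraya-Watson estimator at the core of this safe-learning algorithm is
+a standard kernel regressor; below is a minimal numpy sketch with a Gaussian
+kernel and an illustrative bandwidth, not the paper's implementation.
+
+import numpy as np
+
+def nadaraya_watson(x_query, X_train, y_train, h=0.5):
+    # Gaussian kernel weights between the query and every training input.
+    d2 = np.sum((X_train - x_query) ** 2, axis=1)
+    w = np.exp(-d2 / (2.0 * h ** 2))
+    # Kernel-weighted local average of the training targets.
+    return np.sum(w * y_train) / np.sum(w)
+
+# Example: noisy samples of sin(x) in one dimension.
+rng = np.random.default_rng(0)
+X = rng.uniform(-3, 3, size=(200, 1))
+y = np.sin(X[:, 0]) + 0.1 * rng.standard_normal(200)
+print(nadaraya_watson(np.array([1.0]), X, y))  # close to sin(1.0) ~ 0.84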
+ + ☆ Dataset Generation and Bonobo Classification from Weakly Labelled Videos + + +
+ This paper presents a bonobo detection and classification pipeline built from +the commonly used machine learning methods. Such application is motivated by +the need to test bonobos in their enclosure using touch screen devices without +human assistance. This work introduces a newly acquired dataset based on bonobo +recordings generated semi-automatically. The recordings are weakly labelled and +fed to a macaque detector in order to spatially detect the individual present +in the video. Handcrafted features coupled with different classification +algorithms and deep-learning methods using a ResNet architecture are +investigated for bonobo identification. Performance is compared in terms of +classification accuracy on the splits of the database using different data +separation methods. We demonstrate the importance of data preparation and how a +wrong data separation can lead to false good results. Finally, after a +meaningful separation of the data, the best classification performance is +obtained using a fine-tuned ResNet model and reaches 75% of accuracy. + +
+
+ comment: IntelliSys 2023 paper +
+
+
+
+
+ + ☆ How adversarial attacks can disrupt seemingly stable accurate + classifiers + + +
+ Adversarial attacks dramatically change the output of an otherwise accurate +learning system using a seemingly inconsequential modification to a piece of +input data. Paradoxically, empirical evidence indicates that even systems which +are robust to large random perturbations of the input data remain susceptible +to small, easily constructed, adversarial perturbations of their inputs. Here, +we show that this may be seen as a fundamental feature of classifiers working +with high dimensional input data. We introduce a simple generic and +generalisable framework for which key behaviours observed in practical systems +arise with high probability -- notably the simultaneous susceptibility of the +(otherwise accurate) model to easily constructed adversarial attacks, and +robustness to random perturbations of the input data. We confirm that the same +phenomena are directly observed in practical neural networks trained on +standard image classification problems, where even large additive random noise +fails to trigger the adversarial instability of the network. A surprising +takeaway is that even small margins separating a classifier's decision surface +from training and testing data can hide adversarial susceptibility from being +detected using randomly sampled perturbations. Counterintuitively, using +additive noise during training or testing is therefore inefficient for +eradicating or detecting adversarial examples, and more demanding adversarial +training is required. + +
+
+ comment: 11 pages, 8 figures, additional supplementary materials +
+
+
+
+
+ + ☆ Alzheimer Disease Detection from Raman Spectroscopy of the Cerebrospinal + Fluid via Topological Machine Learning + + +
+ The cerebrospinal fluid (CSF) of 19 subjects who received a clinical +diagnosis of Alzheimer's disease (AD) as well as of 5 pathological controls +have been collected and analysed by Raman spectroscopy (RS). We investigated +whether the raw and preprocessed Raman spectra could be used to distinguish AD +from controls. First, we applied standard Machine Learning (ML) methods +obtaining unsatisfactory results. Then, we applied ML to a set of topological +descriptors extracted from raw spectra, achieving a very good classification +accuracy (>87%). Although our results are preliminary, they indicate that RS +and topological analysis together may provide an effective combination to +confirm or disprove a clinical diagnosis of AD. The next steps will include +enlarging the dataset of CSF samples to validate the proposed method better +and, possibly, to understand if topological data analysis could support the +characterization of AD subtypes. + +
+
+ comment: Accepted for inclusion in AITA 2023 (http://aita.isti.cnr.it/)
+
+
+
+
+
+ + ☆ Towards Comparable Knowledge Distillation in Semantic Image Segmentation ECML + + +
+ Knowledge Distillation (KD) is one proposed solution to large model sizes and +slow inference speed in semantic segmentation. In our research we identify 25 +proposed distillation loss terms from 14 publications in the last 4 years. +Unfortunately, a comparison of terms based on published results is often +impossible, because of differences in training configurations. A good +illustration of this problem is the comparison of two publications from 2022. +Using the same models and dataset, Structural and Statistical Texture +Distillation (SSTKD) reports an increase of student mIoU of 4.54 and a final +performance of 29.19, while Adaptive Perspective Distillation (APD) only +improves student performance by 2.06 percentage points, but achieves a final +performance of 39.25. The reason for such extreme differences is often a +suboptimal choice of hyperparameters and a resulting underperformance of the +student model used as reference point. In our work, we reveal problems of +insufficient hyperparameter tuning by showing that distillation improvements of +two widely accepted frameworks, SKD and IFVD, vanish when hyperparameters are +optimized sufficiently. To improve comparability of future research in the +field, we establish a solid baseline for three datasets and two student models +and provide extensive information on hyperparameter tuning. We find that only +two out of eight techniques can compete with our simple baseline on the ADE20K +dataset. + +
+
+ comment: Accepted by the ECML PKDD 2023 workshop track: Simplification, + Compression, Efficiency, and Frugality for Artificial Intelligence (SCEFA). + This preprint has not undergone peer review or any post-submission + improvements or corrections +
+
+
+
+
+ + ☆ Characterizing Lipschitz Stability of GNN for Fairness + + +
+ The Lipschitz bound, a technique from robust statistics, can limit the
+maximum changes in the output with respect to the input, taking into account
+associated irrelevant biased factors. It is an efficient and provable method
+for examining the output stability of machine learning models without incurring
+additional computation costs. Recently, Graph Neural Networks (GNNs), which
+operate on non-Euclidean data, have gained significant attention. However, no
+previous research has investigated the GNN Lipschitz bounds to shed light on
+stabilizing model outputs, especially when working on non-Euclidean data with
+inherent biases. Given the inherent biases in common graph data used for GNN
+training, constraining the GNN output perturbations induced by input biases,
+and thereby safeguarding fairness during training, poses a serious challenge.
+Recently, despite the Lipschitz constant's use in controlling the stability of
+Euclidean neural networks, the calculation of the precise Lipschitz constant
+remains elusive for non-Euclidean neural networks like GNNs, especially within
+fairness contexts. To narrow this gap, we begin with the general GNNs operating
+on an attributed graph, and formulate a Lipschitz bound to limit the changes in
+the output regarding biases associated with the input. Additionally, we
+theoretically analyze how the Lipschitz constant of a GNN model could constrain
+the output perturbations induced by biases learned from data for fairness
+training. We experimentally validate the Lipschitz bound's effectiveness in
+limiting biases of the model output. Finally, from a training dynamics
+perspective, we demonstrate why the theoretical Lipschitz bound can effectively
+guide GNN training towards a better trade-off between accuracy and fairness.
+
+
+
+
+
+
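+ For orientation, the Lipschitz bound referred to here is the constant $L$ in
+$\| f(x) - f(x') \| \le L \, \| x - x' \|$. For a single message-passing layer
+$H' = \sigma(\hat{A} H W)$ with a 1-Lipschitz elementwise activation $\sigma$,
+a coarse layer-wise bound with respect to the node-feature matrix is
+ $$ \|\sigma(\hat{A} H W) - \sigma(\hat{A} \tilde{H} W)\|_F
+ \le \|\hat{A}\|_2 \, \|W\|_2 \, \|H - \tilde{H}\|_F ; $$
+the bound formulated in the paper is specific to bias-related input
+perturbations on attributed graphs and differs from this generic product of
+spectral norms.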
+ + ☆ Insights Into the Inner Workings of Transformer Models for Protein + Function Prediction + + +
+ Motivation: We explored how explainable AI (XAI) can help to shed light into +the inner workings of neural networks for protein function prediction, by +extending the widely used XAI method of integrated gradients such that latent +representations inside of transformer models, which were finetuned to Gene +Ontology term and Enzyme Commission number prediction, can be inspected too. +Results: The approach enabled us to identify amino acids in the sequences that +the transformers pay particular attention to, and to show that these relevant +sequence parts reflect expectations from biology and chemistry, both in the +embedding layer and inside of the model, where we identified transformer heads +with a statistically significant correspondence of attribution maps with ground +truth sequence annotations (e.g., transmembrane regions, active sites) across +many proteins. Availability and Implementation: Source code can be accessed at +https://github.com/markuswenzel/xai-proteins . + +
+
+ comment: 20 pages, 9 figures, 4 tables, source code available at + https://github.com/markuswenzel/xai-proteins +
+
+
+
+
+ + ☆ Understanding Self-Supervised Learning of Speech Representation via + Invariance and Redundancy Reduction ICASSP 2024 + + +
+ The choice of the objective function is crucial in emerging high-quality +representations from self-supervised learning. This paper investigates how +different formulations of the Barlow Twins (BT) objective impact downstream +task performance for speech data. We propose Modified Barlow Twins (MBT) with +normalized latents to enforce scale-invariance and evaluate on speaker +identification, gender recognition and keyword spotting tasks. Our results show +MBT improves representation generalization over original BT, especially when +fine-tuning with limited target data. This highlights the importance of +designing objectives that encourage invariant and transferable representations. +Our analysis provides insights into how the BT learning objective can be +tailored to produce speech representations that excel when adapted to new +downstream tasks. This study is an important step towards developing reusable +self-supervised speech representations. + +
+
+ comment: 6 pages, 1 figure, in submission to ICASSP 2024 +
+
+
+
+
+ + ☆ Filtration Surfaces for Dynamic Graph Classification + + +
+ Existing approaches for classifying dynamic graphs either lift graph kernels +to the temporal domain, or use graph neural networks (GNNs). However, current +baselines have scalability issues, cannot handle a changing node set, or do not +take edge weight information into account. We propose filtration surfaces, a +novel method that is scalable and flexible, to alleviate said restrictions. We +experimentally validate the efficacy of our model and show that filtration +surfaces outperform previous state-of-the-art baselines on datasets that rely +on edge weight information. Our method does so while being either completely +parameter-free or having at most one parameter, and yielding the lowest overall +standard deviation. + +
+
+
+
+
+ + ☆ Your Battery Is a Blast! Safeguarding Against Counterfeit Batteries with + Authentication + + +
+ Lithium-ion (Li-ion) batteries are the primary power source in various +applications due to their high energy and power density. Their market was +estimated to be up to 48 billion U.S. dollars in 2022. However, the widespread +adoption of Li-ion batteries has resulted in counterfeit cell production, which +can pose safety hazards to users. Counterfeit cells can cause explosions or +fires, and their prevalence in the market makes it difficult for users to +detect fake cells. Indeed, current battery authentication methods can be +susceptible to advanced counterfeiting techniques and are often not adaptable +to various cells and systems. In this paper, we improve the state of the art on +battery authentication by proposing two novel methodologies, DCAuth and +EISthentication, which leverage the internal characteristics of each cell +through Machine Learning models. Our methods automatically authenticate +lithium-ion battery models and architectures using data from their regular +usage without the need for any external device. They are also resilient to the +most common and critical counterfeit practices and can scale to several +batteries and devices. To evaluate the effectiveness of our proposed +methodologies, we analyze time-series data from a total of 20 datasets that we +have processed to extract meaningful features for our analysis. Our methods +achieve high accuracy in battery authentication for both architectures (up to +0.99) and models (up to 0.96). Moreover, our methods offer comparable +identification performances. By using our proposed methodologies, manufacturers +can ensure that devices only use legitimate batteries, guaranteeing the +operational state of any system and safety measures for the users. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Interactive Hyperparameter Optimization in Multi-Objective Problems via + Preference Learning + + +
+ Hyperparameter optimization (HPO) is important to leverage the full potential +of machine learning (ML). In practice, users are often interested in +multi-objective (MO) problems, i.e., optimizing potentially conflicting +objectives, like accuracy and energy consumption. To tackle this, the vast +majority of MO-ML algorithms return a Pareto front of non-dominated machine +learning models to the user. Optimizing the hyperparameters of such algorithms +is non-trivial as evaluating a hyperparameter configuration entails evaluating +the quality of the resulting Pareto front. In literature, there are known +indicators that assess the quality of a Pareto front (e.g., hypervolume, R2) by +quantifying different properties (e.g., volume, proximity to a reference +point). However, choosing the indicator that leads to the desired Pareto front +might be a hard task for a user. In this paper, we propose a human-centered +interactive HPO approach tailored towards multi-objective ML leveraging +preference learning to extract desiderata from users that guide the +optimization. Instead of relying on the user guessing the most suitable +indicator for their needs, our approach automatically learns an appropriate +indicator. Concretely, we leverage pairwise comparisons of distinct Pareto +fronts to learn such an appropriate quality indicator. Then, we optimize the +hyperparameters of the underlying MO-ML algorithm towards this learned +indicator using a state-of-the-art HPO approach. In an experimental study +targeting the environmental impact of ML, we demonstrate that our approach +leads to substantially better Pareto fronts compared to optimizing based on a +wrong indicator pre-selected by the user, and performs comparable in the case +of an advanced user knowing which indicator to pick. + +
+
+
+
+
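+ Hypervolume, one of the quality indicators mentioned above, measures the
+objective-space volume dominated by a Pareto front relative to a reference
+point. A minimal two-objective (minimization) sketch, not taken from the paper:
+
+import numpy as np
+
+def hypervolume_2d(front, ref):
+    # front: (n, 2) array of mutually non-dominated points (minimization).
+    # ref:   reference point worse than every front point in both objectives.
+    pts = front[np.argsort(front[:, 0])]           # sort by the first objective
+    hv = 0.0
+    for i, (f1, f2) in enumerate(pts):
+        next_f1 = pts[i + 1, 0] if i + 1 < len(pts) else ref[0]
+        hv += (next_f1 - f1) * (ref[1] - f2)       # area of the strip above this point
+    return hv
+
+front = np.array([[0.0, 2.0], [2.0, 0.0]])
+print(hypervolume_2d(front, ref=np.array([3.0, 3.0])))  # 5.0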
+ + ☆ DTW+S: Shape-based Comparison of Time-series with Ordered Local Trend + + +
+ Measuring distance or similarity between time-series data is a fundamental +aspect of many applications including classification and clustering. Existing +measures may fail to capture similarities due to local trends (shapes) and may +even produce misleading results. Our goal is to develop a measure that looks +for similar trends occurring around similar times and is easily interpretable +for researchers in applied domains. This is particularly useful for +applications where time-series have a sequence of meaningful local trends that +are ordered, such as in epidemics (a surge to an increase to a peak to a +decrease). We propose a novel measure, DTW+S, which creates an interpretable +"closeness-preserving" matrix representation of the time-series, where each +column represents local trends, and then it applies Dynamic Time Warping to +compute distances between these matrices. We present a theoretical analysis +that supports the choice of this representation. We demonstrate the utility of +DTW+S in ensemble building and clustering of epidemic curves. We also +demonstrate that our approach results in better classification compared to +Dynamic Time Warping for a class of datasets, particularly when local trends +rather than scale play a decisive role. + +
+
+ comment: 11 pages, 13 figures +
+
+
+
+
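+ The Dynamic Time Warping step that DTW+S applies to its "closeness-preserving"
+matrix representation is the classic dynamic program below; the matrix
+representation itself is the paper's contribution and is not reproduced here.
+
+import numpy as np
+
+def dtw_distance(a, b):
+    # Classic O(len(a) * len(b)) dynamic program with absolute-difference cost.
+    n, m = len(a), len(b)
+    D = np.full((n + 1, m + 1), np.inf)
+    D[0, 0] = 0.0
+    for i in range(1, n + 1):
+        for j in range(1, m + 1):
+            cost = abs(a[i - 1] - b[j - 1])
+            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
+    return D[n, m]
+
+# Identical shapes that are merely shifted in time align with zero cost.
+print(dtw_distance([0, 1, 2, 3, 2, 1], [0, 0, 1, 2, 3, 2, 1]))  # 0.0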
+ + ☆ Sparse Federated Training of Object Detection in the Internet of + Vehicles + + +
+ As an essential component of the Intelligent Transportation System (ITS), the
+Internet of Vehicles (IoV) plays a vital role in alleviating traffic issues.
+Object detection is one of the key technologies in the IoV, which has been
+widely used to provide traffic management services by analyzing timely and
+sensitive vehicle-related information. However, the current object detection
+methods are mostly based on centralized deep training, that is, the sensitive
+data obtained by edge devices need to be uploaded to the server, which raises
+privacy concerns. To mitigate such privacy leakage, we first propose a
+federated learning-based framework, where well-trained local models are shared
+with the central server. However, since edge devices usually have limited
+computing power, and IoVs impose a strict requirement of low latency, we
+further propose a sparse training process on edge devices, which can
+effectively lighten the model and ensure its training efficiency on edge
+devices, thereby reducing communication overheads. In addition, due to the
+diverse computing capabilities and dynamic environment, different sparsity
+rates are applied to edge devices. To further guarantee the performance, we
+propose FedWeg, an improved aggregation scheme based on FedAvg, which is
+designed using the inverse ratio of sparsity rates. Experiments on a real-life
+dataset using YOLO show that the proposed scheme can achieve the required
+object detection rate while saving considerable communication costs.
+
+
+
+
+
+
+ + ☆ Evaluating the Efficacy of Supervised Learning vs Large Language Models + for Identifying Cognitive Distortions and Suicidal Risks in Chinese Social + Media + + +
+ Large language models, particularly those akin to the rapidly progressing GPT +series, are gaining traction for their expansive influence. While there is keen +interest in their applicability within medical domains such as psychology, +tangible explorations on real-world data remain scant. Concurrently, users on +social media platforms are increasingly vocalizing personal sentiments; under +specific thematic umbrellas, these sentiments often manifest as negative +emotions, sometimes escalating to suicidal inclinations. Timely discernment of +such cognitive distortions and suicidal risks is crucial to effectively +intervene and potentially avert dire circumstances. Our study ventured into +this realm by experimenting on two pivotal tasks: suicidal risk and cognitive +distortion identification on Chinese social media platforms. Using supervised +learning as a baseline, we examined and contrasted the efficacy of large +language models via three distinct strategies: zero-shot, few-shot, and +fine-tuning. Our findings revealed a discernible performance gap between the +large language models and traditional supervised learning approaches, primarily +attributed to the models' inability to fully grasp subtle categories. Notably, +while GPT-4 outperforms its counterparts in multiple scenarios, GPT-3.5 shows +significant enhancement in suicide risk classification after fine-tuning. To +our knowledge, this investigation stands as the maiden attempt at gauging large +language models on Chinese social media tasks. This study underscores the +forward-looking and transformative implications of using large language models +in the field of psychology. It lays the groundwork for future applications in +psychological research and practice. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Trinary Decision Trees for missing value handling + + +
+ This paper introduces the Trinary decision tree, an algorithm designed to
+improve the handling of missing data in decision tree regressors and
+classifiers. Unlike other approaches, the Trinary decision tree does not assume
+that missing values contain any information about the response. Both
+theoretical calculations on estimator bias and numerical illustrations using
+real data sets are presented to compare its performance with established
+algorithms in different missing data scenarios (Missing Completely at Random
+(MCAR) and Informative Missingness (IM)). Notably, the Trinary tree
+outperforms its peers in MCAR settings, especially when data is only missing
+out-of-sample, while lagging behind in IM settings. A hybrid model, the
+TrinaryMIA tree, which combines the Trinary tree and the Missing In Attributes
+(MIA) approach, shows robust performance in all types of missingness. Despite
+the potential drawback of slower training speed, the Trinary tree offers a
+promising and more accurate method of handling missing data in decision tree
+algorithms.
+
+
+
+
+
+
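+ A hedged sketch of the routing idea: each internal node of a Trinary tree has
+a left, a right, and a dedicated "missing" child, so samples with a missing
+feature value follow their own branch instead of being imputed. The split
+criterion and the TrinaryMIA hybrid are described in the paper; the toy tree
+below is hand-built for illustration only.
+
+import math
+
+class TrinaryNode:
+    def __init__(self, feature=None, threshold=None,
+                 left=None, right=None, missing=None, value=None):
+        self.feature, self.threshold = feature, threshold
+        self.left, self.right, self.missing = left, right, missing
+        self.value = value  # prediction stored at leaves
+
+def predict(node, x):
+    # Route one sample down the tree; NaN/None feature values take the missing branch.
+    while node.value is None:
+        v = x[node.feature]
+        if v is None or (isinstance(v, float) and math.isnan(v)):
+            node = node.missing
+        elif v <= node.threshold:
+            node = node.left
+        else:
+            node = node.right
+    return node.value
+
+leaf = lambda y: TrinaryNode(value=y)
+root = TrinaryNode(feature=0, threshold=0.5,
+                   left=leaf(0.0), right=leaf(1.0), missing=leaf(0.5))
+print(predict(root, [0.2]), predict(root, [0.9]), predict(root, [float("nan")]))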
+ + ☆ On the dynamics of multi agent nonlinear filtering and learning + + +
+ Multiagent systems aim to accomplish highly complex learning tasks through
+decentralised consensus-seeking dynamics, and their use has garnered a great
+deal of attention in the signal processing and computational intelligence
+societies. This article examines the behaviour of multiagent networked systems
+with nonlinear filtering/learning dynamics. To this end, a general formulation
+for the actions of an agent in multiagent networked systems is presented and
+conditions for achieving a cohesive learning behaviour are given. Importantly,
+applications of the so-derived framework to distributed and federated learning
+scenarios are presented.
+
+
+
+
+
+
+ + ☆ MVD:A Novel Methodology and Dataset for Acoustic Vehicle Type + Classification + + +
+ Rising urban populations have led to a surge in vehicle use and made traffic
+monitoring and management indispensable. Acoustic traffic monitoring (ATM)
+offers a cost-effective and efficient alternative to more computationally
+expensive methods of monitoring traffic such as those involving computer vision
+technologies. In this paper, we present MVD and MVDA: two open datasets for the
+development of acoustic traffic monitoring and vehicle-type classification
+algorithms, which contain audio recordings of moving vehicles. The datasets
+contain four classes: Trucks, Cars, Motorbikes, and a No-vehicle class.
+Additionally, we propose a novel and efficient way to accurately classify these
+acoustic signals using cepstrum- and spectrum-based local and global audio
+features, and a multi-input neural network. Experimental results show that our
+methodology improves upon the established baselines of previous works and
+achieves an accuracy of 91.98% and 96.66% on the MVD and MVDA datasets,
+respectively. Finally, the proposed model was deployed through an Android
+application to make it accessible for testing and demonstrate its efficacy.
+
+
+
+
+
+
+ + ☆ Subgraph-based Tight Frames on Graphs with Compact Supports and + Vanishing Moments + + +
+ In this work, we propose a novel and general method to construct tight
+frames on graphs with compact supports based on a series of hierarchical
+partitions. Starting from our abstract construction that generalizes previous
+methods based on partition trees, we are able to flexibly incorporate subgraph
+Laplacians into our design of graph frames. Consequently, our general methods
+permit adjusting the (subgraph) vanishing moments of the framelets and extra
+properties, such as directionality, for efficiently representing graph signals
+with path-like supports. Several variants are explicitly defined and tested.
+Experimental results show our proposed graph frames perform superiorly in
+non-linear approximation tasks.
+
+
+
+
+
+
+ + ☆ Feature Enhancer Segmentation Network (FES-Net) for Vessel Segmentation + + +
+ Diseases such as diabetic retinopathy and age-related macular degeneration +pose a significant risk to vision, highlighting the importance of precise +segmentation of retinal vessels for the tracking and diagnosis of progression. +However, existing vessel segmentation methods that heavily rely on +encoder-decoder structures struggle to capture contextual information about +retinal vessel configurations, leading to challenges in reconciling semantic +disparities between encoder and decoder features. To address this, we propose a +novel feature enhancement segmentation network (FES-Net) that achieves accurate +pixel-wise segmentation without requiring additional image enhancement steps. +FES-Net directly processes the input image and utilizes four prompt +convolutional blocks (PCBs) during downsampling, complemented by a shallow +upsampling approach to generate a binary mask for each class. We evaluate the +performance of FES-Net on four publicly available state-of-the-art datasets: +DRIVE, STARE, CHASE, and HRF. The evaluation results clearly demonstrate the +superior performance of FES-Net compared to other competitive approaches +documented in the existing literature. + +
+
+
+
+
+ + ☆ A Robust Negative Learning Approach to Partial Domain Adaptation Using + Source Prototypes + + +
+ This work proposes a robust Partial Domain Adaptation (PDA) framework that +mitigates the negative transfer problem by incorporating a robust +target-supervision strategy. It leverages ensemble learning and includes +diverse, complementary label feedback, alleviating the effect of incorrect +feedback and promoting pseudo-label refinement. Rather than relying exclusively +on first-order moments for distribution alignment, our approach offers explicit +objectives to optimize intra-class compactness and inter-class separation with +the inferred source prototypes and highly-confident target samples in a +domain-invariant fashion. Notably, we ensure source data privacy by eliminating +the need to access the source data during the adaptation phase through a priori +inference of source prototypes. We conducted a series of comprehensive +experiments, including an ablation analysis, covering a range of partial domain +adaptation tasks. Comprehensive evaluations on benchmark datasets corroborate +our framework's enhanced robustness and generalization, demonstrating its +superiority over existing state-of-the-art PDA approaches. + +
+
+
+
+
+ + ☆ Efficient Single Object Detection on Image Patches with Early Exit + Enhanced High-Precision CNNs + + +
+ This paper proposes a novel approach for detecting objects using mobile
+robots in the context of the RoboCup Standard Platform League, with a primary
+focus on detecting the ball. The challenge lies in detecting a dynamic object
+in varying lighting conditions and blurred images caused by fast movements. To
+address this challenge, the paper presents a convolutional neural network
+architecture designed specifically for computationally constrained robotic
+platforms. The proposed CNN is trained to achieve high-precision classification
+of single objects in image patches and to determine their precise spatial
+positions. The paper further integrates Early Exits into the existing
+high-precision CNN architecture to reduce the computational cost of easily
+rejectable cases in the background class. The training process involves a
+composite loss function based on confidence and positional losses with dynamic
+weighting and data augmentation. The proposed approach achieves a precision of
+100% on the validation dataset and a recall of almost 87%, while maintaining an
+execution time of around 170 $\mu$s per hypothesis. By combining the proposed
+approach with an Early Exit, a runtime optimization of more than 28%, on
+average, can be achieved compared to the original CNN. Overall, this paper
+provides an efficient solution for enhanced detection of objects, especially
+the ball, in computationally constrained robotic platforms.
+
+
+
+
+
+
+ + ☆ Privacy-preserving Continual Federated Clustering via Adaptive Resonance + Theory + + +
+ With the increasing importance of data privacy protection, various +privacy-preserving machine learning methods have been proposed. In the +clustering domain, various algorithms with a federated learning framework +(i.e., federated clustering) have been actively studied and showed high +clustering performance while preserving data privacy. However, most of the base +clusterers (i.e., clustering algorithms) used in existing federated clustering +algorithms need to specify the number of clusters in advance. These algorithms, +therefore, are unable to deal with data whose distributions are unknown or +continually changing. To tackle this problem, this paper proposes a +privacy-preserving continual federated clustering algorithm. In the proposed +algorithm, an adaptive resonance theory-based clustering algorithm capable of +continual learning is used as a base clusterer. Therefore, the proposed +algorithm inherits the ability of continual learning. Experimental results with +synthetic and real-world datasets show that the proposed algorithm has superior +clustering performance to state-of-the-art federated clustering algorithms +while realizing data privacy protection and continual learning ability. The +source code is available at \url{https://github.com/Masuyama-lab/FCAC}. + +
+
+ comment: This paper is currently under review. arXiv admin note: substantial + text overlap with arXiv:2305.01507 +
+
+
+
+
+ + ☆ Fast FixMatch: Faster Semi-Supervised Learning with Curriculum Batch + Size + + +
+ Advances in Semi-Supervised Learning (SSL) have almost entirely closed the
+gap between SSL and Supervised Learning at a fraction of the number of labels.
+However, recent performance improvements have often come \textit{at the cost of
+significantly increased training computation}. To address this, we propose
+Curriculum Batch Size (CBS), \textit{an unlabeled batch size curriculum which
+exploits the natural training dynamics of deep neural networks.} A small
+unlabeled batch size is used at the beginning of training and is gradually
+increased towards the end of training. A fixed curriculum is used regardless of
+dataset, model or number of epochs, and reduced training computation is
+demonstrated in all settings. We apply CBS, strong labeled augmentation, and
+Curriculum Pseudo Labeling (CPL) \citep{FlexMatch} to FixMatch \citep{FixMatch}
+and term the new SSL algorithm Fast FixMatch. We perform an ablation study to
+show that strong labeled augmentation and/or CPL do not significantly reduce
+training computations, but, in synergy with CBS, they achieve optimal
+performance. Fast FixMatch also achieves substantially higher data utilization
+compared to previous state-of-the-art. Fast FixMatch achieves between
+$2.1\times$ - $3.4\times$ reduced training computations on CIFAR-10 with all
+but 40, 250 and 4000 labels removed, compared to vanilla FixMatch, while
+attaining the same cited state-of-the-art error rate \citep{FixMatch}. Similar
+results are achieved for CIFAR-100, SVHN and STL-10. Finally, Fast FixMatch
+achieves between $2.6\times$ - $3.3\times$ reduced training computations in
+federated SSL tasks and online/streaming learning SSL tasks, which further
+demonstrates the generalizability of Fast FixMatch to different scenarios and
+tasks.
+
+
+
+
+
+
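+ The Curriculum Batch Size idea can be pictured as a ramp from a small to the
+full unlabeled batch size over training. The schedule actually used by Fast
+FixMatch is defined in the paper; the linear ramp and the batch sizes below are
+assumptions purely for illustration.
+
+def unlabeled_batch_size(step, total_steps, min_bs=64, max_bs=448):
+    # Grow the unlabeled batch size linearly from min_bs to max_bs over training.
+    frac = min(step / max(total_steps - 1, 1), 1.0)
+    return int(round(min_bs + frac * (max_bs - min_bs)))
+
+for s in [0, 250_000, 500_000, 999_999]:
+    print(s, unlabeled_batch_size(s, total_steps=1_000_000))
+# prints 64, 160, 256 and 448 unlabeled samples per batch, respectively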
+ + ☆ Cross-Image Context Matters for Bongard Problems + + +
+ Current machine learning methods struggle to solve Bongard problems, which +are a type of IQ test that requires deriving an abstract "concept" from a set +of positive and negative "support" images, and then classifying whether or not +a new query image depicts the key concept. On Bongard-HOI, a benchmark for +natural-image Bongard problems, existing methods have only reached 66% accuracy +(where chance is 50%). Low accuracy is often attributed to neural nets' lack of +ability to find human-like symbolic rules. In this work, we point out that many +existing methods are forfeiting accuracy due to a much simpler problem: they do +not incorporate information contained in the support set as a whole, and rely +instead on information extracted from individual supports. This is a critical +issue, because unlike in few-shot learning tasks concerning object +classification, the "key concept" in a typical Bongard problem can only be +distinguished using multiple positives and multiple negatives. We explore a +variety of simple methods to take this cross-image context into account, and +demonstrate substantial gains over prior methods, leading to new +state-of-the-art performance on Bongard-LOGO (75.3%) and Bongard-HOI (72.45%) +and strong performance on the original Bongard problem set (60.84%). + +
+
+ comment: Main paper: 7 pages, Appendix: 10 pages, 30 figures. Code: + https://github.com/nraghuraman/bongard-context +
+
+
+
+
+ + ☆ Multi-Modality Guidance Network For Missing Modality Inference + + +
+ Multimodal models have achieved significant success in recent years. Standard
+multimodal approaches often assume unchanged modalities from training stage to
+inference stage. In practice, however, many scenarios fail to satisfy such
+assumptions with missing modalities during inference, leading to limitations on
+where multimodal models can be applied. While existing methods mitigate the
+problem by reconstructing the missing modalities, this incurs unnecessary
+computational cost, which could be just as critical, especially for large,
+deployed systems. To solve the problem from both sides, we propose a novel
+guidance network that promotes knowledge sharing during training, taking
+advantage of the multimodal representations to train better single-modality
+models for inference. A real-life experiment in violence detection shows that
+our proposed framework trains single-modality models that significantly
+outperform their traditionally trained counterparts while maintaining the same
+inference cost.
+
+
+
+
+
+
+ + ☆ Cross-domain Sound Recognition for Efficient Underwater Data Analysis + + +
+ This paper presents a novel deep learning approach for analyzing massive +underwater acoustic data by leveraging a model trained on a broad spectrum of +non-underwater (aerial) sounds. Recognizing the challenge in labeling vast +amounts of underwater data, we propose a two-fold methodology to accelerate +this labor-intensive procedure. + The first part of our approach involves PCA and UMAP visualization of the +underwater data using the feature vectors of an aerial sound recognition model. +This enables us to cluster the data in a two dimensional space and listen to +points within these clusters to understand their defining characteristics. This +innovative method simplifies the process of selecting candidate labels for +further training. + In the second part, we train a neural network model using both the selected +underwater data and the non-underwater dataset. We conducted a quantitative +analysis to measure the precision, recall, and F1 score of our model for +recognizing airgun sounds, a common type of underwater sound. The F1 score +achieved by our model exceeded 84.3%, demonstrating the effectiveness of our +approach in analyzing underwater acoustic data. + The methodology presented in this paper holds significant potential to reduce +the amount of labor required in underwater data analysis and opens up new +possibilities for further research in the field of cross-domain data analysis. + +
+
+ comment: Accepted to APSIPA 2023 +
+
+
+
+
+ + ☆ XGen-7B Technical Report + + +
+ Large Language Models (LLMs) have become ubiquitous across various domains, +transforming the way we interact with information and conduct research. +However, most high-performing LLMs remain confined behind proprietary walls, +hindering scientific progress. Most open-source LLMs, on the other hand, are +limited in their ability to support longer sequence lengths, which is a key +requirement for many tasks that require inference over an input context. To +address this, we have trained XGen, a series of 7B parameter models on up to 8K +sequence length for up to 1.5T tokens. We have also finetuned the XGen models +on public-domain instructional data, creating their instruction-tuned +counterparts (XGen-Inst). We open-source our models for both research +advancements and commercial applications. Our evaluation on standard benchmarks +shows that XGen models achieve comparable or better results when compared with +state-of-the-art open-source LLMs. Our targeted evaluation on long sequence +modeling tasks shows the benefits of our 8K-sequence models over 2K-sequence +open-source LLMs. + +
+
+
+
+
+ + ☆ Broadband Ground Motion Synthesis via Generative Adversarial Neural + Operators: Development and Validation + + +
+ We present a data-driven model for ground-motion synthesis using a Generative +Adversarial Neural Operator (GANO) that combines recent advancements in machine +learning and open access strong motion data sets to generate three-component +acceleration time histories conditioned on moment magnitude ($M$), rupture +distance ($R_{rup}$), time-average shear-wave velocity at the top $30m$ +($V_{S30}$), and tectonic environment or style of faulting. We use Neural +Operators, a resolution invariant architecture that guarantees that the model +training is independent of the data sampling frequency. We first present the +conditional ground-motion synthesis algorithm (referred to heretofore as +cGM-GANO) and discuss its advantages compared to previous work. Next, we verify +the cGM-GANO framework using simulated ground motions generated with the +Southern California Earthquake Center (SCEC) Broadband Platform (BBP). We +lastly train cGM-GANO on a KiK-net dataset from Japan, showing that the +framework can recover the magnitude, distance, and $V_{S30}$ scaling of Fourier +amplitude and pseudo-spectral accelerations. We evaluate cGM-GANO through +residual analysis with the empirical dataset as well as by comparison with +conventional Ground Motion Models (GMMs) for selected ground motion scenarios. +Results show that cGM-GANO produces consistent median scaling with the GMMs for +the corresponding tectonic environments. The largest misfit is observed at +short distances due to the scarcity of training data. With the exception of +short distances, the aleatory variability of the response spectral ordinates is +also well captured, especially for subduction events due to the adequacy of +training data. Applications of the presented framework include generation of +risk-targeted ground motions for site-specific engineering applications. + +
+
+
+
+
+ + ☆ Punctate White Matter Lesion Segmentation in Preterm Infants Powered by + Counterfactually Generative Learning MICCAI + + +
+ Accurate segmentation of punctate white matter lesions (PWMLs) is
+fundamental for the timely diagnosis and treatment of related developmental
+disorders. Automated PWML segmentation from infant brain MR images is
+challenging, considering that the lesions are typically small and low-contrast,
+and the number of lesions may dramatically change across subjects. Existing
+learning-based methods directly apply general network architectures to this
+challenging task, which may fail to capture detailed positional information of
+PWMLs, potentially leading to severe under-segmentations. In this paper, we
+propose to leverage the idea of counterfactual reasoning coupled with the
+auxiliary task of brain tissue segmentation to learn fine-grained positional
+and morphological representations of PWMLs for accurate localization and
+segmentation. A simple and easy-to-implement deep-learning framework (i.e.,
+DeepPWML) is accordingly designed. It combines the lesion counterfactual map
+with the tissue probability map to train a lightweight PWML segmentation
+network, demonstrating state-of-the-art performance on a real clinical dataset
+of infant T1w MR images. The code is available at
+\href{https://github.com/ladderlab-xjtu/DeepPWML}{https://github.com/ladderlab-xjtu/DeepPWML}.
+
+
+
+ comment: 10 pages, 3 figures, Medical Image Computing and Computer Assisted + Intervention(MICCAI) +
+
+
+
+
+ + ☆ Personalized Tucker Decomposition: Modeling Commonality and Peculiarity + on Tensor Data + + +
+ We propose personalized Tucker decomposition (perTucker) to address the +limitations of traditional tensor decomposition methods in capturing +heterogeneity across different datasets. perTucker decomposes tensor data into +shared global components and personalized local components. We introduce a mode +orthogonality assumption and develop a proximal gradient regularized block +coordinate descent algorithm that is guaranteed to converge to a stationary +point. By learning unique and common representations across datasets, we +demonstrate perTucker's effectiveness in anomaly detection, client +classification, and clustering through a simulation study and two case studies +on solar flare detection and tonnage signal classification. + +
+
+
+
+
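+ For reference, a rank-$(r_1, r_2, r_3)$ Tucker decomposition of a third-order
+tensor is written
+ $$ \mathcal{X} \approx \mathcal{G} \times_1 U^{(1)} \times_2 U^{(2)}
+ \times_3 U^{(3)}, $$
+with core tensor $\mathcal{G}$ and factor matrices $U^{(m)}$. As summarized
+above, perTucker additionally splits each dataset's tensor into a shared global
+part and a personalized local part under a mode-orthogonality assumption; the
+precise form of that split is given in the paper.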
+ + ☆ Byzantine-Robust Federated Learning with Variance Reduction and + Differential Privacy + + +
+ Federated learning (FL) is designed to preserve data privacy during model +training, where the data remains on the client side (i.e., IoT devices), and +only model updates of clients are shared iteratively for collaborative +learning. However, this process is vulnerable to privacy attacks and Byzantine +attacks: the local model updates shared throughout the FL network will leak +private information about the local training data, and they can also be +maliciously crafted by Byzantine attackers to disturb the learning. In this +paper, we propose a new FL scheme that guarantees rigorous privacy and +simultaneously enhances system robustness against Byzantine attacks. Our +approach introduces sparsification- and momentum-driven variance reduction into +the client-level differential privacy (DP) mechanism, to defend against +Byzantine attackers. The security design does not violate the privacy guarantee +of the client-level DP mechanism; hence, our approach achieves the same +client-level DP guarantee as the state-of-the-art. We conduct extensive +experiments on both IID and non-IID datasets and different tasks and evaluate +the performance of our approach against different Byzantine attacks by +comparing it with state-of-the-art defense methods. The results of our +experiments show the efficacy of our framework and demonstrate its ability to +improve system robustness against Byzantine attacks while achieving a strong +privacy guarantee. + +
+
+
+
+
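+ A minimal sketch of the client-level DP building block referenced above: clip
+each client's model update and add Gaussian noise at aggregation. The paper's
+sparsification- and momentum-driven variance reduction sits on top of this and
+is not reproduced here; the constants below are illustrative only.
+
+import numpy as np
+
+def dp_aggregate(client_updates, clip_norm=1.0, noise_multiplier=1.0, rng=None):
+    # client_updates: list of 1-D numpy arrays (flattened model deltas).
+    rng = rng or np.random.default_rng(0)
+    clipped = []
+    for u in client_updates:
+        scale = min(1.0, clip_norm / (np.linalg.norm(u) + 1e-12))  # L2 clipping
+        clipped.append(u * scale)
+    mean = np.mean(clipped, axis=0)
+    # Gaussian noise calibrated to the clipping bound and the number of clients.
+    sigma = noise_multiplier * clip_norm / len(client_updates)
+    return mean + rng.normal(0.0, sigma, size=mean.shape)
+
+updates = [np.array([3.0, 4.0]), np.array([0.1, -0.2])]
+print(dp_aggregate(updates))  # noisy average of the clipped updates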
+ + ☆ Equal Long-term Benefit Rate: Adapting Static Fairness Notions to + Sequential Decision Making + + +
+ Decisions made by machine learning models may have lasting impacts over time, +making long-term fairness a crucial consideration. It has been shown that when +ignoring the long-term effect, naively imposing fairness criterion in static +settings can actually exacerbate bias over time. To explicitly address biases +in sequential decision-making, recent works formulate long-term fairness +notions in Markov Decision Process (MDP) framework. They define the long-term +bias to be the sum of static bias over each time step. However, we demonstrate +that naively summing up the step-wise bias can cause a false sense of fairness +since it fails to consider the importance difference of different time steps +during transition. In this work, we introduce a long-term fairness notion +called Equal Long-term Benefit Rate (ELBERT), which explicitly considers +varying temporal importance and adapts static fairness principles to the +sequential setting. Moreover, we show that the policy gradient of Long-term +Benefit Rate can be analytically reduced to standard policy gradient. This +makes standard policy optimization methods applicable for reducing the bias, +leading to our proposed bias mitigation method ELBERT-PO. Experiments on three +sequential decision making environments show that ELBERT-PO significantly +reduces bias and maintains high utility. Code is available at +https://github.com/Yuancheng-Xu/ELBERT. + +
+
+
+
+
+ + ☆ Large Language Models as Optimizers + + +
+ Optimization is ubiquitous. While derivative-based algorithms have been +powerful tools for various problems, the absence of gradient imposes challenges +on many real-world applications. In this work, we propose Optimization by +PROmpting (OPRO), a simple and effective approach to leverage large language +models (LLMs) as optimizers, where the optimization task is described in +natural language. In each optimization step, the LLM generates new solutions +from the prompt that contains previously generated solutions with their values, +then the new solutions are evaluated and added to the prompt for the next +optimization step. We first showcase OPRO on linear regression and traveling +salesman problems, then move on to prompt optimization where the goal is to +find instructions that maximize the task accuracy. With a variety of LLMs, we +demonstrate that the best prompts optimized by OPRO outperform human-designed +prompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. + +
+
+
+
+
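+ The OPRO loop described above can be sketched as follows; llm_generate and
+evaluate are hypothetical placeholders for an LLM call and the task-specific
+scorer (higher is better), not functions from the paper or any library.
+
+def opro(llm_generate, evaluate, seed_solutions, num_steps=20, keep_top=20):
+    # Pool of (score, solution) pairs that is fed back into the meta-prompt.
+    scored = sorted((evaluate(s), s) for s in seed_solutions)
+    for _ in range(num_steps):
+        trajectory = "\n".join(f"solution: {s}  score: {v:.3f}"
+                               for v, s in scored[-keep_top:])
+        meta_prompt = (
+            "Below are previous solutions with their scores (higher is better).\n"
+            + trajectory +
+            "\nPropose a new, different solution that achieves a higher score."
+        )
+        candidate = llm_generate(meta_prompt)   # one LLM call per optimization step
+        scored.append((evaluate(candidate), candidate))
+        scored.sort()
+    return scored[-1][1]                        # best solution found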
+ + ♻ ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", transformer architecture +has led to revolutionary advancements in NLP. The attention layer within the +transformer admits a sequence of input tokens $X$ and makes them interact +through pairwise similarities computed as softmax$(XQK^\top X^\top)$, where +$(K,Q)$ are the trainable key-query parameters. In this work, we establish a +formal equivalence between the optimization geometry of self-attention and a +hard-margin SVM problem that separates optimal input tokens from non-optimal +tokens using linear constraints on the outer-products of token pairs. This +formalism allows us to characterize the implicit bias of 1-layer transformers +optimized with gradient descent: (1) Optimizing the attention layer with +vanishing regularization, parameterized by $(K,Q)$, converges in direction to +an SVM solution minimizing the nuclear norm of the combined parameter +$W=KQ^\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm +objective. We characterize this convergence, highlighting that it can occur +toward locally-optimal directions rather than global ones. (2) Complementing +this, we prove the local/global directional convergence of gradient descent +under suitable geometric conditions. Importantly, we show that +over-parameterization catalyzes global convergence by ensuring the feasibility +of the SVM problem and by guaranteeing a benign optimization landscape devoid +of stationary points. (3) While our theory applies primarily to linear +prediction heads, we propose a more general SVM equivalence that predicts the +implicit bias with nonlinear heads. Our findings are applicable to arbitrary +datasets and their validity is verified via experiments. We also introduce +several open problems and research directions. We believe these findings +inspire the interpretation of transformers as a hierarchy of SVMs that +separates and selects optimal tokens. + +
+
+ comment: minor edits and update global convergence figure +
+
+
+
+
+ + ♻ ☆ Q-Learning for MDPs with General Spaces: Convergence and Near Optimality + via Quantization under Weak Continuity + + +
+ Reinforcement learning algorithms often require finiteness of state and
+action spaces in Markov decision processes (MDPs) (also called controlled
+Markov chains), and various efforts have been made in the literature towards
+the applicability of such algorithms for continuous state and action spaces. In
+this paper, we show that under very mild regularity conditions (in particular,
+involving only weak continuity of the transition kernel of an MDP), Q-learning
+for standard Borel MDPs via quantization of states and actions (called
+Quantized Q-Learning) converges to a limit, and furthermore this limit
+satisfies an optimality equation which leads to near optimality, either with
+explicit performance bounds or with guarantees of asymptotic optimality. Our
+approach builds on (i) viewing quantization as a measurement kernel and thus a
+quantized MDP as a partially observed Markov decision process (POMDP), (ii)
+utilizing near optimality and convergence results of Q-learning for POMDPs, and
+(iii) finally, utilizing the near-optimality of finite state model
+approximations for MDPs with weakly continuous kernels, which we show to
+correspond to the fixed point of the constructed POMDP. Thus, our paper
+presents a very general convergence and approximation result for the
+applicability of Q-learning for continuous MDPs.
+
+
+
+
+
+
+ + ♻ ☆ Max-Margin Token Selection in Attention Mechanism + + +
+ Attention mechanism is a central component of the transformer architecture +which led to the phenomenal success of large language models. However, the +theoretical principles underlying the attention mechanism are poorly +understood, especially its nonconvex optimization dynamics. In this work, we +explore the seminal softmax-attention model $f(\boldsymbol{X})=\langle +\boldsymbol{Xv}, \texttt{softmax}(\boldsymbol{XWp})\rangle$, where +$\boldsymbol{X}$ is the token sequence and +$(\boldsymbol{v},\boldsymbol{W},\boldsymbol{p})$ are trainable parameters. We +prove that running gradient descent on $\boldsymbol{p}$, or equivalently +$\boldsymbol{W}$, converges in direction to a max-margin solution that +separates $\textit{locally-optimal}$ tokens from non-optimal ones. This clearly +formalizes attention as an optimal token selection mechanism. Remarkably, our +results are applicable to general data and precisely characterize +$\textit{optimality}$ of tokens in terms of the value embeddings +$\boldsymbol{Xv}$ and problem geometry. We also provide a broader +regularization path analysis that establishes the margin maximizing nature of +attention even for nonlinear prediction heads. When optimizing $\boldsymbol{v}$ +and $\boldsymbol{p}$ simultaneously with logistic loss, we identify conditions +under which the regularization paths directionally converge to their respective +hard-margin SVM solutions where $\boldsymbol{v}$ separates the input features +based on their labels. Interestingly, the SVM formulation of $\boldsymbol{p}$ +is influenced by the support vector geometry of $\boldsymbol{v}$. Finally, we +verify our theoretical findings via numerical experiments and provide insights. + +
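A toy PyTorch sketch of the model $f(X)=\langle Xv, \mathrm{softmax}(XWp)\rangle$ with gradient descent on $p$ under logistic loss. The data, dimensions, and the fixed $v$ and $W$ are assumptions made only to show the setup; the margin-maximizing behaviour itself is the paper's theoretical result, not something this snippet proves.

```python
import torch

torch.manual_seed(0)
T, d = 6, 4                      # tokens per sequence and embedding dim (assumed)
X = torch.randn(8, T, d)         # a small batch of token sequences
y = torch.sign(torch.randn(8))   # toy binary labels
v = torch.randn(d)               # fixed value vector
W = torch.eye(d)                 # fixed key-query matrix
p = torch.zeros(d, requires_grad=True)

opt = torch.optim.SGD([p], lr=0.1)
for it in range(2000):
    attn = torch.softmax(X @ W @ p, dim=1)               # (batch, T) token weights
    f = torch.einsum('btd,d,bt->b', X, v, attn)          # <Xv, softmax(XWp)>
    loss = torch.nn.functional.softplus(-y * f).mean()   # logistic loss
    opt.zero_grad(); loss.backward(); opt.step()

# As ||p|| grows, the softmax concentrates on the selected tokens.
print(p.norm().item(), torch.softmax(X @ W @ p, dim=1)[0])
```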
+
+ comment: minor edits and update convergence analysis figure +
+
+
+
+
+ + ♻ ☆ Non-inferiority of Deep Learning Acute Ischemic Stroke Segmentation on + Non-Contrast CT Compared to Expert Neuroradiologists + + +
+ To determine if a convolutional neural network (CNN) deep learning model can +accurately segment acute ischemic changes on non-contrast CT compared to +neuroradiologists. Non-contrast CT (NCCT) examinations from 232 acute ischemic +stroke patients who were enrolled in the DEFUSE 3 trial were included in this +study. Three experienced neuroradiologists independently segmented hypodensity +that reflected the ischemic core on each scan. The neuroradiologist with the +most experience (expert A) served as the ground truth for deep learning model +training. The segmentations of two additional neuroradiologists (experts B and +C) were used for testing. The 232 studies were randomly split into training and +test sets. The training set was further randomly divided into 5 folds with +training and validation sets. A 3-dimensional CNN architecture was trained and +optimized to predict the segmentations of expert A from NCCT. The performance +of the model was assessed using a set of volume, overlap, and distance metrics +with non-inferiority thresholds of 20%, 3 ml, and 3 mm. The optimized model +trained on expert A was compared to test experts B and C. We used a one-sided +Wilcoxon signed-rank test to test for the non-inferiority of the model-expert +agreement compared to the inter-expert agreement. The final model reached a +performance of 0.46+-0.09 Surface Dice at Tolerance 5 mm and 0.47+-0.13 Dice on +the ischemic core segmentation task when trained on expert A. Compared to the +two test neuroradiologists, the model-expert agreement was non-inferior to the +inter-expert agreement, p < 0.05. The CNN accurately delineates the hypodense +ischemic core on NCCT in acute ischemic stroke patients with an accuracy +comparable to neuroradiologists. + +
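For readers unfamiliar with the evaluation, here is a hedged sketch of two ingredients mentioned above, the Dice overlap and a one-sided Wilcoxon signed-rank test of the model-expert versus inter-expert agreement, run on placeholder masks rather than the study's data or its exact non-inferiority margins.

```python
import numpy as np
from scipy.stats import wilcoxon

def dice(a, b):
    """Dice overlap between two binary masks."""
    a, b = a.astype(bool), b.astype(bool)
    denom = a.sum() + b.sum()
    return 1.0 if denom == 0 else 2.0 * np.logical_and(a, b).sum() / denom

rng = np.random.default_rng(0)
model_vs_expert, expert_vs_expert = [], []
for _ in range(50):                                  # placeholder cohort
    truth = rng.random((32, 32)) > 0.7               # expert A mask (toy)
    model = truth ^ (rng.random((32, 32)) > 0.9)     # model prediction with noise
    expert_b = truth ^ (rng.random((32, 32)) > 0.9)  # second reader with noise
    model_vs_expert.append(dice(model, expert_b))
    expert_vs_expert.append(dice(truth, expert_b))

# One-sided Wilcoxon signed-rank test in the non-inferiority direction
# (model-expert agreement not worse than inter-expert agreement).
stat, p = wilcoxon(model_vs_expert, expert_vs_expert, alternative='greater')
print(stat, p)
```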
+
+
+
+
+ + ♻ ☆ Explanation Shift: How Did the Distribution Shift Impact the Model? + + +
+ As input data distributions evolve, the predictive performance of machine +learning models tends to deteriorate. In practice, new input data tend to come +without target labels. Then, state-of-the-art techniques model input data +distributions or model prediction distributions and try to understand issues +regarding the interactions between learned models and shifting distributions. +We suggest a novel approach that models how explanation characteristics shift +when affected by distribution shifts. We find that the modeling of explanation +shifts can be a better indicator for detecting out-of-distribution model +behaviour than state-of-the-art techniques. We analyze different types of +distribution shifts using synthetic examples and real-world data sets. We +provide an algorithmic method that allows us to inspect the interaction between +data set features and learned models and compare them to the state-of-the-art. +We release our methods in an open-source Python package, as well as the code +used to reproduce our experiments. + +
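A simplified sketch of the general idea above: compute per-feature attributions on source data and on new data, then train a domain classifier on the attributions; an AUC well above 0.5 signals an explanation shift. Linear-model contributions (coefficient times feature) stand in here for richer explanation methods such as SHAP, so this illustrates the concept rather than the paper's algorithm.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
# Source data and a shifted target set (toy shift on feature 0).
X_src = rng.normal(size=(2000, 5))
y_src = (X_src[:, 0] + 0.5 * X_src[:, 1] > 0).astype(int)
X_new = rng.normal(size=(2000, 5)) + np.array([1.5, 0, 0, 0, 0])

model = LogisticRegression().fit(X_src, y_src)

# Stand-in explanations: per-feature linear contributions coef_j * x_j.
expl_src = X_src * model.coef_
expl_new = X_new * model.coef_

# "Explanation shift detector": classify which domain an explanation came from.
Z = np.vstack([expl_src, expl_new])
d = np.r_[np.zeros(len(expl_src)), np.ones(len(expl_new))]
detector = LogisticRegression().fit(Z, d)
print(roc_auc_score(d, detector.predict_proba(Z)[:, 1]))  # ~0.5 means no detectable shift
```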
+
+ comment: arXiv admin note: text overlap with arXiv:2210.12369 +
+
+
+
+
+ + ♻ ☆ A Generalized Bandsplit Neural Network for Cinematic Audio Source + Separation ICASSP + + +
+ Cinematic audio source separation is a relatively new subtask of audio source +separation, with the aim of extracting the dialogue stem, the music stem, and +the effects stem from their mixture. In this work, we developed a model +generalizing the Bandsplit RNN for any complete or overcomplete partitions of +the frequency axis. Psycho-acoustically motivated frequency scales were used to +inform the band definitions which are now defined with redundancy for more +reliable feature extraction. A loss function motivated by the signal-to-noise +ratio and the sparsity-promoting property of the 1-norm was proposed. We +additionally exploit the information-sharing property of a common-encoder setup +to reduce computational complexity during both training and inference, improve +separation performance for hard-to-generalize classes of sounds, and allow +flexibility during inference time with easily detachable decoders. Our best +model sets the state of the art on the Divide and Remaster dataset with +performance above the ideal ratio mask for the dialogue stem. + +
+
+ comment: Submitted to ICASSP-OJSP 2024 +
+
+
+
+
+ + ♻ ☆ Auto-SDE: Learning effective reduced dynamics from data-driven + stochastic dynamical systems + + +
+ Multiscale stochastic dynamical systems have been widely adopted in +scientific and engineering problems due to their capability of depicting +complex phenomena in many real-world applications. This work is devoted to +investigating the effective reduced dynamics of a slow-fast stochastic +dynamical system. Given short-term observation data satisfying some unknown +slow-fast stochastic system, we propose a novel algorithm, including a neural +network called Auto-SDE, to learn the invariant slow manifold. Our approach +captures the evolutionary nature of a series of time-dependent autoencoder +neural networks with a loss constructed from a discretized stochastic +differential equation. Our algorithm is also shown to be accurate, stable, and +effective through numerical experiments under various evaluation metrics. + +
+
+
+
+
+ + ♻ ☆ DeepAD: A Robust Deep Learning Model of Alzheimer's Disease Progression + for Real-World Clinical Applications + + +
+ The ability to predict the future trajectory of a patient is a key step +toward the development of therapeutics for complex diseases such as Alzheimer's +disease (AD). However, most machine learning approaches developed for +prediction of disease progression are either single-task or single-modality +models, which cannot be directly adapted to our setting of multi-task learning +with high-dimensional images. Moreover, most of those approaches are trained on +a single dataset (i.e., cohort) and thus do not generalize to other cohorts. We +propose a novel multimodal multi-task deep learning model to predict AD +progression by analyzing longitudinal clinical and neuroimaging data from +multiple cohorts. Our proposed model integrates high-dimensional MRI features +from a 3D convolutional neural network with other data modalities, including +clinical and demographic information, to predict the future trajectory of +patients. Our model employs an adversarial loss to alleviate the study-specific +imaging bias, in particular the inter-study domain shifts. In addition, a +Sharpness-Aware Minimization (SAM) optimization technique is applied to further +improve model generalization. The proposed model is trained and tested on +various datasets in order to evaluate and validate the results. Our results +show that 1) our model yields significant improvement over the baseline models, +and 2) models using neuroimaging features extracted from a 3D convolutional +neural network outperform the same models applied to MRI-derived volumetric +features. + +
+
+
+
+
+ + ♻ ☆ USE-Evaluator: Performance Metrics for Medical Image Segmentation Models + with Uncertain, Small or Empty Reference Annotations + + +
+ Performance metrics for medical image segmentation models are used to measure +the agreement between the reference annotation and the predicted segmentation. +Usually, overlap metrics, such as the Dice, are used to evaluate the +performance of these models so that results are comparable. However, there is a +mismatch between the distributions of cases and the difficulty level of +segmentation tasks in public data sets compared to clinical practice. Common +metrics fail to measure the impact of this mismatch, especially for clinical +data sets that include low-signal pathologies, a difficult segmentation task, +and uncertain, small, or empty reference annotations. This limitation may lead +machine learning practitioners to design and optimize models ineffectively. +Dimensions of evaluating clinical value include consideration of the +uncertainty of reference annotations, independence from reference annotation +volume size, and evaluation of the classification of empty reference +annotations. We study how uncertain, small, and empty reference annotations +influence the value of metrics for medical image segmentation on an in-house +data set, regardless of the model. We examine metrics behavior on the +predictions of a standard deep learning framework in order to identify metrics +with clinical value. We compare to a public benchmark data set (BraTS 2019) +with a high-signal pathology and reference annotations that are certain, +larger, and never empty. We show machine learning practitioners how uncertain, +small, or empty reference annotations require a rethinking of evaluation and +optimization procedures. The evaluation code was released to encourage further +analysis of this topic. +https://github.com/SophieOstmeier/UncertainSmallEmpty.git + +
+
+ comment: 16 pages, 10 figures, Published in Medical Image Analysis +
+
+
+
+
+ + ♻ ☆ Polynomial Bounds for Learning Noisy Optical Physical Unclonable + Functions and Connections to Learning With Errors + + +
+ It is shown that a class of optical physical unclonable functions (PUFs) can +be learned to arbitrary precision with arbitrarily high probability, even in +the presence of noise, given access to polynomially many challenge-response +pairs and polynomially bounded computational power, under mild assumptions +about the distributions of the noise and challenge vectors. This extends the +results of R\"uhrmair et al. (2013), who showed a subset of this class of PUFs +to be learnable in polynomial time in the absence of noise, under the +assumption that the optics of the PUF were either linear or had negligible +nonlinear effects. We derive polynomial bounds for the required number of +samples and the computational complexity of a linear regression algorithm, +based on size parameters of the PUF, the distributions of the challenge and +noise vectors, and the probability and accuracy of the regression algorithm, +with an analysis similar to the one done by Bootle et al. (2018), who +demonstrated a learning attack on a poorly implemented version of the Learning +With Errors problem. + +
+
+ comment: 10 pages, 2 figures, submitted to IEEE Transactions on Information + Forensics and Security +
+
+
+
+
+ + ♻ ☆ Off-policy Evaluation in Doubly Inhomogeneous Environments + + +
+ This work aims to study off-policy evaluation (OPE) under scenarios where two +key reinforcement learning (RL) assumptions -- temporal stationarity and +individual homogeneity -- are both violated. To handle the ``double +inhomogeneities", we propose a class of latent factor models for the reward and +observation transition functions, under which we develop a general OPE +framework that consists of both model-based and model-free approaches. To our +knowledge, this is the first paper that develops statistically sound OPE +methods in offline RL with double inhomogeneities. It contributes to a deeper +understanding of OPE in environments where standard RL assumptions are not met, +and provides several practical approaches in these settings. We establish the +theoretical properties of the proposed value estimators and empirically show +that our approach outperforms competing methods that ignore either temporal +nonstationarity or individual heterogeneity. Finally, we illustrate our method +on a data set from the Medical Information Mart for Intensive Care. + +
+
+
+
+
+ + ♻ ☆ Domain Generalization for Mammographic Image Analysis with Contrastive + Learning + + +
+ Deep learning has been shown to effectively address several image analysis +tasks in the computer-aided diagnosis scheme for mammography. The training of +an efficacious deep learning model requires large data with diverse styles and +qualities. The diversity of data often comes from the use of various scanners +from different vendors. In practice, however, it is often infeasible to collect +a sufficient amount of diverse data for training. To this end, a novel +contrastive learning scheme is developed to equip deep learning models with +better style generalization capability. Specifically, a multi-style and +multi-view unsupervised self-learning scheme is carried out to learn feature +embeddings that are robust to style diversity, yielding a pretrained model. +Afterward, the pretrained network is further fine-tuned on the downstream +tasks, e.g., mass detection, matching, BI-RADS rating, and breast density +classification. The proposed method has been evaluated extensively and +rigorously with mammograms from various vendor style domains and several public +datasets. The experimental results suggest that the proposed domain +generalization method can effectively improve the performance of four +mammographic image tasks on data from both seen and unseen domains, and +outperforms many state-of-the-art (SOTA) generalization methods. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2111.10827 +
+
+
+
+
+ + ♻ ☆ Global Optimization for Cardinality-constrained Minimum Sum-of-Squares + Clustering via Semidefinite Programming + + +
+ The minimum sum-of-squares clustering (MSSC), or k-means type clustering, has +been recently extended to exploit prior knowledge on the cardinality of each +cluster. Such knowledge is used to increase performance as well as solution +quality. In this paper, we propose a global optimization approach based on the +branch-and-cut technique to solve the cardinality-constrained MSSC. For the +lower bound routine, we use the semidefinite programming (SDP) relaxation +recently proposed by Rujeerapaiboon et al. [SIAM J. Optim. 29(2), 1211-1239, +(2019)]. However, this relaxation can be used in a branch-and-cut method only +for small-size instances. Therefore, we derive a new SDP relaxation that scales +better with the instance size and the number of clusters. In both cases, we +strengthen the bound by adding polyhedral cuts. Benefiting from a tailored +branching strategy which enforces pairwise constraints, we reduce the +complexity of the problems arising in the children nodes. For the upper bound, +instead, we present a local search procedure that exploits the solution of the +SDP relaxation solved at each node. Computational results show that the +proposed algorithm globally solves, for the first time, real-world instances of +size 10 times larger than those solved by state-of-the-art exact methods. + +
+
+
+
+
+ + ♻ ☆ PGFed: Personalize Each Client's Global Objective for Federated Learning ICCV 2023 + + +
+ Personalized federated learning has received an upsurge of attention due to +the mediocre performance of conventional federated learning (FL) over +heterogeneous data. Unlike conventional FL which trains a single global +consensus model, personalized FL allows different models for different clients. +However, existing personalized FL algorithms only implicitly transfer the +collaborative knowledge across the federation by embedding the knowledge into +the aggregated model or regularization. We observed that this implicit +knowledge transfer fails to maximize the potential of each client's empirical +risk toward other clients. Based on our observation, in this work, we propose +Personalized Global Federated Learning (PGFed), a novel personalized FL +framework that enables each client to personalize its own global objective by +explicitly and adaptively aggregating the empirical risks of itself and other +clients. To avoid massive (O(N^2)) communication overhead and potential privacy +leakage while achieving this, each client's risk is estimated through a +first-order approximation for other clients' adaptive risk aggregation. On top +of PGFed, we develop a momentum upgrade, dubbed PGFedMo, to more efficiently +utilize clients' empirical risks. Our extensive experiments on four datasets +under different federated settings show consistent improvements of PGFed over +previous state-of-the-art methods. The code is publicly available at +https://github.com/ljaiverson/pgfed. + +
+
+ comment: ICCV 2023 oral +
+
+
+
+
+ + ♻ ☆ GraPhSyM: Graph Physical Synthesis Model + + +
+ In this work, we introduce GraPhSyM, a Graph Attention Network (GATv2) model +for fast and accurate estimation of post-physical synthesis circuit delay and +area metrics from pre-physical synthesis circuit netlists. Once trained, +GraPhSyM provides accurate visibility of final design metrics to early EDA +stages, such as logic synthesis, without running the slow physical synthesis +flow, enabling global co-optimization across stages. Additionally, the swift +and precise feedback provided by GraPhSyM is instrumental for +machine-learning-based EDA optimization frameworks. Given a gate-level netlist +of a circuit represented as a graph, GraPhSyM utilizes graph structure, +connectivity, and electrical property features to predict the impact of +physical synthesis transformations such as buffer insertion and gate sizing. +When trained on a dataset of 6000 prefix adder designs synthesized at an +aggressive delay target, GraPhSyM can accurately predict the post-synthesis +delay (98.3%) and area (96.1%) metrics of unseen adders with a fast 0.22s +inference time. Furthermore, we illustrate the compositionality of GraPhSyM by +employing the model trained on a fixed delay target to accurately anticipate +post-synthesis metrics at a variety of unseen delay targets. Lastly, we report +promising generalization capabilities of the GraPhSyM model when it is +evaluated on circuits different from the adders it was exclusively trained on. +The results show the potential for GraPhSyM to serve as a powerful tool for +advanced optimization techniques and as an oracle for EDA machine learning +frameworks. + +
+
+ comment: Accepted at Proceedings of the 42nd International Conference on + Computer-Aided Design (ICCAD), 2023 +
+
+
+
+
+ + ♻ ☆ Copula Representations and Error Surface Projections for the Exclusive + Or Problem + + +
+ The exclusive or (xor) function is one of the simplest examples that +illustrate why nonlinear feedforward networks are superior to linear regression +for machine learning applications. We review the xor representation and +approximation problems and discuss their solutions in terms of probabilistic +logic and associative copula functions. After briefly reviewing the +specification of feedforward networks, we compare the dynamics of learned error +surfaces with different activation functions such as RELU and tanh through a +set of colorful three-dimensional charts. The copula representations extend xor +from Boolean to real values, thereby providing a convenient way to demonstrate +the concept of cross-validation on in-sample and out-sample data sets. Our +approach is pedagogical and is meant to be a machine learning prolegomenon. + +
+
+
+
+
+ + ♻ ☆ Bridging the Gap Between Target Networks and Functional Regularization + + +
+ Bootstrapping is behind much of the success of deep Reinforcement Learning. +However, learning the value function via bootstrapping often leads to unstable +training due to fast-changing target values. Target Networks are employed to +stabilize training by using an additional set of lagging parameters to estimate +the target values. Despite the popularity of Target Networks, their effect on +the optimization is still poorly understood. In this work, we show that they +act as an implicit regularizer, which can be beneficial in some cases but also +has disadvantages, such as being inflexible and potentially causing +instabilities, even when vanilla TD(0) converges. To overcome these issues, we +propose an explicit Functional Regularization alternative that is flexible and +a convex regularizer in function space, and we theoretically study its +convergence. We conduct an experimental study across a range of environments, +discount factors, and degrees of off-policyness in data collection to +investigate the effectiveness of the regularization induced by Target Networks +and Functional Regularization in terms of performance, accuracy, and stability. +Our findings emphasize that Functional Regularization can be used as a drop-in +replacement for Target Networks, resulting in performance improvements. +Furthermore, adjusting both the regularization weight and the network update +period in Functional Regularization can result in further performance +improvements compared to solely adjusting the network update period, as is +typically done with Target Networks. Our approach also enhances the ability of +networks to recover accurate $Q$-values. + +
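A minimal PyTorch sketch contrasting the two losses discussed above: bootstrapping from a lagging target network versus bootstrapping from the online network plus an explicit penalty toward a lagging snapshot's outputs (regularization in function space). The environment transitions, network sizes, and the weight `kappa` are toy assumptions, not the paper's setup.

```python
import copy
import torch
import torch.nn as nn

torch.manual_seed(0)
q = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))
q_lag = copy.deepcopy(q)               # lagging snapshot (target network / reg anchor)
opt = torch.optim.Adam(q.parameters(), lr=1e-3)
gamma, kappa = 0.99, 1.0               # kappa: functional regularization weight

def td_loss_target_net(s, a, r, s2):
    # Classic variant: bootstrap from the lagging network's values.
    with torch.no_grad():
        target = r + gamma * q_lag(s2).max(dim=1).values
    return ((q(s).gather(1, a[:, None]).squeeze(1) - target) ** 2).mean()

def td_loss_functional_reg(s, a, r, s2):
    # Bootstrap from the *online* network, but penalize deviation of its
    # outputs from the lagging snapshot's outputs (function-space penalty).
    with torch.no_grad():
        target = r + gamma * q(s2).max(dim=1).values
        anchor = q_lag(s)
    qa = q(s).gather(1, a[:, None]).squeeze(1)
    return ((qa - target) ** 2).mean() + kappa * ((q(s) - anchor) ** 2).mean()

# One illustrative update on random transitions.
s, s2 = torch.randn(64, 4), torch.randn(64, 4)
a, r = torch.randint(0, 2, (64,)), torch.randn(64)
loss = td_loss_functional_reg(s, a, r, s2)
opt.zero_grad(); loss.backward(); opt.step()
print(loss.item())
```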
+
+ comment: The first two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Natural and Robust Walking using Reinforcement Learning without + Demonstrations in High-Dimensional Musculoskeletal Models + + +
+ Humans excel at robust bipedal walking in complex natural environments. In +each step, they adequately tune the interaction of biomechanical muscle +dynamics and neuronal signals to be robust against uncertainties in ground +conditions. However, it is still not fully understood how the nervous system +resolves the musculoskeletal redundancy to solve the multi-objective control +problem considering stability, robustness, and energy efficiency. In computer +simulations, energy minimization has been shown to be a successful optimization +target, reproducing natural walking with trajectory optimization or +reflex-based control methods. However, these methods focus on particular +motions at a time and the resulting controllers are limited when compensating +for perturbations. In robotics, reinforcement learning~(RL) methods recently +achieved highly stable (and efficient) locomotion on quadruped systems, but the +generation of human-like walking with bipedal biomechanical models has required +extensive use of expert data sets. This strong reliance on demonstrations often +results in brittle policies and limits the application to new behaviors, +especially considering the potential variety of movements for high-dimensional +musculoskeletal models in 3D. Achieving natural locomotion with RL without +sacrificing its incredible robustness might pave the way for a novel approach +to studying human walking in complex natural environments. Videos: +https://sites.google.com/view/naturalwalkingrl + +
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video +data. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard, +despite the availability of advanced (neural) compression approaches. +Transmitting videos in the face of dynamic network bandwidth conditions +requires video codecs to adapt to vastly different compression strengths. Rate +control modules augment the codec's compression such that bandwidth constraints +are satisfied and video distortion is minimized. While both standard video +codecs and their rate control modules are developed to minimize video +distortion w.r.t. human quality assessment, preserving the downstream +performance of deep vision models is not considered. In this paper, we present +the first end-to-end learnable deep video codec control that considers both +bandwidth constraints and downstream vision performance, while not breaking +existing standardization. We demonstrate for two common vision tasks (semantic +segmentation and optical flow estimation) and on two different datasets that +our deep codec control better preserves downstream performance than using +2-pass average bit rate control, while meeting dynamic bandwidth constraints +and adhering to standardizations. + +
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Truncated Diffusion Probabilistic Models and Diffusion-based Adversarial + Auto-Encoders ICLR 2023 + + +
+ Employing a forward diffusion chain to gradually map the data to a noise +distribution, diffusion-based generative models learn how to generate the data +by inferring a reverse diffusion chain. However, this approach is slow and +costly because it needs many forward and reverse steps. We propose a faster and +cheaper approach that adds noise not until the data become pure random noise, +but until they reach a hidden noisy data distribution that we can confidently +learn. Then, we use fewer reverse steps to generate data by starting from this +hidden distribution that is made similar to the noisy data. We reveal that the +proposed model can be cast as an adversarial auto-encoder empowered by both the +diffusion process and a learnable implicit prior. Experimental results show +even with a significantly smaller number of reverse diffusion steps, the +proposed truncated diffusion probabilistic models can provide consistent +improvements over the non-truncated ones in terms of performance in both +unconditional and text-guided image generations. + +
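A toy sketch of the truncation idea above: the forward chain q(x_t | x_0) is stopped at an intermediate step rather than run until the data become pure noise. The noise schedule and data are illustrative, and the implicit prior/generator that is trained to match the truncated (hidden) noisy distribution is omitted.

```python
import torch

torch.manual_seed(0)
T_full, t_trunc = 1000, 300                 # full chain vs. truncated chain length
betas = torch.linspace(1e-4, 0.02, T_full)
alphas_bar = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(16, 2) * 0.1 + 1.0         # toy "data"

def q_sample(x0, t):
    """Forward diffusion q(x_t | x_0) = N(sqrt(a_bar_t) x_0, (1 - a_bar_t) I)."""
    a_bar = alphas_bar[t]
    return a_bar.sqrt() * x0 + (1 - a_bar).sqrt() * torch.randn_like(x0)

x_full = q_sample(x0, T_full - 1)   # essentially pure noise
x_trunc = q_sample(x0, t_trunc)     # still carries data structure; an implicit
                                    # prior is then trained to match this hidden
                                    # noisy distribution, so the reverse chain
                                    # only needs t_trunc steps instead of T_full.
print(x_full.std().item(), x_trunc.mean(dim=0))
```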
+
+ comment: ICLR 2023 camera-ready version +
+
+
+
+
+ + ♻ ☆ Enhancing Deep Learning Models through Tensorization: A Comprehensive + Survey and Framework + + +
+ The burgeoning growth of public domain data and the increasing complexity of +deep learning model architectures have underscored the need for more efficient +data representation and analysis techniques. This paper is motivated by the +work of Helal (2023) and aims to present a comprehensive overview of +tensorization. This transformative approach bridges the gap between the +inherently multidimensional nature of data and the simplified 2-dimensional +matrices commonly used in linear algebra-based machine learning algorithms. +This paper explores the steps involved in tensorization, multidimensional data +sources, various multiway analysis methods employed, and the benefits of these +approaches. A small example of Blind Source Separation (BSS) is presented, +comparing 2-dimensional algorithms and a multiway algorithm in Python. Results +indicate that multiway analysis is more expressive. Contrary to the intuition +of the dimensionality curse, utilising multidimensional datasets in their +native form and applying multiway analysis methods grounded in multilinear +algebra reveal a profound capacity to capture intricate interrelationships +among various dimensions while, surprisingly, reducing the number of model +parameters and accelerating processing. A survey of multiway analysis methods +and their integration with various deep neural network models is presented +using case studies in different domains. + +
+
+ comment: 30 pages, 8 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Evaluating Explanation Methods for Multivariate Time Series + Classification ALT + + +
+ Multivariate time series classification is an important computational task +arising in applications where data is recorded over time and over multiple +channels. For example, a smartwatch can record the acceleration and orientation +of a person's motion, and these signals are recorded as multivariate time +series. We can classify this data to understand and predict human movement and +various properties such as fitness levels. In many applications classification +alone is not enough; we often need to classify but also understand what the +model learns (e.g., why a prediction was given, based on what information in +the data). The main focus of this paper is on analysing and evaluating +explanation methods tailored to Multivariate Time Series Classification (MTSC). +We focus on saliency-based explanation methods that can point out the most +relevant channels and time series points for the classification decision. We +analyse two popular and accurate multivariate time series classifiers, ROCKET +and dResNet, as well as two popular explanation methods, SHAP and dCAM. We +study these methods on 3 synthetic datasets and 2 real-world datasets and +provide a quantitative and qualitative analysis of the explanations provided. +We find that flattening the multivariate datasets by concatenating the channels +works as well as using multivariate classifiers directly, and that adaptations +of SHAP for MTSC work quite well. Additionally, we find that the popular +synthetic datasets we used are not suitable for time series analysis. + +
+
+ comment: Accepted at AALTD '23 (8th International Workshop on Advanced + Analytics and Learning on Temporal Data, ECMLPKDD 2023) +
+
+
+
+
+ + ♻ ☆ Graph Fairing Convolutional Networks for Anomaly Detection + + +
+ Graph convolution is a fundamental building block for many deep neural +networks on graph-structured data. In this paper, we introduce a simple, yet +very effective graph convolutional network with skip connections for +semi-supervised anomaly detection. The proposed layerwise propagation rule of +our model is theoretically motivated by the concept of implicit fairing in +geometry processing, and comprises a graph convolution module for aggregating +information from immediate node neighbors and a skip connection module for +combining layer-wise neighborhood representations. This propagation rule is +derived from the iterative solution of the implicit fairing equation via the +Jacobi method. In addition to capturing information from distant graph nodes +through skip connections between the network's layers, our approach exploits +both the graph structure and node features for learning discriminative node +representations. These skip connections are integrated by design in our +proposed network architecture. The effectiveness of our model is demonstrated +through extensive experiments on five benchmark datasets, achieving better or +comparable anomaly detection results against strong baseline methods. We also +demonstrate through an ablation study that skip connection helps improve the +model performance. + +
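A numpy sketch of where the propagation rule above comes from, under the usual implicit-fairing formulation $(I + \lambda L)H = X$ with $L = D - A$: each Jacobi iteration aggregates neighbor features and re-injects the input, which is exactly the aggregation-plus-skip-connection structure described in the abstract. The graph, features, and $\lambda$ are toy assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, lam = 6, 3, 0.5
A = (rng.random((n, n)) < 0.4).astype(float)
A = np.triu(A, 1); A = A + A.T                 # toy undirected adjacency
D = np.diag(A.sum(axis=1))
X = rng.normal(size=(n, d))                    # node features

# Implicit fairing: solve (I + lam * L) H = X, L = D - A, via Jacobi iterations.
# Each iteration aggregates neighbor features (A @ H) and re-injects X (skip).
diag_inv = np.diag(1.0 / (1.0 + lam * np.diag(D)))
H = X.copy()
for _ in range(50):
    H = diag_inv @ (X + lam * A @ H)

# Reference: direct solve of the implicit fairing equation.
H_exact = np.linalg.solve(np.eye(n) + lam * (D - A), X)
print(np.abs(H - H_exact).max())
```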
+
+
+
+
+ + ♻ ☆ Revisiting Hidden Representations in Transfer Learning for Medical + Imaging + + +
+ While a key component to the success of deep learning is the availability of +massive amounts of training data, medical image datasets are often limited in +diversity and size. Transfer learning has the potential to bridge the gap +between related yet different domains. For medical applications, however, it +remains unclear whether it is more beneficial to pre-train on natural or +medical images. We aim to shed light on this problem by comparing +initialization on ImageNet and RadImageNet on seven medical classification +tasks. Our work includes a replication study, which yields results contrary to +previously published findings. In our experiments, ResNet50 models pre-trained +on ImageNet tend to outperform those trained on RadImageNet. To gain further +insights, we investigate the learned representations using Canonical +Correlation Analysis (CCA) and compare the predictions of the different models. +Our results indicate that, contrary to intuition, ImageNet and RadImageNet may +converge to distinct intermediate representations, which appear to diverge +further during fine-tuning. Despite these distinct representations, the +predictions of the models remain similar. Our findings show that the similarity +between networks before and after fine-tuning does not correlate with +performance gains, suggesting that the advantages of transfer learning might +not solely originate from the reuse of features in the early layers of a +convolutional neural network. + +
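A short sketch of comparing two networks' hidden representations with CCA via scikit-learn, as mentioned above; the activation matrices are random placeholders standing in for layer outputs of the ImageNet- and RadImageNet-initialized models on a shared set of inputs.

```python
import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
n_samples, d1, d2, k = 500, 64, 64, 10

# Placeholder activations from two pre-trained models on the same inputs.
acts_a = rng.normal(size=(n_samples, d1))
acts_b = 0.5 * acts_a[:, :d2] + 0.5 * rng.normal(size=(n_samples, d2))

cca = CCA(n_components=k).fit(acts_a, acts_b)
Ua, Ub = cca.transform(acts_a, acts_b)
# Mean canonical correlation as a similarity score between the representations.
corrs = [np.corrcoef(Ua[:, i], Ub[:, i])[0, 1] for i in range(k)]
print(np.mean(corrs))
```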
+
+ comment: Submitted to TMLR +
+
+
+
+
+ + ♻ ☆ Emoji Promotes Developer Participation and Issue Resolution on GitHub AAAI + + +
+ Although remote working has been increasingly adopted during the pandemic, +many are concerned about low efficiency in remote work. Missing in text-based +communication are non-verbal cues such as facial expressions and body language, +which hinders effective communication and negatively impacts work outcomes. +Prevalent on social media platforms, emojis, as alternative non-verbal cues, +are gaining popularity in virtual workspaces as well. In this paper, we study +how emoji usage influences developer participation and issue resolution in +virtual workspaces. To this end, we collect GitHub issues for a one-year period +and apply causal inference techniques to measure the causal effect of emojis on +the outcome of issues, controlling for confounders such as issue content, +repository, and author information. We find that emojis can significantly +reduce the resolution time of issues and attract more user participation. We +also compare the heterogeneous effect on different types of issues. These +findings deepen our understanding of developer communities, and they provide +design implications on how to facilitate interactions and broaden developer +participation. + +
+
+ comment: 12 pages, 5 figures. To be published in the 18th International AAAI + Conference on Web and Social Media (ICWSM 2024) +
+
+
+
+
+ + ♻ ☆ LDMRes-Net: Enabling Efficient Medical Image Segmentation on IoT and + Edge Platforms + + +
+ In this study, we propose LDMRes-Net, a lightweight dual-multiscale residual +block-based computational neural network tailored for medical image +segmentation on IoT and edge platforms. Conventional U-Net-based models face +challenges in meeting the speed and efficiency demands of real-time clinical +applications, such as disease monitoring, radiation therapy, and image-guided +surgery. LDMRes-Net overcomes these limitations with its remarkably low number +of learnable parameters (0.072M), making it highly suitable for +resource-constrained devices. The model's key innovation lies in its dual +multi-residual block architecture, which enables the extraction of refined +features on multiple scales, enhancing overall segmentation performance. To +further optimize efficiency, the number of filters is carefully selected to +prevent overlap, reduce training time, and improve computational efficiency. +The study includes comprehensive evaluations, focusing on segmentation of the +retinal image of vessels and hard exudates crucial for the diagnosis and +treatment of ophthalmology. The results demonstrate the robustness, +generalizability, and high segmentation accuracy of LDMRes-Net, positioning it +as an efficient tool for accurate and rapid medical image segmentation in +diverse clinical applications, particularly on IoT and edge platforms. Such +advances hold significant promise for improving healthcare outcomes and +enabling real-time medical image analysis in resource-limited settings. + +
+
+
+
+
+ + ♻ ☆ A Majority Invariant Approach to Patch Robustness Certification for Deep + Learning Models + + +
+ Patch robustness certification ensures no patch within a given bound on a +sample can manipulate a deep learning model to predict a different label. +However, existing techniques cannot certify samples that cannot meet their +strict bars at the classifier or patch region levels. This paper proposes +MajorCert. MajorCert firstly finds all possible label sets manipulatable by the +same patch region on the same sample across the underlying classifiers, then +enumerates their combinations element-wise, and finally checks whether the +majority invariant of all these combinations is intact to certify samples. + +
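A compact sketch of the certification check as described in the abstract: enumerate element-wise combinations of the per-classifier manipulable label sets and verify that the clean prediction stays a strict majority in every combination. The label sets below are placeholders, and the sketch omits how those sets are derived for a given patch region.

```python
from collections import Counter
from itertools import product

def certify(manipulable_label_sets, clean_prediction):
    """manipulable_label_sets[i] = labels classifier i could output for this
    sample under any patch in the given region; the sample is certified if
    every element-wise combination keeps clean_prediction as the strict majority."""
    for combo in product(*manipulable_label_sets):
        counts = Counter(combo)
        top_label, top_count = counts.most_common(1)[0]
        # Require a strict, unambiguous majority equal to the clean prediction.
        if top_label != clean_prediction or list(counts.values()).count(top_count) > 1:
            return False
    return True

# Toy example with 5 underlying classifiers (placeholder label sets).
sets = [{0}, {0}, {0, 1}, {0}, {1}]
print(certify(sets, clean_prediction=0))   # True: label 0 wins in every combination
```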
+
+ comment: 5 pages, 2 figures, accepted for inclusion in the ASE 2023 NIER track +
+
+
+
+
+ + ♻ ☆ VLUCI: Variational Learning of Unobserved Confounders for Counterfactual + Inference + + +
+ Causal inference plays a vital role in diverse domains like epidemiology, +healthcare, and economics. De-confounding and counterfactual prediction in +observational data has emerged as a prominent concern in causal inference +research. While existing models tackle observed confounders, the presence of +unobserved confounders remains a significant challenge, distorting causal +inference and impacting counterfactual outcome accuracy. To address this, we +propose a novel variational learning model of unobserved confounders for +counterfactual inference (VLUCI), which generates the posterior distribution of +unobserved confounders. VLUCI relaxes the unconfoundedness assumption often +overlooked by most causal inference methods. By disentangling observed and +unobserved confounders, VLUCI constructs a doubly variational inference model +to approximate the distribution of unobserved confounders, which are used for +inferring more accurate counterfactual outcomes. Extensive experiments on +synthetic and semi-synthetic datasets demonstrate VLUCI's superior performance +in inferring unobserved confounders. It is compatible with state-of-the-art +counterfactual inference models, significantly improving inference accuracy at +both group and individual levels. Additionally, VLUCI provides confidence +intervals for counterfactual outcomes, aiding decision-making in risk-sensitive +domains. We further clarify the considerations when applying VLUCI to cases +where unobserved confounders don't strictly conform to our model assumptions +using the public IHDP dataset as an example, highlighting the practical +advantages of VLUCI. + +
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ AtmoRep: A stochastic model of atmosphere dynamics using large scale + representation learning + + +
+ The atmosphere affects humans in a multitude of ways, from loss of life due +to adverse weather effects to long-term social and economic impacts on +societies. Computer simulations of atmospheric dynamics are, therefore, of +great importance for the well-being of our and future generations. Here, we +propose AtmoRep, a novel, task-independent stochastic computer model of +atmospheric dynamics that can provide skillful results for a wide range of +applications. AtmoRep uses large-scale representation learning from artificial +intelligence to determine a general description of the highly complex, +stochastic dynamics of the atmosphere from the best available estimate of the +system's historical trajectory as constrained by observations. This is enabled +by a novel self-supervised learning objective and a unique ensemble that +samples from the stochastic model with a variability informed by the one in the +historical record. The task-independent nature of AtmoRep enables skillful +results for a diverse set of applications without specifically training for +them and we demonstrate this for nowcasting, temporal interpolation, model +correction, and counterfactuals. We also show that AtmoRep can be improved with +additional data, for example radar observations, and that it can be extended to +tasks such as downscaling. Our work establishes that large-scale neural +networks can provide skillful, task-independent models of atmospheric dynamics. +With this, they provide a novel means to make the large record of atmospheric +observations accessible for applications and for scientific inquiry, +complementing existing simulations based on first principles. + +
+
+
+
+
+ + ♻ ☆ Learning to Taste: A Multimodal Wine Dataset + + +
+ We present WineSensed, a large multimodal wine dataset for studying the +relations between visual perception, language, and flavor. The dataset +encompasses 897k images of wine labels and 824k reviews of wines curated from +the Vivino platform. It has over 350k unique vintages, annotated with year, +region, rating, alcohol percentage, price, and grape composition. We obtained +fine-grained flavor annotations on a subset by conducting a wine-tasting +experiment with 256 participants who were asked to rank wines based on their +similarity in flavor, resulting in more than 5k pairwise flavor distances. We +propose a low-dimensional concept embedding algorithm that combines human +experience with automatic machine similarity kernels. We demonstrate that this +shared concept embedding space improves upon separate embedding spaces for +coarse flavor classification (alcohol percentage, country, grape, price, +rating) and aligns with the intricate human perception of flavor. + +
+
+ comment: Corrected a typo in author name +
+
+
+
+
+ + ♻ ☆ ReFit: A Framework for Refinement of Weakly Supervised Semantic + Segmentation using Object Border Fitting for Medical Images + + +
+ Weakly Supervised Semantic Segmentation (WSSS) relying only on image-level +supervision is a promising approach to deal with the need for Segmentation +networks, especially for generating a large number of pixel-wise masks in a +given dataset. However, most state-of-the-art image-level WSSS techniques lack +an understanding of the geometric features embedded in the images since the +network cannot derive any object boundary information from just image-level +labels. We define a boundary here as the line separating an object and its +background, or two different objects. To address this drawback, we are +proposing our novel ReFit framework, which deploys state-of-the-art class +activation maps combined with various post-processing techniques in order to +achieve fine-grained higher-accuracy segmentation masks. To achieve this, we +investigate a state-of-the-art unsupervised segmentation network that can be +used to construct a boundary map, which enables ReFit to predict object +locations with sharper boundaries. By applying our method to WSSS predictions, +we achieved up to 10% improvement over the current state-of-the-art WSSS +methods for medical imaging. The framework is open-source, to ensure that our +results are reproducible, and accessible online at +https://github.com/bharathprabakaran/ReFit. + +
+
+ comment: Accepted for Publication at the International Symposium on Visual + Computing (ISVC), October 2023, Lake Tahoe, NV, USA +
+
+
+
+
+ + ♻ ☆ Blended-NeRF: Zero-Shot Object Generation and Blending in Existing + Neural Radiance Fields + + +
+ Editing a local region or a specific object in a 3D scene represented by a +NeRF or consistently blending a new realistic object into the scene is +challenging, mainly due to the implicit nature of the scene representation. We +present Blended-NeRF, a robust and flexible framework for editing a specific +region of interest in an existing NeRF scene, based on text prompts, along with +a 3D ROI box. Our method leverages a pretrained language-image model to steer +the synthesis towards a user-provided text prompt, along with a 3D MLP model +initialized on an existing NeRF scene to generate the object and blend it into +a specified region in the original scene. We allow local editing by localizing +a 3D ROI box in the input scene, and blend the content synthesized inside the +ROI with the existing scene using a novel volumetric blending technique. To +obtain natural looking and view-consistent results, we leverage existing and +new geometric priors and 3D augmentations for improving the visual fidelity of +the final result. We test our framework both qualitatively and quantitatively +on a variety of real 3D scenes and text prompts, demonstrating realistic +multi-view consistent results with much flexibility and diversity compared to +the baselines. Finally, we show the applicability of our framework for several +3D editing applications, including adding new objects to a scene, +removing/replacing/altering existing objects, and texture conversion. + +
+
+ comment: 16 pages, 14 figures. Project page: + https://www.vision.huji.ac.il/blended-nerf/ +
+
+
+
+
+ + ♻ ☆ DiFaReli: Diffusion Face Relighting ICCV 2023 + + +
+ We present a novel approach to single-view face relighting in the wild. +Handling non-diffuse effects, such as global illumination or cast shadows, has +long been a challenge in face relighting. Prior work often assumes Lambertian +surfaces, simplified lighting models or involves estimating 3D shape, albedo, +or a shadow map. This estimation, however, is error-prone and requires many +training examples with lighting ground truth to generalize well. Our work +bypasses the need for accurate estimation of intrinsic components and can be +trained solely on 2D images without any light stage data, multi-view images, or +lighting ground truth. Our key idea is to leverage a conditional diffusion +implicit model (DDIM) for decoding a disentangled light encoding along with +other encodings related to 3D shape and facial identity inferred from +off-the-shelf estimators. We also propose a novel conditioning technique that +eases the modeling of the complex interaction between light and geometry by +using a rendered shading reference to spatially modulate the DDIM. We achieve +state-of-the-art performance on standard benchmark Multi-PIE and can +photorealistically relight in-the-wild images. Please visit our page: +https://diffusion-face-relighting.github.io + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ M3FGM:a node masking and multi-granularity message passing-based + federated graph model for spatial-temporal data prediction ICONIP2023 + + +
+ Researchers are addressing the challenges of spatial-temporal prediction by +combining Federated Learning (FL) and graph models, subject to privacy and +security constraints. In order to make better use of the power of graph models, +some studies also combine split learning (SL). However, several issues remain +unaddressed: 1) Clients might not be able to access the server during the +inference phase; 2) The graph of clients, designed manually in the server +model, may not reveal the proper relationships between clients. This paper +proposes a new GNN-oriented split federated learning method, named node Masking +and Multi-granularity Message passing-based Federated Graph Model (M$^3$FGM), +to address the above issues. For the first issue, the server model of M$^3$FGM +employs a MaskNode layer to simulate the case of clients being offline. We also +redesign the decoder of the client model using a dual-sub-decoder structure so +that each client model can use its local data to predict independently when +offline. As for the second issue, a new GNN layer named the Multi-Granularity +Message Passing (MGMP) layer enables each client node to perceive global and +local information. We conducted extensive experiments in two different +scenarios on two real traffic datasets. Results show that M$^3$FGM outperforms +the baselines and variant models, achieving the best results in both datasets +and scenarios. + +
+
+ comment: Accepted by ICONIP2023 +
+
+
+
+
+ + ♻ ☆ Proper Learning of Linear Dynamical Systems as a Non-Commutative + Polynomial Optimisation Problem + + +
+ There has been much recent progress in forecasting the next observation of a +linear dynamical system (LDS), which is known as improper learning, as well as +in the estimation of its system matrices, which is known as proper learning of +an LDS. We present an approach to proper learning of an LDS which, in spite of +the non-convexity of the problem, guarantees global convergence of numerical +solutions to a least-squares estimator. We present promising computational +results. + +
+
+ comment: 14 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Blink: Link Local Differential Privacy in Graph Neural Networks via + Bayesian Estimation CCS 2023 + + +
+ Graph neural networks (GNNs) have gained an increasing amount of popularity +due to their superior capability in learning node embeddings for various graph +inference tasks, but training them can raise privacy concerns. To address this, +we propose using link local differential privacy over decentralized nodes, +enabling collaboration with an untrusted server to train GNNs without revealing +the existence of any link. Our approach spends the privacy budget separately on +links and degrees of the graph for the server to better denoise the graph +topology using Bayesian estimation, alleviating the negative impact of LDP on +the accuracy of the trained GNNs. We bound the mean absolute error of the +inferred link probabilities against the ground truth graph topology. We then +propose two variants of our LDP mechanism complementing each other in different +privacy settings, one of which estimates fewer links under lower privacy +budgets to avoid false positive link estimates when the uncertainty is high, +while the other utilizes more information and performs better given relatively +higher privacy budgets. Furthermore, we propose a hybrid variant that combines +both strategies and is able to perform better across different privacy budgets. +Extensive experiments show that our approach outperforms existing methods in +terms of accuracy under varying privacy budgets. + +
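A simplified sketch of link local differential privacy via randomized response on adjacency bits, followed by a server-side debiasing of link probabilities. This illustrates the privacy/denoising trade-off but is not the paper's full Blink mechanism (no privacy-budget split between links and degrees, and no degree-informed Bayesian prior); the graph and epsilon are toy assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
n, eps = 50, 2.0
A = (rng.random((n, n)) < 0.1).astype(float)
A = np.triu(A, 1); A = A + A.T                     # toy ground-truth adjacency

# Randomized response: each client flips each of its adjacency bits
# with probability 1 / (1 + e^eps) before reporting it to the server.
p_keep = np.exp(eps) / (1.0 + np.exp(eps))
flips = rng.random((n, n)) > p_keep
A_noisy = np.abs(A - flips.astype(float))

# Server-side debiasing of link probabilities from the noisy reports:
# E[report] = a * (2*p_keep - 1) + (1 - p_keep), solved for a.
A_est = (A_noisy - (1.0 - p_keep)) / (2.0 * p_keep - 1.0)
A_est = np.clip(A_est, 0.0, 1.0)
print(np.abs(A_est - A).mean())                    # mean absolute error vs. ground truth
```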
+
+ comment: 17 pages, accepted by ACM CCS 2023 as a conference paper +
+
+
+
+
+ + ♻ ☆ Dynamic Causal Graph Convolutional Network for Traffic Prediction + + +
+ Modeling complex spatiotemporal dependencies in correlated traffic series is +essential for traffic prediction. While recent works have shown improved +prediction performance by using neural networks to extract spatiotemporal +correlations, their effectiveness depends on the quality of the graph +structures used to represent the spatial topology of the traffic network. In +this work, we propose a novel approach for traffic prediction that embeds a +time-varying dynamic Bayesian network to capture the fine spatiotemporal +topology of traffic data. We then use graph convolutional networks to generate +traffic forecasts. To enable our method to efficiently model nonlinear traffic +propagation patterns, we develop a deep learning-based module as a +hyper-network to generate stepwise dynamic causal graphs. Our experimental +results on a real traffic dataset demonstrate the superior prediction +performance of the proposed method. The code is available at +https://github.com/MonBG/DCGCN. + +
+
+ comment: Accepted to IEEE CASE 2023; Peter Luh Best Memorial Award for Young + Researcher (Finalist) +
+
+
+
+
+ + ♻ ☆ RAHNet: Retrieval Augmented Hybrid Network for Long-tailed Graph + Classification + + +
+ Graph classification is a crucial task in many real-world multimedia +applications, where graphs can represent various multimedia data types such as +images, videos, and social networks. Previous efforts have applied graph neural +networks (GNNs) in balanced situations where the class distribution is +balanced. However, real-world data typically exhibit long-tailed class +distributions, resulting in a bias towards the head classes when using GNNs and +limited generalization ability over the tail classes. Recent approaches mainly +focus on re-balancing different classes during model training, which fails to +explicitly introduce new knowledge and sacrifices the performance of the head +classes. To address these drawbacks, we propose a novel framework called +Retrieval Augmented Hybrid Network (RAHNet) to jointly learn a robust feature +extractor and an unbiased classifier in a decoupled manner. In the feature +extractor training stage, we develop a graph retrieval module to search for +relevant graphs that directly enrich the intra-class diversity for the tail +classes. Moreover, we innovatively optimize a category-centered supervised +contrastive loss to obtain discriminative representations, which is more +suitable for long-tailed scenarios. In the classifier fine-tuning stage, we +balance the classifier weights with two weight regularization techniques, i.e., +Max-norm and weight decay. Experiments on various popular benchmarks verify the +superiority of the proposed method against state-of-the-art approaches. + +
+
+ comment: Accepted by the ACM International Conference on Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Pure Exploration in Bandits with Linear Constraints + + +
+ We address the problem of identifying the optimal policy with a fixed +confidence level in a multi-armed bandit setup, when \emph{the arms are subject +to linear constraints}. Unlike the standard best-arm identification problem +which is well studied, the optimal policy in this case may not be deterministic +and could mix between several arms. This changes the geometry of the problem +which we characterize via an information-theoretic lower bound. We introduce +two asymptotically optimal algorithms for this setting, one based on the +Track-and-Stop method and the other based on a game-theoretic approach. Both +these algorithms try to track an optimal allocation based on the lower bound +and computed by a weighted projection onto the boundary of a normal cone. +Finally, we provide empirical results that validate our bounds and visualize +how constraints change the hardness of the problem. + +
+
+ comment: EWRL16 +
+
+
+
+
+ + ♻ ☆ Accelerating Numerical Solvers for Large-Scale Simulation of Dynamical + System via NeurVec + + +
+ The large-scale simulation of dynamical systems is critical in numerous +scientific and engineering disciplines. However, traditional numerical solvers +are limited by the choice of step sizes when estimating integration, resulting +in a trade-off between accuracy and computational efficiency. To address this +challenge, we introduce a deep learning-based corrector called Neural Vector +(NeurVec), which can compensate for integration errors and enable larger time +step sizes in simulations. Our extensive experiments on a variety of complex +dynamical system benchmarks demonstrate that NeurVec exhibits remarkable +generalization capability on a continuous phase space, even when trained using +limited and discrete data. NeurVec significantly accelerates traditional +solvers, achieving speeds tens to hundreds of times faster while maintaining +high levels of accuracy and stability. Moreover, NeurVec's simple-yet-effective +design, combined with its ease of implementation, has the potential to +establish a new paradigm for fast-solving differential equations based on deep +learning. + +
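A minimal sketch of the corrector idea described above: a coarse explicit-Euler step augmented with a learned correction term that is trained to match a finely integrated reference. The ODE, network, step sizes, and training pairs are toy assumptions, not NeurVec's architecture or benchmarks.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

def f(x):                       # toy dynamics: harmonic oscillator dx/dt = [v, -x]
    return torch.stack([x[..., 1], -x[..., 0]], dim=-1)

corrector = nn.Sequential(nn.Linear(2, 64), nn.Tanh(), nn.Linear(64, 2))
opt = torch.optim.Adam(corrector.parameters(), lr=1e-3)
dt_coarse, substeps = 0.2, 50   # coarse step vs. fine reference integration

def fine_step(x):               # many small Euler steps as the reference target
    h = dt_coarse / substeps
    for _ in range(substeps):
        x = x + h * f(x)
    return x

for it in range(3000):          # train the corrector to absorb the integration error
    x = torch.randn(256, 2)
    target = fine_step(x)
    pred = x + dt_coarse * f(x) + corrector(x)   # coarse Euler + learned correction
    loss = ((pred - target) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()

print(loss.item())
```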
+
+ comment: Accepted by Scientific Report +
+
+
+
+
+ + ♻ ☆ Adversarial Likelihood Estimation With One-Way Flows + + +
+ Generative Adversarial Networks (GANs) can produce high-quality samples, but +do not provide an estimate of the probability density around the samples. +However, it has been noted that maximizing the log-likelihood within an +energy-based setting can lead to an adversarial framework where the +discriminator provides unnormalized density (often called energy). We further +develop this perspective, incorporate importance sampling, and show that 1) +Wasserstein GAN performs a biased estimate of the partition function, and we +propose instead to use an unbiased estimator; and 2) when optimizing for +likelihood, one must maximize generator entropy. This is hypothesized to +provide a better mode coverage. Different from previous works, we explicitly +compute the density of the generated samples. This is the key enabler to +designing an unbiased estimator of the partition function and computation of +the generator entropy term. The generator density is obtained via a new type of +flow network, called one-way flow network, that is less constrained in terms of +architecture, as it does not require a tractable inverse function. Our +experimental results show that our method converges faster, produces comparable +sample quality to GANs with similar architecture, successfully avoids +over-fitting to commonly used datasets and produces smooth low-dimensional +latent representations of the training data. + +
+
+
+
+
+ + ♻ ☆ RatGPT: Turning online LLMs into Proxies for Malware Attacks + + +
+ The evolution of Generative AI and the capabilities of the newly released +Large Language Models (LLMs) open new opportunities in software engineering. +However, they also lead to new challenges in cybersecurity. Recently, +researchers have shown the possibilities of using LLMs such as ChatGPT to +generate malicious content that can directly be exploited or guide +inexperienced hackers to weaponize tools and code. These studies covered +scenarios that still require the attacker to be in the middle of the loop. In +this study, we leverage openly available plugins and use an LLM as proxy +between the attacker and the victim. We deliver a proof-of-concept where +ChatGPT is used for the dissemination of malicious software while evading +detection, alongside establishing the communication to a command and control +(C2) server to receive commands to interact with a victim's system. Finally, we +present the general approach as well as essential elements in order to stay +undetected and make the attack a success. This proof-of-concept highlights +significant cybersecurity issues with openly available plugins and LLMs, which +require the development of security guidelines, controls, and mitigation +strategies. + +
+
+
+
+
+ + ♻ ☆ Unlearnable Examples Give a False Sense of Security: Piercing through + Unexploitable Data with Learnable Examples + + +
+ Safeguarding data from unauthorized exploitation is vital for privacy and
+security, especially given the recent surge of research on security breaches
+such as adversarial/membership attacks. To this end, \textit{unlearnable
+examples} (UEs) have been recently proposed as a compelling protection, by
+adding imperceptible perturbations to data so that models trained on them
+cannot accurately classify samples from the original clean distribution.
+Unfortunately, we find UEs provide a false sense of security, because they
+cannot stop unauthorized users from utilizing other unprotected data to remove
+the protection, by turning unlearnable data into learnable again. Motivated by
+this observation, we formally define a new threat by introducing
+\textit{learnable unauthorized examples} (LEs), which are UEs with their
+protection removed. The core of this approach is a novel purification process
+that projects UEs onto the manifold of LEs. This is realized by a new
+joint-conditional diffusion model which denoises UEs conditioned on the pixel
+and perceptual similarity between UEs and LEs. Extensive experiments
+demonstrate that LE delivers state-of-the-art countering performance against
+both supervised and unsupervised UEs in various scenarios, making it the first
+generalizable countermeasure to UEs across supervised and unsupervised
+learning. Our code is available at \url{https://github.com/jiangw-0/LE_JCDP}.
+
+
+ comment: Accepted in MM 2023 +
+
+
+
+
+ + ♻ ☆ Mixup-Augmented Meta-Learning for Sample-Efficient Fine-Tuning of + Protein Simulators + + +
+ Molecular dynamics simulations have emerged as a fundamental instrument for
+studying biomolecules. At the same time, it is desirable to perform simulations
+of a collection of particles under various conditions in which the molecules
+can fluctuate. In this paper, we explore and adapt the soft prompt-based
+learning method to molecular dynamics tasks. Our model can remarkably
+generalize to unseen and out-of-distribution scenarios with limited training
+data. While our work focuses on temperature as a test case, the versatility of
+our approach allows for efficient simulation under any continuous dynamic
+condition, such as pressure and volume. Our framework has two stages: 1)
+pre-training with a data-mixing technique that augments molecular structure
+data and temperature prompts and applies a curriculum learning method by
+smoothly increasing their mixing ratio; 2) a meta-learning-based fine-tuning
+framework that improves the sample efficiency of the fine-tuning process and
+gives soft prompt-tuning better initialization points. Comprehensive
+experiments reveal that our framework excels in accuracy for in-domain data and
+demonstrates strong generalization capabilities for unseen and
+out-of-distribution samples.
+
+
+
+
+
+ + ♻ ☆ Take-A-Photo: 3D-to-2D Generative Pre-training of Point Cloud Models ICCV 2023 + + +
+ With the overwhelming trend of mask image modeling led by MAE, generative +pre-training has shown a remarkable potential to boost the performance of +fundamental models in 2D vision. However, in 3D vision, the over-reliance on +Transformer-based backbones and the unordered nature of point clouds have +restricted the further development of generative pre-training. In this paper, +we propose a novel 3D-to-2D generative pre-training method that is adaptable to +any point cloud model. We propose to generate view images from different +instructed poses via the cross-attention mechanism as the pre-training scheme. +Generating view images has more precise supervision than its point cloud +counterpart, thus assisting 3D backbones to have a finer comprehension of the +geometrical structure and stereoscopic relations of the point cloud. +Experimental results have proved the superiority of our proposed 3D-to-2D +generative pre-training over previous pre-training methods. Our method is also +effective in boosting the performance of architecture-oriented approaches, +achieving state-of-the-art performance when fine-tuning on ScanObjectNN +classification and ShapeNetPart segmentation tasks. Code is available at +https://github.com/wangzy22/TAP. + +
+
+ comment: Accepted to ICCV 2023, project page: https://tap.ivg-research.xyz +
+
+
+
+
+ + ♻ ☆ On Root Cause Localization and Anomaly Mitigation through Causal + Inference + + +
+ Due to a wide spectrum of applications in the real world, such as security,
+financial surveillance, and health risk, various deep anomaly detection models
+have been proposed and achieved state-of-the-art performance. However, beyond
+detection accuracy, practitioners would further like to know what causes an
+abnormal outcome and how to fix it. In this work, we propose RootCLAM, which
+aims to achieve Root Cause Localization and Anomaly Mitigation from a causal
+perspective. Specifically, we model anomalies as the result of external
+interventions on the normal causal mechanism and aim to locate the abnormal
+features affected by such interventions as root causes. After that, we further
+propose an anomaly mitigation approach that recommends mitigation actions on
+abnormal features to revert the abnormal outcomes such that the counterfactuals
+guided by the causal mechanism are normal. Experiments on three datasets show
+that our approach can locate the root causes and further flip the abnormal
+labels.
+
+
+
+
+
+ + ♻ ☆ A comparison of rational and neural network based approximations + + +
+ Rational and neural network based approximations are efficient tools in
+modern approximation. These approaches are able to produce accurate
+approximations to nonsmooth and non-Lipschitz functions, including multivariate
+domain functions. In this paper we compare the efficiency of function
+approximation using rational approximation, neural networks and their
+combinations. It was found that rational approximation is superior to neural
+network based approaches with the same number of decision variables. Our
+numerical experiments demonstrate the efficiency of rational approximation,
+even when the number of approximation parameters (that is, the dimension of the
+corresponding optimisation problems) is small. Another important contribution
+of this paper lies in the improvement of rational approximation algorithms.
+Namely, the optimisation based algorithms for rational approximation can be
+adjusted in such a way that the condition number of the constraint matrices is
+controlled. This simple adjustment enables us to work with high dimension
+optimisation problems and improve the design of the neural network. The main
+strength of neural networks is in their ability to handle models with a large
+number of variables: complex models are decomposed into several simpler
+optimisation problems. Therefore, a large number of decision variables is in
+the nature of neural networks.
+
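+
+ As a toy illustration of rational approximation of a nonsmooth function, the
+sketch below fits p(x)/q(x) to |x| via a linearized least-squares problem. This
+naive formulation does not control the condition number of the constraint
+matrices the way the paper's adjusted algorithms do, and a pole-free
+denominator is not guaranteed.
+<pre><code class="language-python">
+import numpy as np
+
+def fit_rational(x, y, deg_p=3, deg_q=3):
+    # Solve p(x) - y * (q(x) - 1) ~= y in the least-squares sense,
+    # with the constant term of q fixed to 1 for identifiability.
+    P = np.vander(x, deg_p + 1, increasing=True)         # 1, x, ..., x^deg_p
+    Q = np.vander(x, deg_q + 1, increasing=True)[:, 1:]  # x, ..., x^deg_q
+    A = np.hstack([P, -y[:, None] * Q])
+    coeffs, *_ = np.linalg.lstsq(A, y, rcond=None)
+    p = coeffs[:deg_p + 1]
+    q = np.concatenate([[1.0], coeffs[deg_p + 1:]])
+    return p, q
+
+x = np.linspace(-1.0, 1.0, 200)
+y = np.abs(x)                                            # nonsmooth target
+p, q = fit_rational(x, y)
+approx = np.polyval(p[::-1], x) / np.polyval(q[::-1], x)
+print("max error:", float(np.max(np.abs(approx - y))))
+</code></pre>
+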
+
+ comment: 39 pages +
+
+
+
+
+ + ♻ ☆ Impression-Informed Multi-Behavior Recommender System: A Hierarchical + Graph Attention Approach + + +
+ While recommender systems have significantly benefited from implicit +feedback, they have often missed the nuances of multi-behavior interactions +between users and items. Historically, these systems either amalgamated all +behaviors, such as \textit{impression} (formerly \textit{view}), +\textit{add-to-cart}, and \textit{buy}, under a singular 'interaction' label, +or prioritized only the target behavior, often the \textit{buy} action, +discarding valuable auxiliary signals. Although recent advancements tried +addressing this simplification, they primarily gravitated towards optimizing +the target behavior alone, battling with data scarcity. Additionally, they +tended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these +gaps, we introduce the \textbf{H}ierarchical \textbf{M}ulti-behavior +\textbf{G}raph Attention \textbf{N}etwork (HMGN). This pioneering framework +leverages attention mechanisms to discern information from both inter and +intra-behaviors while employing a multi-task Hierarchical Bayesian Personalized +Ranking (HBPR) for optimization. Recognizing the need for scalability, our +approach integrates a specialized multi-behavior sub-graph sampling technique. +Moreover, the adaptability of HMGN allows for the seamless inclusion of +knowledge metadata and time-series data. Empirical results attest to our +model's prowess, registering a notable performance boost of up to 64\% in +NDCG@100 metrics over conventional graph neural network methods. + +
+
+
+
+
+ + ♻ ☆ Acoustic-to-articulatory inversion for dysarthric speech: Are + pre-trained self-supervised representations favorable? ICASSP 2024 + + +
+ Acoustic-to-articulatory inversion (AAI) involves mapping from the acoustic
+space to the articulatory space. Signal-processing features like MFCCs have
+been widely used for the AAI task. For subjects with dysarthric speech, AAI is
+challenging because of an imprecise and indistinct pronunciation. In this work,
+we perform AAI for dysarthric speech using representations from pre-trained
+self-supervised learning (SSL) models. We demonstrate the impact of different
+pre-trained features on this challenging AAI task, under low-resource
+conditions. In addition, we also condition x-vectors to the extracted SSL
+features to train a BLSTM network. In the seen case, we experiment with three
+AAI training schemes (subject-specific, pooled, and fine-tuned). The results,
+consistent across training schemes, reveal that DeCoAR, in the fine-tuned
+scheme, achieves a relative improvement of the Pearson Correlation Coefficient
+(CC) by ${\sim}$1.81\% and ${\sim}$4.56\% for healthy controls and patients,
+respectively, over MFCCs. In the unseen case, we observe similar average trends
+for different SSL features. Overall, SSL networks like wav2vec, APC, and
+DeCoAR, which are trained with feature reconstruction or future timestep
+prediction tasks, perform well in predicting dysarthric articulatory
+trajectories.
+
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Achieving Occam's Razor: Deep Learning for Optimal Model Reduction + + +
+ All fields of science depend on mathematical models. Occam's razor refers to +the principle that good models should exclude parameters beyond those minimally +required to describe the systems they represent. This is because redundancy can +lead to incorrect estimates of model parameters from data, and thus inaccurate +or ambiguous conclusions. Here, we show how deep learning can be powerfully +leveraged to address Occam's razor. FixFit, our new method, uses a feedforward +deep neural network with a bottleneck layer to characterize and predict the +behavior of a given model from its input parameters. FixFit has three major +benefits. First, it provides a metric to quantify the original model's degree +of complexity. Second, it allows for the unique fitting of data. Third, it +provides an unbiased way to discriminate between experimental hypotheses that +add value versus those that do not. In two use cases, we demonstrate the broad +applicability of this method across scientific domains. To validate the method +using a known system, we apply FixFit to recover known composite parameters for +the Kepler orbit model. To illustrate how the method can be applied to less +well-established fields, we use it to identify parameters for a multi-scale +brain model and reduce the search space for viable candidate mechanisms. + +
+
+
+
+
+ + ♻ ☆ Limitation of Characterizing Implicit Regularization by Data-independent + Functions + + +
+ In recent years, understanding the implicit regularization of neural networks
+(NNs) has become a central task in deep learning theory. However, implicit
+regularization is itself neither completely defined nor well understood. In
+this work, we attempt to mathematically define and study implicit
+regularization. Importantly, we explore the limitations of a common approach to
+characterizing implicit regularization using data-independent functions. We
+propose two dynamical mechanisms, i.e., Two-point and One-point Overlapping
+mechanisms, based on which we provide two recipes for producing classes of
+one-hidden-neuron NNs that provably cannot be fully characterized by any single
+type of, or even all, data-independent functions. In line with previous works,
+our results further emphasize the profound data dependency of implicit
+regularization in general, inspiring us to study in detail the data dependency
+of NN implicit regularization in the future.
+
+
+ comment: Revised the structure of paper and added results about implicit + regularization in training two-layer network or even more general "activation + function-related" models +
+
+
+
+
+ + ♻ ☆ Towards provably efficient quantum algorithms for large-scale + machine-learning models + + +
+ Large machine learning models are revolutionary technologies of artificial +intelligence whose bottlenecks include huge computational expenses, power, and +time used both in the pre-training and fine-tuning process. In this work, we +show that fault-tolerant quantum computing could possibly provide provably +efficient resolutions for generic (stochastic) gradient descent algorithms, +scaling as $\mathcal{O}(T^2 \times \text{polylog}(n))$, where $n$ is the size +of the models and $T$ is the number of iterations in the training, as long as +the models are both sufficiently dissipative and sparse, with small learning +rates. Based on earlier efficient quantum algorithms for dissipative +differential equations, we find and prove that similar algorithms work for +(stochastic) gradient descent, the primary algorithm for machine learning. In +practice, we benchmark instances of large machine learning models from 7 +million to 103 million parameters. We find that, in the context of sparse +training, a quantum enhancement is possible at the early stage of learning +after model pruning, motivating a sparse parameter download and re-upload +scheme. Our work shows solidly that fault-tolerant quantum algorithms could +potentially contribute to most state-of-the-art, large-scale machine-learning +problems. + +
+
+ comment: 6+39 pages, 3+10 figures, substantial detail added +
+
+
+
+
+ + ♻ ☆ Internet Explorer: Targeted Representation Learning on the Open Web ICML 2023 + + +
+ Modern vision models typically rely on fine-tuning general-purpose models +pre-trained on large, static datasets. These general-purpose models only +capture the knowledge within their pre-training datasets, which are tiny, +out-of-date snapshots of the Internet -- where billions of images are uploaded +each day. We suggest an alternate approach: rather than hoping our static +datasets transfer to our desired tasks after large-scale pre-training, we +propose dynamically utilizing the Internet to quickly train a small-scale model +that does extremely well on the task at hand. Our approach, called Internet +Explorer, explores the web in a self-supervised manner to progressively find +relevant examples that improve performance on a desired target dataset. It +cycles between searching for images on the Internet with text queries, +self-supervised training on downloaded images, determining which images were +useful, and prioritizing what to search for next. We evaluate Internet Explorer +across several datasets and show that it outperforms or matches CLIP oracle +performance by using just a single GPU desktop to actively query the Internet +for 30--40 hours. Results, visualizations, and videos at +https://internet-explorer-ssl.github.io/ + +
+
+ comment: In ICML 2023. Website at https://internet-explorer-ssl.github.io/ +
+
+
+
+
+ + ♻ ☆ Domain Adaptation for Efficiently Fine-tuning Vision Transformer with + Encrypted Images + + +
+ In recent years, deep neural networks (DNNs) trained with transformed data +have been applied to various applications such as privacy-preserving learning, +access control, and adversarial defenses. However, the use of transformed data +decreases the performance of models. Accordingly, in this paper, we propose a +novel method for fine-tuning models with transformed images under the use of +the vision transformer (ViT). The proposed domain adaptation method does not +cause the accuracy degradation of models, and it is carried out on the basis of +the embedding structure of ViT. In experiments, we confirmed that the proposed +method prevents accuracy degradation even when using encrypted images with the +CIFAR-10 and CIFAR-100 datasets. + +
+
+ comment: Accepted by APSIPA 2023 +
+
+
+
+
+ + ♻ ☆ Diffusion-EDFs: Bi-equivariant Denoising Generative Modeling on SE(3) + for Visual Robotic Manipulation + + +
+ Recent studies have verified that equivariant methods can significantly +improve the data efficiency, generalizability, and robustness in robot +learning. Meanwhile, denoising diffusion-based generative modeling has recently +gained significant attention as a promising approach for robotic manipulation +learning from demonstrations with stochastic behaviors. In this paper, we +present Diffusion-EDFs, a novel approach that incorporates spatial +roto-translation equivariance, i.e., SE(3)-equivariance to diffusion generative +modeling. By integrating SE(3)-equivariance into our model architectures, we +demonstrate that our proposed method exhibits remarkable data efficiency, +requiring only 5 to 10 task demonstrations for effective end-to-end training. +Furthermore, our approach showcases superior generalizability compared to +previous diffusion-based manipulation methods. + +
+
+ comment: 27 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Efficient anti-symmetrization of a neural network layer by taming the + sign problem + + +
+ Explicit antisymmetrization of a neural network is a potential candidate for
+a universal function approximator for generic antisymmetric functions, which
+are ubiquitous in quantum physics. However, this procedure is a priori
+factorially costly to implement, making it impractical for large numbers of
+particles. The strategy also suffers from a sign problem. Namely, due to
+near-exact cancellation of positive and negative contributions, the magnitude
+of the antisymmetrized function may be significantly smaller than before
+anti-symmetrization. We show that the anti-symmetric projection of a two-layer
+neural network can be evaluated efficiently, opening the door to using a
+generic antisymmetric layer as a building block in anti-symmetric neural
+network Ansatzes. This approximation is effective when the sign problem is
+controlled, and we show that this property depends crucially on the choice of
+activation function under standard Xavier/He initialization methods. As a
+consequence, using a smooth activation function requires re-scaling of the
+neural network weights compared to standard initializations.
+
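+
+ For reference, the brute-force antisymmetric projection that the paper makes
+tractable for two-layer networks looks like the sketch below; it costs n!
+function evaluations and exhibits exactly the cancellation (sign problem)
+described above. The 1/n! normalization is a convention choice.
+<pre><code class="language-python">
+import itertools
+import math
+import numpy as np
+
+def perm_sign(perm):
+    # Parity via inversion counting (O(n^2), fine for the small n shown here).
+    inv = sum(1 for i in range(len(perm))
+                for j in range(i + 1, len(perm)) if perm[i] > perm[j])
+    return -1.0 if inv % 2 else 1.0
+
+def antisymmetrize(f, X):
+    # X has shape (n_particles, d); f maps such an array to a scalar.
+    n = X.shape[0]
+    total = 0.0
+    for perm in itertools.permutations(range(n)):
+        total += perm_sign(perm) * f(X[list(perm)])
+    return total / math.factorial(n)
+</code></pre>
+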
+
+ comment: To appear in JML, ISSN: 2790-2048(e), 2790-203X(p) +
+
+
+
+
+ + ♻ ☆ Deep Network Approximation: Beyond ReLU to Diverse Activation Functions + + +
+ This paper explores the expressive power of deep neural networks for a +diverse range of activation functions. An activation function set $\mathscr{A}$ +is defined to encompass the majority of commonly used activation functions, +such as $\mathtt{ReLU}$, $\mathtt{LeakyReLU}$, $\mathtt{ReLU}^2$, +$\mathtt{ELU}$, $\mathtt{SELU}$, $\mathtt{Softplus}$, $\mathtt{GELU}$, +$\mathtt{SiLU}$, $\mathtt{Swish}$, $\mathtt{Mish}$, $\mathtt{Sigmoid}$, +$\mathtt{Tanh}$, $\mathtt{Arctan}$, $\mathtt{Softsign}$, $\mathtt{dSiLU}$, and +$\mathtt{SRS}$. We demonstrate that for any activation function $\varrho\in +\mathscr{A}$, a $\mathtt{ReLU}$ network of width $N$ and depth $L$ can be +approximated to arbitrary precision by a $\varrho$-activated network of width +$4N$ and depth $2L$ on any bounded set. This finding enables the extension of +most approximation results achieved with $\mathtt{ReLU}$ networks to a wide +variety of other activation functions, at the cost of slightly larger +constants. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
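+
+ The attention-free, zero-initialized gating described above can be sketched
+as follows; the linear projection standing in for the bind network and the tanh
+gate are illustrative assumptions, not the released LLaMA-Adapter code.
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+class ZeroInitGatedInjection(nn.Module):
+    def __init__(self, vis_dim, llm_dim):
+        super().__init__()
+        self.proj = nn.Linear(vis_dim, llm_dim)   # stand-in for the bind network
+        self.gate = nn.Parameter(torch.zeros(1))  # zero-initialized gating factor
+
+    def forward(self, tokens, vis_feat):
+        # tokens: (batch, seq, llm_dim); vis_feat: (batch, vis_dim)
+        injected = self.proj(vis_feat).unsqueeze(1)       # (batch, 1, llm_dim)
+        # The gate starts at zero, so training begins from the unmodified LLM.
+        return tokens + torch.tanh(self.gate) * injected  # broadcast over sequence
+</code></pre>
+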
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ☆ ArtHDR-Net: Perceptually Realistic and Accurate HDR Content Creation SC + + +
+ High Dynamic Range (HDR) content creation has become an important topic for +modern media and entertainment sectors, gaming and Augmented/Virtual Reality +industries. Many methods have been proposed to recreate the HDR counterparts of +input Low Dynamic Range (LDR) images/videos given a single exposure or +multi-exposure LDRs. The state-of-the-art methods focus primarily on the +preservation of the reconstruction's structural similarity and the pixel-wise +accuracy. However, these conventional approaches do not emphasize preserving +the artistic intent of the images in terms of human visual perception, which is +an essential element in media, entertainment and gaming. In this paper, we +attempt to study and fill this gap. We propose an architecture called +ArtHDR-Net based on a Convolutional Neural Network that uses multi-exposed LDR +features as input. Experimental results show that ArtHDR-Net can achieve +state-of-the-art performance in terms of the HDR-VDP-2 score (i.e., mean +opinion score index) while reaching competitive performance in terms of PSNR +and SSIM. + +
+
+ comment: Accepted in Asia Pacific Signal and Information Processing + Association Annual Summit and Conference (APSIPA ASC), Taipei, Taiwan +
+
+
+
+
+ + ☆ T2IW: Joint Text to Image & Watermark Generation + + +
+ Recent developments in text-conditioned image generative models have
+revolutionized the production of realistic results. Unfortunately, this has
+also led to an increase in privacy violations and the spread of false
+information, which creates the need for traceability, privacy protection, and
+other security measures. However, existing text-to-image paradigms lack the
+technical capabilities to link traceable messages with image generation. In
+this study, we introduce a novel task for the joint generation of text to image
+and watermark (T2IW). This T2IW scheme ensures minimal damage to image quality
+when generating a compound image by forcing the semantic feature and the
+watermark signal to be compatible at the pixel level. Additionally, by
+utilizing principles from Shannon information theory and non-cooperative game
+theory, we are able to separate the revealed image and the revealed watermark
+from the compound image. Furthermore, we strengthen the watermark robustness of
+our approach by subjecting the compound image to various post-processing
+attacks, with minimal pixel distortion observed in the revealed watermark.
+Extensive experiments have demonstrated remarkable achievements in image
+quality, watermark invisibility, and watermark robustness, supported by our
+proposed set of evaluation metrics.
+
+
+
+
+
+ + ☆ Reuse and Diffuse: Iterative Denoising for Text-to-Video Generation + + +
+ Inspired by the remarkable success of Latent Diffusion Models (LDMs) for +image synthesis, we study LDM for text-to-video generation, which is a +formidable challenge due to the computational and memory constraints during +both model training and inference. A single LDM is usually only capable of +generating a very limited number of video frames. Some existing works focus on +separate prediction models for generating more video frames, which suffer from +additional training cost and frame-level jittering, however. In this paper, we +propose a framework called "Reuse and Diffuse" dubbed $\textit{VidRD}$ to +produce more frames following the frames already generated by an LDM. +Conditioned on an initial video clip with a small number of frames, additional +frames are iteratively generated by reusing the original latent features and +following the previous diffusion process. Besides, for the autoencoder used for +translation between pixel space and latent space, we inject temporal layers +into its decoder and fine-tune these layers for higher temporal consistency. We +also propose a set of strategies for composing video-text data that involve +diverse content from multiple existing datasets including video datasets for +action recognition and image-text datasets. Extensive experiments show that our +method achieves good results in both quantitative and qualitative evaluations. +Our project page is available +$\href{https://anonymous0x233.github.io/ReuseAndDiffuse/}{here}$. + +
+
+
+
+
+ + ☆ Zero-Shot Scene Graph Generation via Triplet Calibration and Reduction + + +
+ Scene Graph Generation (SGG) plays a pivotal role in downstream +vision-language tasks. Existing SGG methods typically suffer from poor +compositional generalizations on unseen triplets. They are generally trained on +incompletely annotated scene graphs that contain dominant triplets and tend to +bias toward these seen triplets during inference. To address this issue, we +propose a Triplet Calibration and Reduction (T-CAR) framework in this paper. In +our framework, a triplet calibration loss is first presented to regularize the +representations of diverse triplets and to simultaneously excavate the unseen +triplets in incompletely annotated training scene graphs. Moreover, the unseen +space of scene graphs is usually several times larger than the seen space since +it contains a huge number of unrealistic compositions. Thus, we propose an +unseen space reduction loss to shift the attention of excavation to reasonable +unseen compositions to facilitate the model training. Finally, we propose a +contextual encoder to improve the compositional generalizations of unseen +triplets by explicitly modeling the relative spatial relations between subjects +and objects. Extensive experiments show that our approach achieves consistent +improvements for zero-shot SGG over state-of-the-art methods. The code is +available at https://github.com/jkli1998/T-CAR. + +
+
+ comment: Accept in TOMM 2023 +
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video
+data. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,
+despite the availability of advanced (neural) compression approaches.
+Transmitting videos in the face of dynamic network bandwidth conditions
+requires video codecs to adapt to vastly different compression strengths. Rate
+control modules augment the codec's compression such that bandwidth constraints
+are satisfied and video distortion is minimized. While both standard video
+codecs and their rate control modules are developed to minimize video
+distortion with respect to human quality assessment, preserving the downstream
+performance of deep vision models is not considered. In this paper, we present
+the first end-to-end learnable deep video codec control that considers both
+bandwidth constraints and downstream vision performance, while not breaking
+existing standardization. We demonstrate for two common vision tasks (semantic
+segmentation and optical flow estimation) and on two different datasets that
+our deep codec control better preserves downstream performance than using
+2-pass average bit rate control while meeting dynamic bandwidth constraints and
+adhering to standardizations.
+
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ VideoGen: A Reference-Guided Latent Diffusion Approach for High + Definition Text-to-Video Generation + + +
+ In this paper, we present VideoGen, a text-to-video generation approach,
+which can generate a high-definition video with high frame fidelity and strong
+temporal consistency using reference-guided latent diffusion. We leverage an
+off-the-shelf text-to-image generation model, e.g., Stable Diffusion, to
+generate an image with high content quality from the text prompt, as a
+reference image to guide video generation. Then, we introduce an efficient
+cascaded latent diffusion module conditioned on both the reference image and
+the text prompt, for generating latent video representations, followed by a
+flow-based temporal upsampling step to improve the temporal resolution.
+Finally, we map latent video representations into a high-definition video
+through an enhanced video decoder. During training, we use the first frame of a
+ground-truth video as the reference image for training the cascaded latent
+diffusion module. The main characteristics of our approach include: the
+reference image generated by the text-to-image model improves the visual
+fidelity; using it as the condition makes the diffusion model focus more on
+learning the video dynamics; and the video decoder is trained over unlabeled
+video data, thus benefiting from high-quality easily-available videos. VideoGen
+sets a new state-of-the-art in text-to-video generation in terms of both
+qualitative and quantitative evaluation. See
+\url{https://videogen.github.io/VideoGen/} for more samples.
+
+
+ comment: 8pages, 8figures, project page: https://videogen.github.io/VideoGen/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 45 + +
+
+
+ + ☆ Gender-specific Machine Translation with Large Language Models + + +
+ Decoder-only Large Language Models (LLMs) have demonstrated potential in +machine translation (MT), albeit with performance slightly lagging behind +traditional encoder-decoder Neural Machine Translation (NMT) systems. However, +LLMs offer a unique advantage: the ability to control the properties of the +output through prompts. In this study, we harness this flexibility to explore +LLaMa's capability to produce gender-specific translations for languages with +grammatical gender. Our results indicate that LLaMa can generate +gender-specific translations with competitive accuracy and gender bias +mitigation when compared to NLLB, a state-of-the-art multilingual NMT system. +Furthermore, our experiments reveal that LLaMa's translations are robust, +showing significant performance drops when evaluated against opposite-gender +references in gender-ambiguous datasets but maintaining consistency in less +ambiguous contexts. This research provides insights into the potential and +challenges of using LLMs for gender-specific translations and highlights the +importance of in-context learning to elicit new tasks in LLMs. + +
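+
+ A minimal sketch of the kind of zero-shot prompt such a study relies on is
+shown below; the wording is illustrative, not the exact prompt used in the
+paper.
+<pre><code class="language-python">
+def gender_specific_prompt(sentence, target_language, speaker_gender):
+    # Build an instruction that asks the model to resolve grammatical gender
+    # according to the stated speaker gender.
+    return (
+        f"Translate the following sentence into {target_language}. "
+        f"The speaker is {speaker_gender}; use grammatical gender accordingly.\n"
+        f"Sentence: {sentence}\nTranslation:"
+    )
+
+print(gender_specific_prompt("I am tired.", "Spanish", "female"))
+</code></pre>
+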
+
+
+
+
+ + ☆ GPT-InvestAR: Enhancing Stock Investment Strategies through Annual + Report Analysis with Large Language Models + + +
+ Annual Reports of publicly listed companies contain vital information about
+their financial health which can help assess the potential impact on the stock
+price of the firm. These reports are comprehensive in nature, going up to, and
+sometimes exceeding, 100 pages. Analysing these reports is cumbersome even for
+a single firm, let alone the whole universe of firms that exist. Over the
+years, financial experts have become proficient in extracting valuable
+information from these documents relatively quickly. However, this requires
+years of practice and experience. This paper aims to simplify the process of
+assessing Annual Reports of all the firms by leveraging the capabilities of
+Large Language Models (LLMs). The insights generated by the LLM are compiled
+into a quant-style dataset and augmented with historical stock price data. A
+Machine Learning model is then trained with the LLM outputs as features. The
+walk-forward test results show promising outperformance with respect to S&P 500
+returns. This paper intends to provide a framework for future work in this
+direction. To facilitate this, the code has been released as open source.
+
+
+
+
+
+ + ☆ J-Guard: Journalism Guided Adversarially Robust Detection of + AI-generated News AACL 2023 + + +
+ The rapid proliferation of AI-generated text online is profoundly reshaping +the information landscape. Among various types of AI-generated text, +AI-generated news presents a significant threat as it can be a prominent source +of misinformation online. While several recent efforts have focused on +detecting AI-generated text in general, these methods require enhanced +reliability, given concerns about their vulnerability to simple adversarial +attacks. Furthermore, due to the eccentricities of news writing, applying these +detection methods for AI-generated news can produce false positives, +potentially damaging the reputation of news organizations. To address these +challenges, we leverage the expertise of an interdisciplinary team to develop a +framework, J-Guard, capable of steering existing supervised AI text detectors +for detecting AI-generated news while boosting adversarial robustness. By +incorporating stylistic cues inspired by the unique journalistic attributes, +J-Guard effectively distinguishes between real-world journalism and +AI-generated news articles. Our experiments on news articles generated by a +vast array of AI models, including ChatGPT (GPT3.5), demonstrate the +effectiveness of J-Guard in enhancing detection capabilities while maintaining +an average performance decrease of as low as 7% when faced with adversarial +attacks. + +
+
+ comment: This Paper is Accepted to The 13th International Joint Conference on + Natural Language Processing and the 3rd Conference of the Asia-Pacific + Chapter of the Association for Computational Linguistics (IJCNLP-AACL 2023) +
+
+
+
+
+ + ☆ Everyone Deserves A Reward: Learning Customized Human Preferences + + +
+ Reward models (RMs) are crucial in aligning large language models (LLMs) with
+human preferences to improve interaction quality. However, the real world is
+pluralistic, which leads to diversified human preferences based on different
+religions, politics, cultures, etc. Moreover, each individual can have their
+own unique preferences on various topics. Neglecting the diversity of human
+preferences, current LLM training processes only use a general reward model,
+which is unsatisfactory for customized or personalized application scenarios.
+To explore customized preference learning, we collect a domain-specific
+preference (DSP) dataset, which gathers preferred responses to each given query
+from four practical domains. Besides, from the perspective of data efficiency,
+we propose a three-stage customized RM learning scheme, whose effectiveness is
+empirically verified on both general preference datasets and our DSP set.
+Furthermore, we test multiple training and data strategies on the three
+learning stages, and find several ways to better preserve the general
+preferring ability while training the customized RMs, especially general
+preference enrichment and customized preference imitation learning. The DSP
+dataset and code are available at https://github.com/Linear95/DSP.
+
+
+
+
+
+ + ☆ Knowledge Solver: Teaching LLMs to Search for Domain Knowledge from + Knowledge Graphs + + +
+ Large language models (LLMs), such as ChatGPT and GPT-4, are versatile and
+can solve different tasks due to their emergent ability and generalizability.
+However, LLMs sometimes lack domain-specific knowledge to perform tasks, which
+can also cause hallucination during inference. In some previous works,
+additional modules like graph neural networks (GNNs) are trained on retrieved
+knowledge from external knowledge bases, aiming to mitigate the problem of
+lacking domain-specific knowledge. However, incorporating additional modules:
+1) requires retraining the additional modules when encountering novel domains;
+2) becomes a bottleneck since LLMs' strong abilities are not fully utilized for
+retrieval. In this paper, we propose a paradigm, termed Knowledge Solver (KSL),
+to teach LLMs to search for essential knowledge from external knowledge bases
+by harnessing their own strong generalizability. Specifically, we design a
+simple yet effective prompt to transform retrieval into a multi-hop decision
+sequence, which equips LLMs with the ability to search for knowledge in a
+zero-shot manner. Additionally, KSL is able to provide complete retrieval paths
+and therefore increases the explainability of LLMs' reasoning processes. We
+conduct experiments on three datasets: CommonsenseQA, OpenbookQA, and
+MedQA-USMLE, and find that our approach improves LLM baseline performance by a
+relatively large margin.
+
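+
+ The multi-hop decision sequence can be pictured as the loop below, where the
+LLM is repeatedly prompted to pick one knowledge-graph edge to follow; the llm
+and kg interfaces and the prompt wording are placeholders, not the paper's
+exact prompt.
+<pre><code class="language-python">
+def knowledge_search(llm, kg, question, entity, max_hops=3):
+    # Returns the retrieval path, which also serves as an explanation.
+    path = [entity]
+    for _ in range(max_hops):
+        neighbors = kg.neighbors(entity)  # e.g. [(relation, entity), ...]
+        options = "\n".join(f"{i}: {r} -> {e}" for i, (r, e) in enumerate(neighbors))
+        prompt = (f"Question: {question}\nCurrent entity: {entity}\n"
+                  f"Pick the most helpful edge to follow (answer with its number):\n{options}")
+        choice = int(llm(prompt).strip().split()[0])
+        relation, entity = neighbors[choice]
+        path.append((relation, entity))
+    return path
+</code></pre>
+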
+
+
+
+
+ + ☆ ContrastWSD: Enhancing Metaphor Detection with Word Sense Disambiguation + Following the Metaphor Identification Procedure + + +
+ This paper presents ContrastWSD, a RoBERTa-based metaphor detection model +that integrates the Metaphor Identification Procedure (MIP) and Word Sense +Disambiguation (WSD) to extract and contrast the contextual meaning with the +basic meaning of a word to determine whether it is used metaphorically in a +sentence. By utilizing the word senses derived from a WSD model, our model +enhances the metaphor detection process and outperforms other methods that rely +solely on contextual embeddings or integrate only the basic definitions and +other external knowledge. We evaluate our approach on various benchmark +datasets and compare it with strong baselines, indicating the effectiveness in +advancing metaphor detection. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ A Multimodal Analysis of Influencer Content on Twitter AACL 2023 + + +
+ Influencer marketing involves a wide range of strategies in which brands +collaborate with popular content creators (i.e., influencers) to leverage their +reach, trust, and impact on their audience to promote and endorse products or +services. Because followers of influencers are more likely to buy a product +after receiving an authentic product endorsement rather than an explicit direct +product promotion, the line between personal opinions and commercial content +promotion is frequently blurred. This makes automatic detection of regulatory +compliance breaches related to influencer advertising (e.g., misleading +advertising or hidden sponsorships) particularly difficult. In this work, we +(1) introduce a new Twitter (now X) dataset consisting of 15,998 influencer +posts mapped into commercial and non-commercial categories for assisting in the +automatic detection of commercial influencer content; (2) experiment with an +extensive set of predictive models that combine text and visual information +showing that our proposed cross-attention approach outperforms state-of-the-art +multimodal models; and (3) conduct a thorough analysis of strengths and +limitations of our models. We show that multimodal modeling is useful for +identifying commercial posts, reducing the amount of false positives, and +capturing relevant context that aids in the discovery of undisclosed commercial +posts. + +
+
+ comment: Accepted at AACL 2023 +
+
+
+
+
+ + ☆ Persona-aware Generative Model for Code-mixed Language + + +
+ Code-mixing and script-mixing are prevalent across online social networks and +multilingual societies. However, a user's preference toward code-mixing depends +on the socioeconomic status, demographics of the user, and the local context, +which existing generative models mostly ignore while generating code-mixed +texts. In this work, we make a pioneering attempt to develop a persona-aware +generative model to generate texts resembling real-life code-mixed texts of +individuals. We propose a Persona-aware Generative Model for Code-mixed +Generation, PARADOX, a novel Transformer-based encoder-decoder model that +encodes an utterance conditioned on a user's persona and generates code-mixed +texts without monolingual reference data. We propose an alignment module that +re-calibrates the generated sequence to resemble real-life code-mixed texts. +PARADOX generates code-mixed texts that are semantically more meaningful and +linguistically more valid. To evaluate the personification capabilities of +PARADOX, we propose four new metrics -- CM BLEU, CM Rouge-1, CM Rouge-L and CM +KS. On average, PARADOX achieves 1.6 points better CM BLEU, 47% better +perplexity and 32% better semantic coherence than the non-persona-based +counterparts. + +
+
+ comment: 4 tables, 4 figures +
+
+
+
+
+ + ☆ Leave no Place Behind: Improved Geolocation in Humanitarian Documents + + +
+ Geographical location is a crucial element of humanitarian response,
+outlining vulnerable populations, ongoing events, and available resources.
+Latest developments in Natural Language Processing may help in extracting vital
+information from the deluge of reports and documents produced by the
+humanitarian sector. However, the performance and biases of existing
+state-of-the-art information extraction tools are unknown. In this work, we
+develop annotated resources to fine-tune the popular Named Entity Recognition
+(NER) tools Spacy and roBERTa to perform geotagging of humanitarian texts. We
+then propose a geocoding method, FeatureRank, which links the candidate
+locations to the GeoNames database. We find that the humanitarian-domain data
+not only improves the performance of the classifiers (up to F1 = 0.92) but also
+alleviates some of the bias of the existing tools, which erroneously favor
+locations in Western countries. Thus, we conclude that more resources from
+non-Western documents are necessary to ensure that off-the-shelf NER systems
+are suitable for deployment in the humanitarian sector.
+
+
+
+
+
+ + ☆ On the Challenges of Building Datasets for Hate Speech Detection + + +
+ Detection of hate speech has been formulated as a standalone application of +NLP and different approaches have been adopted for identifying the target +groups, obtaining raw data, defining the labeling process, choosing the +detection algorithm, and evaluating the performance in the desired setting. +However, unlike other downstream tasks, hate speech suffers from the lack of +large-sized, carefully curated, generalizable datasets owing to the highly +subjective nature of the task. In this paper, we first analyze the issues +surrounding hate speech detection through a data-centric lens. We then outline +a holistic framework to encapsulate the data creation pipeline across seven +broad dimensions by taking the specific example of hate speech towards sexual +minorities. We posit that practitioners would benefit from following this +framework as a form of best practice when creating hate speech datasets in the +future. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ ViCGCN: Graph Convolutional Network with Contextualized Language Models + for Social Media Mining in Vietnamese + + +
+ Social media processing is a fundamental task in natural language processing
+with numerous applications. As Vietnamese social media and information science
+have grown rapidly, the necessity of information-based mining on Vietnamese
+social media has become crucial. However, state-of-the-art research faces
+several significant drawbacks, including imbalanced and noisy data on social
+media platforms. Imbalanced and noisy data are two essential issues that need
+to be addressed in Vietnamese social media texts. Graph Convolutional Networks
+can address these problems in text classification on social media by taking
+advantage of the graph structure of the data. This study presents a novel
+approach based on a contextualized language model (PhoBERT) and a graph-based
+method (Graph Convolutional Networks). In particular, the proposed approach,
+ViCGCN, jointly exploits the power of contextualized embeddings and the ability
+of Graph Convolutional Networks (GCNs) to capture more syntactic and semantic
+dependencies, addressing those drawbacks. Extensive experiments on various
+Vietnamese benchmark datasets were conducted to verify our approach. The
+results show that applying GCN to BERTology models as the final layer
+significantly improves performance. Moreover, the experiments demonstrate that
+ViCGCN outperforms 13 powerful baseline models, including BERTology models,
+fused BERTology and GCN models, other baselines, and SOTA on three benchmark
+social media datasets. Our proposed ViCGCN approach demonstrates a significant
+improvement of up to 6.21%, 4.61%, and 2.63% over the best contextualized
+language models, including multilingual and monolingual ones, on three
+benchmark datasets, UIT-VSMEC, UIT-ViCTSD, and UIT-VSFC, respectively.
+Additionally, our integrated model ViCGCN achieves the best performance
+compared to other BERTology models integrated with GCN.
+
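+
+ A minimal sketch of placing a graph-convolution layer on top of contextual
+embeddings, in the spirit of the approach above, is shown below; the single GCN
+layer and the precomputed normalized adjacency matrix are simplifying
+assumptions.
+<pre><code class="language-python">
+import torch
+import torch.nn as nn
+
+class GCNOverBert(nn.Module):
+    def __init__(self, bert_dim, hidden_dim, num_classes):
+        super().__init__()
+        self.gcn = nn.Linear(bert_dim, hidden_dim)        # GCN weight matrix
+        self.classifier = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, bert_embeddings, a_hat):
+        # bert_embeddings: (num_nodes, bert_dim), e.g. PhoBERT sentence vectors
+        # a_hat: normalized adjacency matrix, (num_nodes, num_nodes)
+        h = torch.relu(a_hat @ self.gcn(bert_embeddings))  # one graph convolution
+        return self.classifier(h)
+</code></pre>
+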
+
+
+
+
+ + ☆ A deep Natural Language Inference predictor without language-specific + training data + + +
+ In this paper we present an NLP technique to tackle the problem of natural
+language inference (NLI) between pairs of sentences in a target language of
+choice without a language-specific training dataset. We exploit a generic
+translation dataset, manually translated, along with two instances of the same
+pre-trained model - the first to generate sentence embeddings for the source
+language, and the second fine-tuned over the target language to mimic the
+first. This technique is known as Knowledge Distillation. The model has been
+evaluated over the machine-translated Stanford NLI test dataset, the
+machine-translated Multi-Genre NLI test dataset, and the manually translated
+RTE3-ITA test dataset. We also test the proposed architecture over different
+tasks to empirically demonstrate the generality of the NLI task. The model has
+been evaluated over the native Italian ABSITA dataset, on the tasks of
+Sentiment Analysis, Aspect-Based Sentiment Analysis, and Topic Recognition. We
+emphasise the generality and exploitability of the Knowledge Distillation
+technique that outperforms other methodologies based on machine translation,
+even though it was not directly trained on the data it was tested over.
+
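+
+ The distillation step can be sketched as below: the frozen source-language
+encoder supplies target embeddings for the source side of a translation pair,
+and the target-language student is trained to reproduce them; the encoder
+interfaces are placeholders.
+<pre><code class="language-python">
+import torch
+import torch.nn.functional as F
+
+def distillation_step(teacher, student, src_batch, tgt_batch, optimizer):
+    with torch.no_grad():
+        target_emb = teacher(src_batch)   # frozen source-language encoder
+    student_emb = student(tgt_batch)      # trainable target-language encoder
+    loss = F.mse_loss(student_emb, target_emb)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+</code></pre>
+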
+
+ comment: Conference: ICIAP2023 +
+
+
+
+
+ + ☆ Aligning Large Language Models for Clinical Tasks + + +
+ Large Language Models (LLMs) have demonstrated remarkable adaptability, +showcasing their capacity to excel in tasks for which they were not explicitly +trained. However, despite their impressive natural language processing (NLP) +capabilities, effective alignment of LLMs remains a crucial challenge when +deploying them for specific clinical applications. The ability to generate +responses with factually accurate content and to engage in non-trivial +reasoning steps are crucial for the LLMs to be eligible for applications in +clinical medicine. Employing a combination of techniques including +instruction-tuning and in-prompt strategies like few-shot and chain of thought +prompting has significantly enhanced the performance of LLMs. Our proposed +alignment strategy for medical question-answering, known as +'expand-guess-refine', offers a parameter and data-efficient solution. A +preliminary analysis of this method demonstrated outstanding performance, +achieving a score of 70.63% on a subset of questions sourced from the USMLE +dataset. + +
+
+ comment: 10 papers, 3 figures +
+
+
+
+
+ + ☆ Promoting Open-domain Dialogue Generation through Learning Pattern + Information between Contexts and Responses + + +
+ Recently, utilizing deep neural networks to build open-domain dialogue models
+has become a hot topic. However, the responses generated by these models suffer
+from many problems: they are often not contextualized and tend to be generic
+and uninformative, seriously damaging the user's experience. Therefore, many
+studies try to introduce more information into dialogue models to make the
+generated responses more vivid and informative. Unlike them, this paper
+improves the quality of generated responses by learning the implicit pattern
+information between contexts and responses in the training samples. In this
+paper, we first build an open-domain dialogue model based on a pre-trained
+language model (i.e., GPT-2). Then, an improved scheduled sampling method is
+proposed for pre-trained models, with which the responses can be used to guide
+response generation in the training phase while avoiding the exposure bias
+problem. More importantly, we design a response-aware mechanism for mining the
+implicit pattern information between contexts and responses so that the
+generated replies are more diverse and closer to human replies. Finally, we
+evaluate the proposed model (RAD) on the Persona-Chat and DailyDialog datasets;
+the experimental results show that our model outperforms the baselines on most
+automatic and manual metrics.
+
+
+
+
+
+ + ☆ Agent-based simulation of pedestrians' earthquake evacuation; + application to Beirut, Lebanon + + +
+ Most seismic risk assessment methods focus on estimating the damages to the
+built environment and the consequent socioeconomic losses without fully taking
+into account the social aspect of risk. Yet, human behaviour is a key element
+in predicting the human impact of an earthquake; therefore, it is important to
+include it in quantitative risk assessment studies. In this study, an
+interdisciplinary approach simulating pedestrians' evacuation during
+earthquakes at the city scale is developed using an agent-based model. The
+model integrates the seismic hazard, the physical vulnerability as well as
+individuals' behaviours and mobility. The simulator is applied to the case of
+Beirut, Lebanon. Lebanon is at the heart of the Levant fault system that has
+generated several Mw>7 earthquakes, the latest being in 1759. It is one of the
+countries with the highest seismic risk in the Mediterranean region. This is
+due to the high seismic vulnerability of the buildings, owing to the absence of
+mandatory seismic regulation until 2012, the high level of urbanization, and
+the lack of adequate spatial planning and risk prevention policies. Beirut, as
+the main residential, economic and institutional hub of Lebanon, is densely
+populated. To accommodate the growing need for urban development, constructions
+have taken over almost all of the green areas of the city; squares and gardens
+are disappearing to give way to skyscrapers. However, open spaces are safe
+places to shelter, away from debris, and therefore play an essential role in
+earthquake evacuation. Despite the massive urbanization, there are a few open
+spaces, but locked gates and other types of anthropogenic barriers often limit
+access to them. To simulate this complex context, pedestrians' evacuation
+simulations are run in a highly realistic spatial environment implemented in
+GAMA [1]. Previous data concerning soil and buildings in Beirut [2, 3] are
+complemented by new geographic data extracted from high-resolution Pleiades
+satellite images. The seismic loading is defined as a peak ground acceleration
+of 0.3g, as stated in Lebanese seismic regulations. Building damages are
+estimated using an artificial neural network trained to predict the mean damage
+[4] based on the seismic loading as well as the soil and building vibrational
+properties [5]. Moreover, the quantity and the footprint of the generated
+debris around each building are also estimated and included in the model. We
+simulate how topography, buildings, debris, and access to open spaces affect
+individuals' mobility. Two city configurations are implemented: 1. open spaces
+are accessible without any barriers; 2. access to some open spaces is blocked.
+The first simulation results show that while 52% of the population is able to
+reach an open space within 5 minutes after an earthquake, this number is
+reduced to 39% when one of the open spaces is locked. These results show that
+the presence of accessible open spaces in a city and their proximity to
+residential buildings are crucial factors for ensuring people's safety when an
+earthquake occurs.
+
+
+
+
+
+ + ☆ Norm Tweaking: High-performance Low-bit Quantization of Large Language + Models + + +
+ As the size of large language models (LLMs) continues to grow, model +compression without sacrificing accuracy has become a crucial challenge for +deployment. While some quantization methods, such as GPTQ, have made progress +in achieving acceptable 4-bit weight-only quantization, attempts at lower bit +quantization often result in severe performance degradation. In this paper, we +introduce a technique called norm tweaking, which can be used as a plugin in +current PTQ methods to achieve high precision while being cost-efficient. Our +approach is inspired by the observation that rectifying the quantized +activation distribution to match its float counterpart can readily restore +accuracy for LLMs. To achieve this, we carefully design a tweaking strategy +that includes calibration data generation and channel-wise distance constraint +to update the weights of normalization layers for better generalization. We +conduct extensive experiments on various datasets using several open-sourced +LLMs. Our method demonstrates significant improvements in both weight-only +quantization and joint quantization of weights and activations, surpassing +existing PTQ methods. On GLM-130B and OPT-66B, our method even achieves the +same level of accuracy at 2-bit quantization as their float ones. Our simple +and effective approach makes it more practical for real-world applications. + +
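+
+ A heavily simplified sketch of the idea reads as follows: only the
+normalization layer's affine parameters are updated so that the quantized
+block's activations move toward the float block's on calibration data. The
+plain MSE objective and the block interfaces are assumptions, not the paper's
+exact channel-wise distance constraint or calibration-data generation.
+<pre><code class="language-python">
+import torch
+
+def norm_tweak(float_block, quant_block, norm_layer, calib_batches, lr=1e-4, steps=10):
+    # Train only the norm layer's weight and bias; everything else stays frozen.
+    opt = torch.optim.Adam([norm_layer.weight, norm_layer.bias], lr=lr)
+    for _ in range(steps):
+        for x in calib_batches:
+            with torch.no_grad():
+                ref = float_block(x)              # float activation distribution
+            out = quant_block(norm_layer(x))      # quantized path through tweaked norm
+            loss = torch.mean((out - ref) ** 2)
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+    return norm_layer
+</code></pre>
+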
+
+
+
+
+ + ☆ GRASS: Unified Generation Model for Speech Semantic Understanding + + +
+ This paper explores the instruction fine-tuning technique for speech semantic +understanding by introducing a unified end-to-end (E2E) framework that +generates semantic labels conditioned on a task-related prompt for audio data. +We pre-train the model using large and diverse data, where instruction-speech +pairs are constructed via a text-to-speech (TTS) system. Extensive experiments +demonstrate that our proposed model significantly outperforms state-of-the-art +(SOTA) models after fine-tuning downstream tasks. Furthermore, the proposed +model achieves competitive performance in zero-shot and few-shot scenarios. To +facilitate future work on instruction fine-tuning for speech-to-semantic tasks, +we release our instruction dataset and code. + +
+
+
+
+
+ + ☆ Improving Code Generation by Dynamic Temperature Sampling + + +
+ Recently, Large Language Models (LLMs) have shown impressive results in code
+generation. However, existing decoding strategies are designed for Natural
+Language (NL) generation, overlooking the differences between NL and
+programming languages (PL). Due to this oversight, a better decoding strategy
+for code generation remains an open question. In this paper, we conduct the
+first systematic study to explore a decoding strategy specialized for code
+generation. With an analysis of loss distributions of code tokens, we find that
+code tokens can be divided into two categories: challenging tokens that are
+difficult to predict and confident tokens that can be easily inferred. Among
+them, the challenging tokens mainly appear at the beginning of a code block.
+Inspired by these findings, we propose a simple yet effective method: Adaptive
+Temperature (AdapT) sampling, which dynamically adjusts the temperature
+coefficient when decoding different tokens. We apply a larger temperature when
+sampling challenging tokens, allowing LLMs to explore diverse choices, and a
+smaller temperature for confident tokens to avoid the influence of tail
+randomness noise. We apply AdapT sampling to LLMs of different sizes and
+conduct evaluations on two popular datasets. Results show that AdapT sampling
+significantly outperforms state-of-the-art decoding strategies.
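+
+ A minimal sketch of the adaptive-temperature idea, assuming challenging
+positions have already been identified (for example, positions at the start of
+a code block); the criterion and the temperature values are illustrative:
+
+import torch
+
+def adapt_sample(logits, is_challenging, t_high=1.0, t_low=0.2):
+    # Larger temperature for challenging tokens (more exploration),
+    # smaller temperature for confident tokens (less tail randomness).
+    temperature = t_high if is_challenging else t_low
+    probs = torch.softmax(logits / temperature, dim=-1)
+    return torch.multinomial(probs, num_samples=1)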
+
+
+
+
+ + ☆ Rubric-Specific Approach to Automated Essay Scoring with Augmentation + Training + + +
+ Neural-based approaches to automatic evaluation of subjective responses have
+shown superior performance and efficiency compared to traditional rule-based
+and feature-engineering-oriented solutions. However, it remains unclear whether
+the suggested neural solutions are sufficient replacements for human raters, as
+we find that recent works do not properly account for rubric items that are
+essential for automated essay scoring during model training and validation. In
+this paper, we propose a series of data augmentation operations that train and
+test an automated scoring model to learn features and functions overlooked by
+previous works while still achieving state-of-the-art performance on the
+Automated Student Assessment Prize dataset.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ HC3 Plus: A Semantic-Invariant Human ChatGPT Comparison Corpus + + +
+ ChatGPT has gained significant interest due to its impressive performance,
+but people are increasingly concerned about its potential risks, particularly
+around the detection of AI-generated content (AIGC), which is often difficult
+for untrained humans to identify. Current datasets utilized for detecting
+ChatGPT-generated text primarily center around question-answering, yet they
+tend to disregard tasks that possess semantic-invariant properties, such as
+summarization, translation, and paraphrasing. Our primary studies demonstrate
+that detecting model-generated text on semantic-invariant tasks is more
+difficult. To fill this gap, we introduce a more extensive and comprehensive
+dataset that considers more types of tasks than previous work, including
+semantic-invariant tasks. In addition, models that have undergone extensive
+task instruction fine-tuning show strong performance. Building on this success,
+we further instruction-tune Tk-instruct and build a more powerful detection
+system. Experimental results show that our proposed detector outperforms the
+previous state-of-the-art RoBERTa-based detector.
+
+
+
+
+ + ☆ Large Language Models for Automated Open-domain Scientific Hypotheses + Discovery + + +
+ Hypothetical induction is recognized as the main reasoning type when
+scientists make observations about the world and try to propose hypotheses to
+explain those observations. Past research on hypothetical induction is under a
+limited setting in which (1) the observation annotations of the dataset are not
+raw web corpora but manually selected sentences (resulting in a closed-domain
+setting); and (2) the ground-truth hypotheses annotations are mostly
+commonsense knowledge, making the task less challenging. In this work, we
+propose the first NLP dataset for social science academic hypotheses discovery,
+consisting of 50 recent papers published in top social science journals. Raw
+web corpora that are necessary for developing hypotheses in the published
+papers are also collected in the dataset, with the final goal of creating a
+system that automatically generates valid, novel, and helpful (to human
+researchers) hypotheses, given only a pile of raw web corpora. The new dataset
+tackles the previous problems because it requires systems to (1) use raw web
+corpora as observations; and (2) propose hypotheses that may be new to
+humanity. A multi-module framework is developed for the task, as well as three
+different feedback mechanisms that empirically show performance gains over the
+base framework. Finally, our framework exhibits high performance in terms of
+both GPT-4-based evaluation and social science expert evaluation.
+
+
+
+
+ + ☆ Offensive Hebrew Corpus and Detection using BERT CCS + + +
+ Offensive language detection has been well studied in many languages, but it
+is lagging behind in low-resource languages, such as Hebrew. In this paper, we
+present a new offensive language corpus in Hebrew. A total of 15,881 tweets
+were retrieved from Twitter. Each was labeled with one or more of five classes
+(abusive, hate, violence, pornographic, or not offensive) by Arabic-Hebrew
+bilingual speakers. The annotation process was challenging as each annotator
+was expected to be familiar with Israeli culture, politics, and practices in
+order to understand the context of each tweet. We fine-tuned two Hebrew BERT
+models, HeBERT and AlephBERT, using our proposed dataset and another published
+dataset. We observed that our data boosts HeBERT performance by 2% when
+combined with D_OLaH. Fine-tuning AlephBERT on our data and testing on D_OLaH
+yields 69% accuracy, while fine-tuning on D_OLaH and testing on our data yields
+57% accuracy, which may be an indication of the generalizability our data
+offers. Our dataset and fine-tuned models are available on GitHub and
+Huggingface.
+
+ comment: 8 pages, 1 figure, The 20th ACS/IEEE International Conference on + Computer Systems and Applications (AICCSA) +
+
+
+
+
+ + ☆ HAE-RAE Bench: Evaluation of Korean Knowledge in Language Models + + +
+ Large Language Models (LLMs) pretrained on massive corpora exhibit remarkable
+capabilities across a wide range of tasks; however, the attention given to
+non-English languages has been limited in this field of research. To address
+this gap and assess the proficiency of language models in the Korean language
+and culture, we present HAE-RAE Bench, covering 6 tasks including vocabulary,
+history, and general knowledge. Our evaluation of language models on this
+benchmark highlights the potential advantages of employing Large
+Language-Specific Models (LLSMs) over a comprehensive, universal model like
+GPT-3.5. Remarkably, our study reveals that models approximately 13 times
+smaller than GPT-3.5 can exhibit similar performance levels in terms of
+language-specific knowledge retrieval. This observation underscores the
+importance of homogeneous corpora for training professional-level
+language-specific models. Conversely, we also observe a perplexing performance
+dip in these smaller LMs when they are tasked to generate structured answers.
+
+
+
+
+ + ☆ Certifying LLM Safety against Adversarial Prompting + + +
+ Large language models (LLMs) released for public use incorporate guardrails +to ensure their output is safe, often referred to as "model alignment." An +aligned language model should decline a user's request to produce harmful +content. However, such safety measures are vulnerable to adversarial prompts, +which contain maliciously designed token sequences to circumvent the model's +safety guards and cause it to produce harmful content. In this work, we +introduce erase-and-check, the first framework to defend against adversarial +prompts with verifiable safety guarantees. We erase tokens individually and +inspect the resulting subsequences using a safety filter. Our procedure labels +the input prompt as harmful if any subsequences or the input prompt are +detected as harmful by the filter. This guarantees that any adversarial +modification of a harmful prompt up to a certain size is also labeled harmful. +We defend against three attack modes: i) adversarial suffix, which appends an +adversarial sequence at the end of the prompt; ii) adversarial insertion, where +the adversarial sequence is inserted anywhere in the middle of the prompt; and +iii) adversarial infusion, where adversarial tokens are inserted at arbitrary +positions in the prompt, not necessarily as a contiguous block. Empirical +results demonstrate that our technique obtains strong certified safety +guarantees on harmful prompts while maintaining good performance on safe +prompts. For example, against adversarial suffixes of length 20, it certifiably +detects 93% of the harmful prompts and labels 94% of the safe prompts as safe +using the open source language model Llama 2 as the safety filter. + +
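+
+ A minimal sketch of the erase-and-check procedure for the adversarial-suffix
+mode, assuming a safety filter is_harmful that classifies a token sequence;
+the filter itself and the insertion/infusion modes are not shown:
+
+def erase_and_check(prompt_tokens, is_harmful, max_erase=20):
+    # The prompt is labeled harmful if the prompt itself, or any prefix
+    # obtained by erasing up to max_erase trailing tokens, is flagged.
+    if is_harmful(prompt_tokens):
+        return True
+    for i in range(1, min(max_erase, len(prompt_tokens) - 1) + 1):
+        if is_harmful(prompt_tokens[:-i]):
+            return True
+    return False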
+
+
+
+
+ + ☆ A Joint Study of Phrase Grounding and Task Performance in Vision and + Language Models + + +
+ Key to tasks that require reasoning about natural language in visual contexts
+is grounding words and phrases to image regions. However, observing this
+grounding in contemporary models is complex, even if it is generally expected
+to take place if the task is addressed in a way that is conducive to
+generalization. We propose a framework to jointly study task performance and
+phrase grounding, and propose three benchmarks to study the relation between
+the two. Our results show that contemporary models demonstrate inconsistency
+between their ability to ground phrases and solve tasks. We show how this can
+be addressed through brute-force training on phrase grounding annotations, and
+analyze the dynamics it creates. Code and data are available at
+https://github.com/lil-lab/phrase_grounding.
+
+
+
+
+ + ☆ Zero-Resource Hallucination Prevention for Large Language Models + + +
+ The prevalent use of large language models (LLMs) in various domains has
+drawn attention to the issue of "hallucination," which refers to instances
+where LLMs generate factually inaccurate or ungrounded information. Existing
+techniques for hallucination detection in language assistants rely on
+intricate, fuzzy, free-language-based chain-of-thought (CoT) techniques or on
+parameter-based methods that suffer from interpretability issues. Additionally,
+methods that identify hallucinations post-generation cannot prevent their
+occurrence and suffer from inconsistent performance due to the influence of the
+instruction format and model style. In this paper, we introduce a novel
+pre-detection self-evaluation technique, referred to as {\method}, which
+focuses on evaluating the model's familiarity with the concepts present in the
+input instruction and withholding the generation of a response in the case of
+unfamiliar concepts. This approach emulates the human ability to refrain from
+responding to unfamiliar topics, thus reducing hallucinations. We validate
+{\method} across four different large language models, demonstrating
+consistently superior performance compared to existing techniques. Our findings
+suggest a significant shift towards preemptive strategies for hallucination
+mitigation in LLM assistants, promising improvements in reliability,
+applicability, and interpretability.
+
+
+
+
+ + ☆ Epi-Curriculum: Episodic Curriculum Learning for Low-Resource Domain + Adaptation in Neural Machine Translation + + +
+ Neural Machine Translation (NMT) models have become successful, but their
+performance remains poor when translating in new domains with a limited amount
+of data. In this paper, we present Epi-Curriculum, a novel approach to address
+low-resource domain adaptation (DA), which contains a new episodic training
+framework along with denoised curriculum learning. Our episodic training
+framework enhances the model's robustness to domain shift by episodically
+exposing the encoder/decoder to an inexperienced decoder/encoder. The denoised
+curriculum learning filters out noisy data and further improves the model's
+adaptability by gradually guiding the learning process from easy to more
+difficult tasks. Experiments on English-German and English-Romanian translation
+show that: (i) Epi-Curriculum improves the model's robustness and adaptability
+in both seen and unseen domains; (ii) our episodic training framework enhances
+the encoder's and decoder's robustness to domain shift.
+
+
+
+
+ + ☆ RoDia: A New Dataset for Romanian Dialect Identification from Speech + + +
+ Dialect identification is a critical task in speech processing and language +technology, enhancing various applications such as speech recognition, speaker +verification, and many others. While most research studies have been dedicated +to dialect identification in widely spoken languages, limited attention has +been given to dialect identification in low-resource languages, such as +Romanian. To address this research gap, we introduce RoDia, the first dataset +for Romanian dialect identification from speech. The RoDia dataset includes a +varied compilation of speech samples from five distinct regions of Romania, +covering both urban and rural environments, totaling 2 hours of manually +annotated speech data. Along with our dataset, we introduce a set of +competitive models to be used as baselines for future research. The top scoring +model achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%, +indicating that the task is challenging. We thus believe that RoDia is a +valuable resource that will stimulate research aiming to address the challenges +of Romanian dialect identification. We publicly release our dataset and code at +https://github.com/codrut2/RoDia. + +
+
+
+
+
+ + ☆ Parameter Efficient Audio Captioning With Faithful Guidance Using + Audio-text Shared Latent Representation + + +
+ There has been significant research on developing pretrained transformer
+architectures for multimodal-to-text generation tasks. Despite performance
+improvements, such models are frequently overparameterized and hence suffer
+from hallucination and a large memory footprint, making them challenging to
+deploy on edge devices. In this paper, we address both of these issues for the
+application of automated audio captioning. First, we propose a data
+augmentation technique for generating hallucinated audio captions and show that
+similarity based on an audio-text shared latent space is suitable for detecting
+hallucination. Then, we propose a parameter-efficient, inference-time faithful
+decoding algorithm that enables smaller audio captioning models to match the
+performance of larger models trained with more data. During the beam decoding
+step, the smaller model utilizes an audio-text shared latent representation to
+semantically align the generated text with the corresponding input audio.
+Faithful guidance is introduced into the beam probability by incorporating the
+cosine similarity between the latent representation projections of greedily
+rolled-out intermediate beams and the audio clip. We show the efficacy of our
+algorithm on benchmark datasets and evaluate the proposed scheme against
+baselines using conventional audio captioning and semantic similarity metrics
+while illustrating trade-offs between performance and complexity.
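+
+ A minimal sketch of the guidance term, assuming each intermediate beam has
+already been greedily rolled out and projected into the shared audio-text
+latent space; the projection models and the mixing weight are illustrative:
+
+import torch.nn.functional as F
+
+def guided_beam_scores(beam_log_probs, beam_latents, audio_latent, alpha=0.5):
+    # Mix the model's beam log-probabilities with the cosine similarity
+    # between each beam's latent projection and the audio clip's projection.
+    sims = F.cosine_similarity(beam_latents, audio_latent.unsqueeze(0), dim=-1)
+    return beam_log_probs + alpha * sims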
+
+ comment: 5 pages, 5 tables, 1 figure +
+
+
+
+
+ + ☆ GPT Can Solve Mathematical Problems Without a Calculator + + +
+ Previous studies have typically assumed that large language models are unable
+to accurately perform arithmetic operations, particularly multiplication of
+numbers with more than 8 digits, and operations involving decimals and
+fractions, without the use of calculator tools. This paper aims to challenge
+this misconception. With sufficient training data, a 2 billion-parameter
+language model can perform multi-digit arithmetic operations with almost 100%
+accuracy without data leakage, significantly surpassing GPT-4 (whose
+multi-digit multiplication accuracy is only 4.3%). We also demonstrate that our
+MathGLM, fine-tuned from GLM-10B on a dataset with additional multi-step
+arithmetic operations and math problems described in text, achieves performance
+similar to GPT-4 on a 5,000-sample Chinese math problem test set.
+
+ comment: 26 pages, 14 figures
+
+
+
+
+ + ☆ Implicit Design Choices and Their Impact on Emotion Recognition Model + Development and Evaluation + + +
+ Emotion recognition is a complex task due to the inherent subjectivity in +both the perception and production of emotions. The subjectivity of emotions +poses significant challenges in developing accurate and robust computational +models. This thesis examines critical facets of emotion recognition, beginning +with the collection of diverse datasets that account for psychological factors +in emotion production. + To handle the challenge of non-representative training data, this work +collects the Multimodal Stressed Emotion dataset, which introduces controlled +stressors during data collection to better represent real-world influences on +emotion production. To address issues with label subjectivity, this research +comprehensively analyzes how data augmentation techniques and annotation +schemes impact emotion perception and annotator labels. It further handles +natural confounding variables and variations by employing adversarial networks +to isolate key factors like stress from learned emotion representations during +model training. For tackling concerns about leakage of sensitive demographic +variables, this work leverages adversarial learning to strip sensitive +demographic information from multimodal encodings. Additionally, it proposes +optimized sociological evaluation metrics aligned with cost-effective, +real-world needs for model testing. + This research advances robust, practical emotion recognition through +multifaceted studies of challenges in datasets, labels, modeling, demographic +and membership variable encoding in representations, and evaluation. The +groundwork has been laid for cost-effective, generalizable emotion recognition +models that are less likely to encode sensitive demographic information. + +
+
+ comment: PhD Thesis +
+
+
+
+
+ + ♻ ☆ CodeApex: A Bilingual Programming Evaluation Benchmark for Large + Language Models + + +
+ With the emergence of Large Language Models (LLMs), there has been a +significant improvement in the programming capabilities of models, attracting +growing attention from researchers. We propose CodeApex, a bilingual benchmark +dataset focusing on the programming comprehension and code generation abilities +of LLMs. CodeApex comprises three types of multiple-choice questions: +conceptual understanding, commonsense reasoning, and multi-hop reasoning, +designed to evaluate LLMs on programming comprehension tasks. Additionally, +CodeApex utilizes algorithmic questions and corresponding test cases to assess +the code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs, +including both general-purpose and specialized models. GPT exhibits the best +programming capabilities, achieving approximate accuracies of 50% and 56% on +the two tasks, respectively. There is still significant room for improvement in +programming tasks. We hope that CodeApex can serve as a reference for +evaluating the coding capabilities of LLMs, further promoting their development +and growth. Datasets are released at https://github.com/APEXLAB/CodeApex.git. +CodeApex submission website is https://apex.sjtu.edu.cn/codeapex/. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Single-Sentence Reader: A Novel Approach for Addressing Answer Position + Bias + + +
+ Machine Reading Comprehension (MRC) models tend to take advantage of spurious
+correlations (also known as dataset bias or annotation artifacts in the
+research community). Consequently, these models may perform the MRC task
+without fully comprehending the given context and question, which is
+undesirable since it may result in low robustness against distribution shift.
+The main focus of this paper is answer-position bias, where a significant
+percentage of training questions have answers located solely in the first
+sentence of the context. We propose the Single-Sentence Reader as a new
+approach for addressing answer-position bias in MRC. Remarkably, in our
+experiments with six different models, our proposed Single-Sentence Readers
+trained on the biased dataset achieve results that nearly match those of models
+trained on the normal dataset, proving their effectiveness in addressing answer
+position bias. Our study also discusses several challenges our Single-Sentence
+Readers encounter and proposes a potential solution.
+
+ comment: 10 pages, 5 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ Learning Speech Representation From Contrastive Token-Acoustic + Pretraining + + +
+ For fine-grained generation and recognition tasks such as +minimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic +speech recognition (ASR), the intermediate representations extracted from +speech should serve as a "bridge" between text and acoustic information, +containing information from both modalities. The semantic content is +emphasized, while the paralinguistic information such as speaker identity and +acoustic details should be de-emphasized. However, existing methods for +extracting fine-grained intermediate representations from speech suffer from +issues of excessive redundancy and dimension explosion. Contrastive learning is +a good method for modeling intermediate representations from two modalities. +However, existing contrastive learning methods in the audio field focus on +extracting global descriptive information for downstream audio classification +tasks, making them unsuitable for TTS, VC, and ASR tasks. To address these +issues, we propose a method named "Contrastive Token-Acoustic Pretraining +(CTAP)", which uses two encoders to bring phoneme and speech into a joint +multimodal space, learning how to connect phoneme and speech at the frame +level. The CTAP model is trained on 210k speech and phoneme text pairs, +achieving minimally-supervised TTS, VC, and ASR. The proposed CTAP method +offers a promising solution for fine-grained generation and recognition +downstream tasks in speech processing. + +
+
+
+
+
+ + ♻ ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for the addition of multi-modal +information with an existing pipeline for continuous sign language recognition +and translation. In our procedure, we have incorporated optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we have used is very lightweight and doesn't +need to include a separate feature extractor for the new modality in an +end-to-end manner. We have applied the changes in both sign language +recognition and translation, improving the result in each case. We have +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. + +
+
+ comment: This version has some errors. Our schedule is packed, so we don't + have enough time to correct it. We will share another work when we have time + to fix this +
+
+
+
+
+ + ♻ ☆ An Empirical Analysis for Zero-Shot Multi-Label Classification on + COVID-19 CT Scans and Uncurated Reports ICCV + + +
+ The pandemic resulted in vast repositories of unstructured data, including +radiology reports, due to increased medical examinations. Previous research on +automated diagnosis of COVID-19 primarily focuses on X-ray images, despite +their lower precision compared to computed tomography (CT) scans. In this work, +we leverage unstructured data from a hospital and harness the fine-grained +details offered by CT scans to perform zero-shot multi-label classification +based on contrastive visual language learning. In collaboration with human +experts, we investigate the effectiveness of multiple zero-shot models that aid +radiologists in detecting pulmonary embolisms and identifying intricate lung +details like ground glass opacities and consolidations. Our empirical analysis +provides an overview of the possible solutions to target such fine-grained +tasks, so far overlooked in the medical multimodal pretraining literature. Our +investigation promises future advancements in the medical image analysis +community by addressing some challenges associated with unstructured data and +fine-grained multi-label classification. + +
+
+ comment: Proceedings of the IEEE/CVF International Conference on Computer + Vision (ICCV) Workshops 2023 +
+
+
+
+
+ + ♻ ☆ NNKGC: Improving Knowledge Graph Completion with Node Neighborhoods ISWC 2023 + + +
+ Knowledge graph completion (KGC) aims to discover missing relations of query
+entities. Current text-based models utilize the entity name and description to
+infer the tail entity given the head entity and a certain relation. Existing
+approaches also consider the neighborhood of the head entity. However, these
+methods tend to model the neighborhood using a flat structure and are
+restricted to 1-hop neighbors. In this work, we propose a node
+neighborhood-enhanced framework for knowledge graph completion. It models the
+head entity neighborhood from multiple hops using graph neural networks to
+enrich the head node information. Moreover, we introduce an additional edge
+link prediction task to improve KGC. Evaluation on two public datasets shows
+that this framework is simple yet effective. The case study also shows that the
+model is able to produce explainable predictions.
+
+ comment: DL4KG Workshop, ISWC 2023 +
+
+
+
+
+ + ♻ ☆ A Survey on Measuring and Mitigating Reasoning Shortcuts in Machine + Reading Comprehension + + +
+ The issue of shortcut learning is widely known in NLP and has been an +important research focus in recent years. Unintended correlations in the data +enable models to easily solve tasks that were meant to exhibit advanced +language understanding and reasoning capabilities. In this survey paper, we +focus on the field of machine reading comprehension (MRC), an important task +for showcasing high-level language understanding that also suffers from a range +of shortcuts. We summarize the available techniques for measuring and +mitigating shortcuts and conclude with suggestions for further progress in +shortcut research. Importantly, we highlight two concerns for shortcut +mitigation in MRC: (1) the lack of public challenge sets, a necessary component +for effective and reusable evaluation, and (2) the lack of certain mitigation +techniques that are prominent in other areas. + +
+
+ comment: 18 pages, 2 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Substitution-based Semantic Change Detection using Contextual Embeddings + + +
+ Measuring semantic change has thus far remained a task where methods using +contextual embeddings have struggled to improve upon simpler techniques relying +only on static word vectors. Moreover, many of the previously proposed +approaches suffer from downsides related to scalability and ease of +interpretation. We present a simplified approach to measuring semantic change +using contextual embeddings, relying only on the most probable substitutes for +masked terms. Not only is this approach directly interpretable, it is also far +more efficient in terms of storage, achieves superior average performance +across the most frequently cited datasets for this task, and allows for more +nuanced investigation of change than is possible with static word vectors. + +
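+
+ A minimal sketch of the substitution idea, assuming a masked language model
+from the transformers library is used to collect the top-k substitutes for a
+target word in each corpus; comparing the resulting substitute distributions
+(for example, between two time periods) then gives a change score. The model
+choice and helper names are illustrative:
+
+from collections import Counter
+import torch
+from transformers import AutoModelForMaskedLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()
+
+def substitute_counts(sentences, target, k=10):
+    # Mask the target word in each sentence and count the MLM's top-k substitutes.
+    counts = Counter()
+    for sent in sentences:
+        masked = sent.replace(target, tok.mask_token, 1)
+        enc = tok(masked, return_tensors="pt", truncation=True)
+        pos = (enc.input_ids[0] == tok.mask_token_id).nonzero()[0, 0].item()
+        with torch.no_grad():
+            logits = mlm(**enc).logits[0, pos]
+        counts.update(tok.convert_ids_to_tokens(logits.topk(k).indices.tolist()))
+    return counts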
+
+
+
+
+ + ♻ ☆ Layout and Task Aware Instruction Prompt for Zero-shot Document Image + Question Answering + + +
+ The pre-training-fine-tuning paradigm based on layout-aware multimodal
+pre-trained models has achieved significant progress on document image question
+answering. However, domain pre-training and task fine-tuning for additional
+visual, layout, and task modules prevent them from directly utilizing
+off-the-shelf instruction-tuning language foundation models, which have
+recently shown promising potential in zero-shot learning. Contrary to aligning
+language models to the domain of document image question answering, we align
+document image question answering to off-the-shelf instruction-tuning language
+foundation models to utilize their zero-shot capability. Specifically, we
+propose a layout- and task-aware instruction prompt called LATIN-Prompt, which
+consists of layout-aware document content and task-aware descriptions. The
+former recovers the layout information among text segments from OCR tools by
+inserting appropriate spaces and line breaks. The latter ensures that the model
+generates answers that meet the requirements, especially format requirements,
+through a detailed description of the task. Experimental results on three
+benchmarks show that LATIN-Prompt can improve the zero-shot performance of
+instruction-tuning language foundation models on document image question
+answering and help them achieve comparable levels to SOTAs based on the
+pre-training-fine-tuning paradigm. Quantitative and qualitative analyses
+demonstrate the effectiveness of LATIN-Prompt. We provide the code in the
+supplementary material and will release it to facilitate future research.
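+
+ A minimal sketch of the layout-aware half of the prompt, assuming OCR output
+as (text, x, y) word boxes; the row tolerance and the character width used to
+convert coordinates into spaces are illustrative:
+
+def layout_text(ocr_words, line_tol=10, char_w=10):
+    # Group words into rows by their y coordinate, then rebuild each row with
+    # spaces proportional to the horizontal gaps, approximating the page layout.
+    rows = {}
+    for text, x, y in ocr_words:
+        rows.setdefault(round(y / line_tol), []).append((x, text))
+    lines = []
+    for key in sorted(rows):
+        line = ""
+        for x, text in sorted(rows[key]):
+            pad = max(1, int(x / char_w) - len(line))
+            line += " " * pad + text
+        lines.append(line.strip())
+    return "\n".join(lines)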
+
+ comment: Add the LATIN-Tuning for Alpaca. Code is available at
+ https://github.com/WenjinW/LATIN-Prompt
+
+
+
+
+ + ♻ ☆ Large Language Models on Wikipedia-Style Survey Generation: an + Evaluation in NLP Concepts + + +
+ Large Language Models (LLMs) have achieved significant success across various
+natural language processing (NLP) tasks, encompassing question-answering,
+summarization, and machine translation, among others. While LLMs excel in
+general tasks, their efficacy in domain-specific applications remains
+underexplored. Additionally, LLM-generated text sometimes exhibits issues like
+hallucination and disinformation. In this study, we assess LLMs' capability of
+producing concise survey articles within the computer science-NLP domain,
+focusing on 20 chosen topics. Automated evaluations indicate that GPT-4
+outperforms GPT-3.5 when benchmarked against the ground truth. Furthermore,
+four human evaluators provide insights from six perspectives across four model
+configurations. Through case studies, we demonstrate that while GPT often
+yields commendable results, there are instances of shortcomings, such as
+incomplete information and lapses in factual accuracy.
+
+
+
+
+ + ♻ ☆ Wordle: A Microcosm of Life. Luck, Skill, Cheating, Loyalty, and + Influence! + + +
+ Wordle is a popular, online word game offered by the New York Times
+(nytimes.com). Currently there are some 2 million players of the English
+version worldwide. Players have 6 attempts to guess the daily word (target
+word) and after each attempt, the player receives color-coded information about
+the correctness and position of each letter in the guess. After either a
+successful completion of the puzzle or the final unsuccessful attempt, software
+can assess the player's luck and skill using Information Theory and can display
+data for the first, second, ..., sixth guesses of a random sample of all
+players. Recently, I discovered that the latter data is presented in a format
+that can easily be copied and pasted into a spreadsheet. I compiled data on
+Wordle players' first guesses from May 2023 - August 2023 and inferred some
+interesting information about Wordle players. A) Every day, about 0.2-0.5% of
+players solve the puzzle in one attempt. Because the odds of guessing one of
+the 2,315 possible target words at random are 0.043%, this implies that 4,000 -
+10,000 players cheat by obtaining the target word outside of playing the game!
+B) At least 1/3 of the players have a favorite starting word, or cycle through
+several. And even though players should be aware that target words are never
+repeated, most players appear to remain loyal to their starting word even after
+its appearance as a target word. C) On August 15, 2023, about 30,000 players
+abruptly changed their starting word, presumably based on a crossword puzzle
+clue! Wordle players can be influenced! This study goes beyond social media
+postings, surveys, and Google Trends to provide solid, quantitative evidence
+about cheating in Wordle.
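+
+ A back-of-the-envelope check of the figures quoted above (the player count is
+the approximate number given in the abstract):
+
+players = 2_000_000              # approximate number of daily players
+p_random = 1 / 2315              # about 0.00043, i.e. 0.043% per random guess
+share_one_guess = (0.002, 0.005) # 0.2%-0.5% of players solve in one attempt
+suspects = [int(players * s) for s in share_one_guess]   # [4000, 10000]
+# The observed one-guess rate is roughly 5-12 times the rate expected from
+# random guessing, which is why most of these players are suspected of cheating.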
+
+
+
+
+ + ♻ ☆ Continual Pre-Training of Large Language Models: How to (re)warm your + model? + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens,
+only to restart the process over again once new data becomes available. A much
+cheaper and more efficient solution would be to enable the continual
+pre-training of these models, i.e. updating pre-trained models with new data
+instead of re-training them from scratch. However, the distribution shift
+induced by novel data typically results in degraded performance on past data.
+Taking a step towards efficient continual pre-training, in this work, we
+examine the effect of different warm-up strategies. Our hypothesis is that the
+learning rate must be re-increased to improve compute efficiency when training
+on a new dataset. We study the warmup phase of models pre-trained on the Pile
+(upstream data, 300B tokens) as we continue to pre-train on SlimPajama
+(downstream data, 297B tokens), following a linear warmup and cosine decay
+schedule. We conduct all experiments on the Pythia 410M language model
+architecture and evaluate performance through validation perplexity. We
+experiment with different pre-training checkpoints, various maximum learning
+rates, and various warmup lengths. Our results show that while rewarming models
+first increases the loss on upstream and downstream data, in the longer run it
+improves the downstream performance, outperforming models trained from
+scratch, even for a large downstream dataset.
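+
+ A minimal sketch of the linear-warmup plus cosine-decay schedule that is
+re-applied when continuing pre-training on the new dataset (generic form; the
+study varies the maximum learning rate and warmup length as described above):
+
+import math
+
+def lr_at(step, total_steps, warmup_steps, max_lr, min_lr=0.0):
+    # Linear warmup from 0 to max_lr, then cosine decay down to min_lr.
+    if step < warmup_steps:
+        return max_lr * step / max(1, warmup_steps)
+    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
+    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))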
+
+
+
+
+ + ♻ ☆ Kernelized Concept Erasure EMNLP22 + + +
+ The representation space of neural models for textual data emerges in an +unsupervised manner during training. Understanding how those representations +encode human-interpretable concepts is a fundamental problem. One prominent +approach for the identification of concepts in neural representations is +searching for a linear subspace whose erasure prevents the prediction of the +concept from the representations. However, while many linear erasure algorithms +are tractable and interpretable, neural networks do not necessarily represent +concepts in a linear manner. To identify non-linearly encoded concepts, we +propose a kernelization of a linear minimax game for concept erasure. We +demonstrate that it is possible to prevent specific non-linear adversaries from +predicting the concept. However, the protection does not transfer to different +nonlinear adversaries. Therefore, exhaustively erasing a non-linearly encoded +concept remains an open problem. + +
+
+ comment: Accepted as a long paper in EMNLP22 +
+
+
+
+
+ + ♻ ☆ Publicly Shareable Clinical Large Language Model Built on Synthetic + Clinical Notes + + +
+ The development of large language models tailored for handling patients' +clinical notes is often hindered by the limited accessibility and usability of +these notes due to strict privacy regulations. To address these challenges, we +first create synthetic large-scale clinical notes using publicly available case +reports extracted from biomedical literature. We then use these synthetic notes +to train our specialized clinical large language model, Asclepius. While +Asclepius is trained on synthetic data, we assess its potential performance in +real-world applications by evaluating it using real clinical notes. We +benchmark Asclepius against several other large language models, including +GPT-3.5-turbo and other open-source alternatives. To further validate our +approach using synthetic notes, we also compare Asclepius with its variants +trained on real clinical notes. Our findings convincingly demonstrate that +synthetic clinical notes can serve as viable substitutes for real ones when +constructing high-performing clinical language models. This conclusion is +supported by detailed evaluations conducted by both GPT-4 and medical +professionals. All resources including weights, codes, and data used in the +development of Asclepius are made publicly accessible for future research. + +
+
+ comment: https://github.com/starmpcc/Asclepius +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ My Art My Choice: Adversarial Protection Against Unruly AI + + +
+ Generative AI is on the rise, enabling everyone to produce realistic content
+via publicly available interfaces. Especially for guided image generation,
+diffusion models are changing the creator economy by producing high-quality,
+low-cost content. In parallel, artists are rising against unruly AI, since
+their artworks are leveraged, distributed, and dissimulated by large generative
+models. Our approach, My Art My Choice (MAMC), aims to empower content owners
+by protecting their copyrighted materials from being utilized by diffusion
+models in an adversarial fashion. MAMC learns to generate adversarially
+perturbed "protected" versions of images which can in turn "break" diffusion
+models. The perturbation amount is decided by the artist to balance distortion
+vs. protection of the content. MAMC is designed with a simple UNet-based
+generator, attacking black-box diffusion models, combining several losses to
+create adversarial twins of the original artwork. We experiment on three
+datasets for various image-to-image tasks, with different user control values.
+Both protected image and diffusion output results are evaluated in visual,
+noise, structure, pixel, and generative spaces to validate our claims. We
+believe that MAMC is a crucial step for preserving ownership information for AI
+generated content in a flawless, need-based, and human-centric way.
+
+
+
+
+ + ☆ Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRFs) have shown promise in applications like view +synthesis and depth estimation, but learning from multiview images faces +inherent uncertainties. Current methods to quantify them are either heuristic +or computationally demanding. We introduce BayesRays, a post-hoc framework to +evaluate uncertainty in any pre-trained NeRF without modifying the training +process. Our method establishes a volumetric uncertainty field using spatial +perturbations and a Bayesian Laplace approximation. We derive our algorithm +statistically and show its superior performance in key metrics and +applications. Additional results available at: https://bayesrays.github.io. + +
+
+
+
+
+ + ☆ 3D Transformer based on deformable patch location for differential + diagnosis between Alzheimer's disease and Frontotemporal dementia + + +
+ Alzheimer's disease and Frontotemporal dementia are common types of
+neurodegenerative disorders that present overlapping clinical symptoms, making
+their differential diagnosis very challenging. Numerous efforts have been made
+towards the diagnosis of each disease, but the problem of multi-class
+differential diagnosis has not been actively explored. In recent years,
+transformer-based models have demonstrated remarkable success in various
+computer vision tasks. However, their use in disease diagnosis is uncommon due
+to the limited amount of 3D medical data given the large size of such models.
+In this paper, we present a novel 3D transformer-based architecture using a
+deformable patch location module to improve the differential diagnosis of
+Alzheimer's disease and Frontotemporal dementia. Moreover, to overcome the
+problem of data scarcity, we propose an efficient combination of various data
+augmentation techniques, adapted for training transformer-based models on 3D
+structural magnetic resonance imaging data. Finally, we propose to combine our
+transformer-based model with a traditional machine learning model using brain
+structure volumes to better exploit the available data. Our experiments
+demonstrate the effectiveness of the proposed approach, showing competitive
+results compared to state-of-the-art methods. Moreover, the deformable patch
+locations can be visualized, revealing the most relevant brain regions used to
+establish the diagnosis of each disease.
+
+
+
+
+ + ☆ SLiMe: Segment Like Me + + +
+ Significant strides have been made using large vision-language models, like
+Stable Diffusion (SD), for a variety of downstream tasks, including image
+editing, image correspondence, and 3D shape generation. Inspired by these
+advancements, we explore leveraging these extensive vision-language models for
+segmenting images at any desired granularity using as few as one annotated
+sample, by proposing SLiMe. SLiMe frames this problem as an optimization task.
+Specifically, given a single training image and its segmentation mask, we first
+extract attention maps, including our novel "weighted accumulated
+self-attention map", from the SD prior. Then, using the extracted attention
+maps, the text embeddings of Stable Diffusion are optimized such that each of
+them learns about a single segmented region from the training image. These
+learned embeddings then highlight the segmented region in the attention maps,
+which can in turn be used to derive the segmentation map. This enables SLiMe to
+segment any real-world image during inference with the granularity of the
+segmented region in the training image, using just one example. Moreover,
+leveraging additional training data when available, i.e. few-shot, improves the
+performance of SLiMe. We carried out an extensive set of experiments examining
+various design factors and showed that SLiMe outperforms other existing
+one-shot and few-shot segmentation methods.
+
+
+
+
+ + ☆ 3D Object Positioning Using Differentiable Multimodal Learning + + +
+ This article describes a multi-modal method that uses simulated Lidar data
+via ray tracing and image pixel loss with differentiable rendering to optimize
+an object's position with respect to an observer or some referential objects in
+a computer graphics scene. Object position optimization is completed using
+gradient descent, with the loss function being influenced by both modalities.
+Typical object placement optimization is done using image pixel loss with
+differentiable rendering only; this work shows that the use of a second
+modality (Lidar) leads to faster convergence. This method of fusing sensor
+input presents a potential usefulness for autonomous vehicles, as these methods
+can be used to establish the locations of multiple actors in a scene. This
+article also presents a method for the simulation of multiple types of data to
+be used in the training of autonomous vehicles.
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ☆ PDiscoNet: Semantically consistent part discovery for fine-grained + recognition ICCV + + +
+ Fine-grained classification often requires recognizing specific object parts, +such as beak shape and wing patterns for birds. Encouraging a fine-grained +classification model to first detect such parts and then using them to infer +the class could help us gauge whether the model is indeed looking at the right +details better than with interpretability methods that provide a single +attribution map. We propose PDiscoNet to discover object parts by using only +image-level class labels along with priors encouraging the parts to be: +discriminative, compact, distinct from each other, equivariant to rigid +transforms, and active in at least some of the images. In addition to using the +appropriate losses to encode these priors, we propose to use part-dropout, +where full part feature vectors are dropped at once to prevent a single part +from dominating in the classification, and part feature vector modulation, +which makes the information coming from each part distinct from the perspective +of the classifier. Our results on CUB, CelebA, and PartImageNet show that the +proposed method provides substantially better part discovery performance than +previous methods while not requiring any additional hyper-parameter tuning and +without penalizing the classification performance. The code is available at +https://github.com/robertdvdk/part_detection. + +
+
+ comment: 9 pages, 8 figures, ICCV +
+
+
+
+
+ + ☆ ResFields: Residual Neural Fields for Spatiotemporal Signals + + +
+ Neural fields, a category of neural networks trained to represent +high-frequency signals, have gained significant attention in recent years due +to their impressive performance in modeling complex 3D data, especially large +neural signed distance (SDFs) or radiance fields (NeRFs) via a single +multi-layer perceptron (MLP). However, despite the power and simplicity of +representing signals with an MLP, these methods still face challenges when +modeling large and complex temporal signals due to the limited capacity of +MLPs. In this paper, we propose an effective approach to address this +limitation by incorporating temporal residual layers into neural fields, dubbed +ResFields, a novel class of networks specifically designed to effectively +represent complex temporal signals. We conduct a comprehensive analysis of the +properties of ResFields and propose a matrix factorization technique to reduce +the number of trainable parameters and enhance generalization capabilities. +Importantly, our formulation seamlessly integrates with existing techniques and +consistently improves results across various challenging tasks: 2D video +approximation, dynamic shape modeling via temporal SDFs, and dynamic NeRF +reconstruction. Lastly, we demonstrate the practical utility of ResFields by +showcasing its effectiveness in capturing dynamic 3D scenes from sparse sensory +inputs of a lightweight capture system. + +
+
+ comment: Project page and code at https://markomih.github.io/ResFields/ +
+
+
+
+
+ + ☆ Detecting Manufacturing Defects in PCBs via Data-Centric Machine + Learning on Solder Paste Inspection Features + + +
+ Automated detection of defects in Printed Circuit Board (PCB) manufacturing +using Solder Paste Inspection (SPI) and Automated Optical Inspection (AOI) +machines can help improve operational efficiency and significantly reduce the +need for manual intervention. In this paper, using SPI-extracted features of 6 +million pins, we demonstrate a data-centric approach to train Machine Learning +(ML) models to detect PCB defects at three stages of PCB manufacturing. The 6 +million PCB pins correspond to 2 million components that belong to 15,387 PCBs. +Using a base extreme gradient boosting (XGBoost) ML model, we iterate on the +data pre-processing step to improve detection performance. Combining pin-level +SPI features using component and PCB IDs, we developed training instances also +at the component and PCB level. This allows the ML model to capture any +inter-pin, inter-component, or spatial effects that may not be apparent at the +pin level. Models are trained at the pin, component, and PCB levels, and the +detection results from the different models are combined to identify defective +components. + +
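+
+ A minimal sketch of the component-level aggregation step, assuming a pandas
+DataFrame with one row per pin and hypothetical column names; the actual SPI
+feature set and model settings are not public:
+
+import pandas as pd
+import xgboost as xgb
+
+def train_component_model(pin_df, feature_cols):
+    # Aggregate pin-level SPI features to the component level so the model can
+    # capture inter-pin effects, then train an XGBoost classifier on them.
+    comp_feats = (pin_df.groupby("component_id")[feature_cols]
+                  .agg(["mean", "min", "max", "std"]))
+    comp_feats.columns = ["_".join(col) for col in comp_feats.columns]
+    comp_labels = pin_df.groupby("component_id")["defect_label"].max()
+    model = xgb.XGBClassifier(n_estimators=300, max_depth=6)
+    model.fit(comp_feats, comp_labels)
+    return model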
+
+
+
+
+ + ☆ Do We Still Need Non-Maximum Suppression? Accurate Confidence Estimates + and Implicit Duplication Modeling with IoU-Aware Calibration + + +
+ Object detectors are at the heart of many semi- and fully autonomous decision
+systems and are poised to become even more indispensable. They are, however,
+still lacking in accessibility and can sometimes produce unreliable
+predictions. Especially concerning in this regard are the -- essentially
+hand-crafted -- non-maximum suppression algorithms that lead to an obfuscated
+prediction process and biased confidence estimates. We show that we can
+eliminate classic NMS-style post-processing by using IoU-aware calibration.
+IoU-aware calibration is a conditional Beta calibration; this makes it
+parallelizable with no hyper-parameters. Instead of arbitrary cutoffs or
+discounts, it implicitly accounts for the likelihood of each detection being a
+duplicate and adjusts the confidence score accordingly, resulting in
+empirically based precision estimates for each detection. Our extensive
+experiments on diverse detection architectures show that the proposed IoU-aware
+calibration can successfully model duplicate detections and improve
+calibration. Compared to the standard sequential NMS and calibration approach,
+our joint modeling can deliver performance gains over the best NMS-based
+alternative while producing consistently better-calibrated confidence
+predictions with less complexity. The code for all our experiments is publicly
+available at https://github.com/Blueblue4/IoU-AwareCalibration.
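+
+ A minimal sketch of Beta calibration with an IoU-derived conditioning feature,
+expressed as a logistic regression on log-transformed scores; this is a rough
+illustration of the idea, and the paper's exact conditioning may differ:
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def fit_iou_aware_calibrator(scores, max_iou_to_other_boxes, is_true_positive):
+    # Beta calibration uses ln(s) and -ln(1-s) as features; adding an
+    # IoU-based feature lets the calibrated confidence account for the
+    # likelihood that a detection duplicates a higher-scoring box.
+    s = np.clip(scores, 1e-6, 1 - 1e-6)
+    X = np.column_stack([np.log(s), -np.log(1 - s), max_iou_to_other_boxes])
+    return LogisticRegression().fit(X, is_true_positive)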
+
+
+
+
+ + ☆ FArMARe: a Furniture-Aware Multi-task methodology for Recommending + Apartments based on the user interests ICCV2023 + + +
+ Nowadays, many people frequently have to search for new accommodation +options. Searching for a suitable apartment is a time-consuming process, +especially because visiting them is often mandatory to assess the truthfulness +of the advertisements found on the Web. While this process could be alleviated +by visiting the apartments in the metaverse, the Web-based recommendation +platforms are not suitable for the task. To address this shortcoming, in this +paper, we define a new problem called text-to-apartment recommendation, which +requires ranking the apartments based on their relevance to a textual query +expressing the user's interests. To tackle this problem, we introduce FArMARe, +a multi-task approach that supports cross-modal contrastive training with a +furniture-aware objective. Since public datasets related to indoor scenes do +not contain detailed descriptions of the furniture, we collect and annotate a +dataset comprising more than 6000 apartments. A thorough experimentation with +three different methods and two raw feature extraction procedures reveals the +effectiveness of FArMARe in dealing with the problem at hand. + +
+
+ comment: accepted for presentation at the ICCV2023 CV4Metaverse workshop +
+
+
+
+
+ + ☆ Character Queries: A Transformer-based Approach to On-Line Handwritten + Character Segmentation ICDAR 2023 + + +
+ On-line handwritten character segmentation is often associated with +handwriting recognition and even though recognition models include mechanisms +to locate relevant positions during the recognition process, it is typically +insufficient to produce a precise segmentation. Decoupling the segmentation +from the recognition unlocks the potential to further utilize the result of the +recognition. We specifically focus on the scenario where the transcription is +known beforehand, in which case the character segmentation becomes an +assignment problem between sampling points of the stylus trajectory and +characters in the text. Inspired by the $k$-means clustering algorithm, we view +it from the perspective of cluster assignment and present a Transformer-based +architecture where each cluster is formed based on a learned character query in +the Transformer decoder block. In order to assess the quality of our approach, +we create character segmentation ground truths for two popular on-line +handwriting datasets, IAM-OnDB and HANDS-VNOnDB, and evaluate multiple methods +on them, demonstrating that our approach achieves the overall best results. + +
+
+ comment: ICDAR 2023 Best Student Paper Award. Code available at + https://github.com/jungomi/character-queries +
+
+
+
+
+ + ☆ A Multimodal Analysis of Influencer Content on Twitter AACL 2023 + + +
+ Influencer marketing involves a wide range of strategies in which brands +collaborate with popular content creators (i.e., influencers) to leverage their +reach, trust, and impact on their audience to promote and endorse products or +services. Because followers of influencers are more likely to buy a product +after receiving an authentic product endorsement rather than an explicit direct +product promotion, the line between personal opinions and commercial content +promotion is frequently blurred. This makes automatic detection of regulatory +compliance breaches related to influencer advertising (e.g., misleading +advertising or hidden sponsorships) particularly difficult. In this work, we +(1) introduce a new Twitter (now X) dataset consisting of 15,998 influencer +posts mapped into commercial and non-commercial categories for assisting in the +automatic detection of commercial influencer content; (2) experiment with an +extensive set of predictive models that combine text and visual information +showing that our proposed cross-attention approach outperforms state-of-the-art +multimodal models; and (3) conduct a thorough analysis of strengths and +limitations of our models. We show that multimodal modeling is useful for +identifying commercial posts, reducing the amount of false positives, and +capturing relevant context that aids in the discovery of undisclosed commercial +posts. + +
+
+ comment: Accepted at AACL 2023 +
+
+
+
+
+ + ☆ Prompt-based All-in-One Image Restoration using CNNs and Transformer + + +
+ Image restoration aims to recover high-quality images from their degraded
+observations. Since most existing methods have been dedicated to single
+degradation removal, they may not yield optimal results on other types of
+degradation, which does not satisfy the applications in real-world scenarios.
+In this paper, we propose a novel data ingredient-oriented approach that
+leverages prompt-based learning to enable a single model to efficiently tackle
+multiple image degradation tasks. Specifically, we utilize an encoder to
+capture features and introduce prompts with degradation-specific information to
+guide the decoder in adaptively recovering images affected by various
+degradations. In order to model the local invariant properties and non-local
+information for high-quality image restoration, we combine CNN operations and
+Transformers. Simultaneously, we make several key designs in the Transformer
+blocks (multi-head rearranged attention with prompts and a simple-gate
+feed-forward network) to reduce computational requirements and to selectively
+determine what information should be preserved to facilitate efficient recovery
+of potentially sharp images. Furthermore, we incorporate a feature fusion
+mechanism that further explores the multi-scale information to improve the
+aggregated features. Although the resulting tightly interlinked hierarchical
+architecture, named CAPTNet, is designed to handle different types of
+degradation, extensive experiments demonstrate that our method performs
+competitively with task-specific algorithms.
+
+
+
+
+ + ☆ Adaptive Growth: Real-time CNN Layer Expansion + + +
+ Deep Neural Networks (DNNs) have shown unparalleled achievements in numerous +applications, reflecting their proficiency in managing vast data sets. Yet, +their static structure limits their adaptability in ever-changing environments. +This research presents a new algorithm that allows the convolutional layer of a +Convolutional Neural Network (CNN) to dynamically evolve based on data input, +while still being seamlessly integrated into existing DNNs. Instead of a rigid +architecture, our approach iteratively introduces kernels to the convolutional +layer, gauging its real-time response to varying data. This process is refined +by evaluating the layer's capacity to discern image features, guiding its +growth. Remarkably, our unsupervised method has outstripped its supervised +counterparts across diverse datasets like MNIST, Fashion-MNIST, CIFAR-10, and +CIFAR-100. It also showcases enhanced adaptability in transfer learning +scenarios. By introducing a data-driven model scalability strategy, we are +filling a void in deep learning, leading to more flexible and efficient DNNs +suited for dynamic settings. +Code:(https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version). + +
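+
+ A minimal sketch of the layer-expansion step, showing how new kernels could be
+appended to an existing PyTorch Conv2d while keeping the already-trained
+filters; the criterion that decides when and how many kernels to add is not
+reproduced here:
+
+import torch
+import torch.nn as nn
+
+def grow_conv(conv, n_new):
+    # Build a wider convolution and copy the trained kernels into it, so only
+    # the newly added output channels start from a fresh initialization.
+    new = nn.Conv2d(conv.in_channels, conv.out_channels + n_new,
+                    conv.kernel_size, stride=conv.stride,
+                    padding=conv.padding, bias=conv.bias is not None)
+    with torch.no_grad():
+        new.weight[:conv.out_channels] = conv.weight
+        if conv.bias is not None:
+            new.bias[:conv.out_channels] = conv.bias
+    return new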
+
+ comment: Code: + https://github.com/YunjieZhu/Extensible-Convolutional-Layer-git-version +
+
+
+
+
+ + ☆ Exploring Semantic Consistency in Unpaired Image Translation to Generate + Data for Surgical Applications + + +
+ In surgical computer vision applications, obtaining labeled training data is +challenging due to data-privacy concerns and the need for expert annotation. +Unpaired image-to-image translation techniques have been explored to +automatically generate large annotated datasets by translating synthetic images +to the realistic domain. However, preserving the structure and semantic +consistency between the input and translated images presents significant +challenges, mainly when there is a distributional mismatch in the semantic +characteristics of the domains. This study empirically investigates unpaired +image translation methods for generating suitable data in surgical +applications, explicitly focusing on semantic consistency. We extensively +evaluate various state-of-the-art image translation models on two challenging +surgical datasets and downstream semantic segmentation tasks. We find that a +simple combination of structural-similarity loss and contrastive learning +yields the most promising results. Quantitatively, we show that the data +generated with this approach yields higher semantic consistency and can be used +more effectively as training data. + +
+
+
+
+
+ + ☆ Combining pre-trained Vision Transformers and CIDER for Out Of Domain + Detection + + +
+ Out-of-domain (OOD) detection is a crucial component in industrial applications,
+as it helps identify when a model encounters inputs that lie outside the training
+distribution. Most industrial pipelines rely on pre-trained models, such as CNNs or
+Vision Transformers, for downstream tasks. This paper investigates the performance
+of those models on the task of out-of-domain detection. Our experiments demonstrate
+that pre-trained transformer models achieve higher detection performance out of the
+box. Furthermore, we show that pre-trained ViTs and CNNs can be combined with
+refinement methods such as CIDER to further improve their OOD detection
+performance. Our results suggest that transformers are a promising approach for OOD
+detection and set a stronger baseline for this task in many contexts.
+
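+ Below is a hedged sketch of prototype-based OOD scoring on top of frozen
+backbone features, loosely in the spirit of CIDER's prototype embeddings; the exact
+scoring rule is our assumption rather than the paper's procedure.
+
+```python
+# Cosine-to-prototype OOD scoring over frozen ViT/CNN embeddings (sketch).
+import torch
+import torch.nn.functional as F
+
+def ood_score(features, prototypes):
+    """features: (N, D) test embeddings; prototypes: (K, D) class prototypes."""
+    f = F.normalize(features, dim=-1)
+    p = F.normalize(prototypes, dim=-1)
+    cos = f @ p.T                    # (N, K) cosine similarity to each class prototype
+    return -cos.max(dim=-1).values   # low maximum similarity -> high OOD score
+```
+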
+
+
+
+
+ + ☆ MCM: Multi-condition Motion Synthesis Framework for Multi-scenario + + +
+ The objective of the multi-condition human motion synthesis task is to
+incorporate diverse conditional inputs, encompassing various forms such as text,
+music, and speech. This endows the task with the ability to adapt across multiple
+scenarios, ranging from text-to-motion to music-to-dance, among others. While
+existing research has primarily focused on single conditions, multi-condition human
+motion generation remains underexplored. In this paper, we address these challenges
+by introducing MCM, a novel paradigm for motion synthesis that spans multiple
+scenarios under diverse conditions. The MCM framework can be integrated with any
+DDPM-like diffusion model to accommodate multi-conditional inputs while preserving
+its generative capabilities. Specifically, MCM employs a two-branch architecture
+consisting of a main branch and a control branch. The control branch shares the
+same structure as the main branch and is initialized with the parameters of the
+main branch, effectively maintaining the generation ability of the main branch
+while supporting multi-condition input. We also introduce a Transformer-based
+diffusion model, MWNet (DDPM-like), as our main branch, which captures the spatial
+complexity and inter-joint correlations in motion sequences through a
+channel-dimension self-attention module. Quantitative comparisons demonstrate that
+our approach achieves SoTA results in text-to-motion and competitive results in
+music-to-dance, comparable to task-specific methods. Furthermore, the qualitative
+evaluation shows that MCM not only streamlines the adaptation of methods originally
+designed for text-to-motion to domains such as music-to-dance and speech-to-gesture,
+eliminating the need for extensive network re-configuration, but also enables
+effective multi-condition modal control, realizing "once trained is motion need".
+
+
+
+
+
+ + ☆ SEAL: A Framework for Systematic Evaluation of Real-World + Super-Resolution + + +
+ Real-world Super-Resolution (real-SR) methods focus on dealing with diverse +real-world images and have attracted increasing attention in recent years. The +key idea is to use a complex and high-order degradation model to mimic +real-world degradations. Although they have achieved impressive results in +various scenarios, they are faced with the obstacle of evaluation. Currently, +these methods are only assessed by their average performance on a small set of +degradation cases randomly selected from a large space, which fails to provide +a comprehensive understanding of their overall performance and often yields +biased results. To overcome the limitation in evaluation, we propose SEAL, a +framework for systematic evaluation of real-SR. In particular, we cluster the +extensive degradation space to create a set of representative degradation +cases, which serves as a comprehensive test set. Next, we propose a +coarse-to-fine evaluation protocol to measure the distributed and relative +performance of real-SR methods on the test set. The protocol incorporates two +new metrics: acceptance rate (AR) and relative performance ratio (RPR), derived +from an acceptance line and an excellence line. Under SEAL, we benchmark +existing real-SR methods, obtain new observations and insights into their +performance, and develop a new strong baseline. We consider SEAL as the first +step towards creating an unbiased and comprehensive evaluation platform, which +can promote the development of real-SR. + +
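+ The AR and RPR metrics are defined relative to an acceptance line and an
+excellence line; the sketch below shows one plausible way to compute such metrics
+over representative degradation cases. The exact definitions used in SEAL may
+differ, so treat this purely as an illustration.
+
+```python
+# Illustrative acceptance-rate-style metrics over clustered degradation cases.
+import numpy as np
+
+def acceptance_rate(psnr_per_case, acceptance_line):
+    # fraction of representative degradation cases above the acceptance line
+    return float(np.mean(psnr_per_case >= acceptance_line))
+
+def relative_performance_ratio(psnr_per_case, acceptance_line, excellence_line):
+    # position of each case between the acceptance and excellence lines, averaged
+    rpr = (psnr_per_case - acceptance_line) / (excellence_line - acceptance_line)
+    return float(np.mean(rpr))
+
+cases = np.array([24.1, 26.7, 22.3, 28.0])
+print(acceptance_rate(cases, 23.0), relative_performance_ratio(cases, 23.0, 29.0))
+```
+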
+
+ comment: The source code is available at https://github.com/XPixelGroup/SEAL +
+
+
+
+
+ + ☆ Sparse 3D Reconstruction via Object-Centric Ray Sampling + + +
+ We propose a novel method for 3D object reconstruction from a sparse set of views
+captured with a 360-degree calibrated camera rig. We represent the object surface
+through a hybrid model that uses both an MLP-based neural representation and a
+triangle mesh. A key contribution of our work is a novel object-centric sampling
+scheme for the neural representation, in which rays are shared among all views.
+This efficiently concentrates and reduces the number of samples used to update the
+neural model at each iteration. The sampling scheme also relies on the mesh
+representation to ensure that samples are well-distributed along its normals.
+Rendering is then performed efficiently by a differentiable renderer. We
+demonstrate that this sampling scheme results in more effective training of the
+neural representation, does not require additional supervision from segmentation
+masks, yields state-of-the-art 3D reconstructions, and works with sparse views on
+the Google Scanned Objects, Tanks and Temples, and MVMC Car datasets.
+
+
+
+
+
+ + ☆ Vote2Cap-DETR++: Decoupling Localization and Describing for End-to-End + 3D Dense Captioning + + +
+ 3D dense captioning requires a model to translate its understanding of an +input 3D scene into several captions associated with different object regions. +Existing methods adopt a sophisticated "detect-then-describe" pipeline, which +builds explicit relation modules upon a 3D detector with numerous hand-crafted +components. While these methods have achieved initial success, the cascade +pipeline tends to accumulate errors because of duplicated and inaccurate box +estimations and messy 3D scenes. In this paper, we first propose Vote2Cap-DETR, +a simple-yet-effective transformer framework that decouples the decoding +process of caption generation and object localization through parallel +decoding. Moreover, we argue that object localization and description +generation require different levels of scene understanding, which could be +challenging for a shared set of queries to capture. To this end, we propose an +advanced version, Vote2Cap-DETR++, which decouples the queries into +localization and caption queries to capture task-specific features. +Additionally, we introduce the iterative spatial refinement strategy to vote +queries for faster convergence and better localization performance. We also +insert additional spatial information to the caption head for more accurate +descriptions. Without bells and whistles, extensive experiments on two commonly +used datasets, ScanRefer and Nr3D, demonstrate Vote2Cap-DETR and +Vote2Cap-DETR++ surpass conventional "detect-then-describe" methods by a large +margin. Codes will be made available at +https://github.com/ch3cook-fdu/Vote2Cap-DETR. + +
+
+
+
+
+ + ☆ Continual Evidential Deep Learning for Out-of-Distribution Detection ICCV2023 + + +
+ Uncertainty-based deep learning models have attracted a great deal of interest
+for their ability to provide accurate and reliable predictions. Evidential deep
+learning stands out by achieving remarkable performance in detecting
+out-of-distribution (OOD) data with a single deterministic neural network.
+Motivated by this fact, in this paper we propose integrating an evidential deep
+learning method into a continual learning framework in order to perform incremental
+object classification and OOD detection simultaneously. Moreover, we analyze the
+ability of vacuity and dissonance to differentiate between in-distribution data
+belonging to old classes and OOD data. The proposed method, called CEDL, is
+evaluated on CIFAR-100 considering two settings consisting of 5 and 10 tasks,
+respectively. The results show that the proposed method, in addition to providing
+classification results comparable to the baseline, largely outperforms several
+post-hoc OOD detection methods on three evaluation metrics: AUROC, AUPR and FPR95.
+
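+ Vacuity and dissonance are the standard subjective-logic uncertainty measures
+computed from per-class Dirichlet evidence; a small reference implementation of the
+usual formulas (which may differ in detail from CEDL's implementation) is:
+
+```python
+# Vacuity and dissonance from Dirichlet evidence (standard formulation, sketch).
+import numpy as np
+
+def vacuity_and_dissonance(evidence):
+    """evidence: non-negative per-class evidence e_k (e.g. ReLU of logits)."""
+    e = np.asarray(evidence, dtype=float)
+    K = len(e)
+    alpha = e + 1.0
+    S = alpha.sum()
+    belief = e / S
+    vacuity = K / S                         # high when total evidence is low
+    diss = 0.0
+    for k in range(K):
+        others = np.delete(belief, k)
+        if others.sum() > 0:
+            bal = 1.0 - np.abs(others - belief[k]) / (others + belief[k] + 1e-12)
+            diss += belief[k] * (others * bal).sum() / others.sum()
+    return vacuity, diss                    # dissonance: conflicting evidence among classes
+
+print(vacuity_and_dissonance([8.0, 7.5, 0.1, 0.0]))  # conflicting classes -> high dissonance
+```
+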
+
+ comment: Accepted at Visual Continual Learning workshop (ICCV2023) +
+
+
+
+
+ + ☆ FishMOT: A Simple and Effective Method for Fish Tracking Based on IoU + Matching + + +
+ The tracking of various fish species plays a profoundly significant role in +understanding the behavior of individual fish and their groups. Present +tracking methods suffer from issues of low accuracy or poor robustness. In +order to address these concerns, this paper proposes a novel tracking approach, +named FishMOT (Fish Multiple Object Tracking). This method combines object +detection techniques with the IoU matching algorithm, thereby achieving +efficient, precise, and robust fish detection and tracking. Diverging from +other approaches, this method eliminates the need for multiple feature +extractions and identity assignments for each individual, instead directly +utilizing the output results of the detector for tracking, thereby +significantly reducing computational time and storage space. Furthermore, this +method imposes minimal requirements on factors such as video quality and +variations in individual appearance. As long as the detector can accurately +locate and identify fish, effective tracking can be achieved. This approach +enhances robustness and generalizability. Moreover, the algorithm employed in +this method addresses the issue of missed detections without relying on complex +feature matching or graph optimization algorithms. This contributes to improved +accuracy and reliability. Experimental trials were conducted in the open-source +video dataset provided by idtracker.ai, and comparisons were made with +state-of-the-art detector-based multi-object tracking methods. Additionally, +comparisons were made with idtracker.ai and TRex, two tools that demonstrate +exceptional performance in the field of animal tracking. The experimental +results demonstrate that the proposed method outperforms other approaches in +various evaluation metrics, exhibiting faster speed and lower memory +requirements. The source codes and pre-trained models are available at: +https://github.com/gakkistar/FishMOT + +
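+ As a rough illustration of the IoU-matching association described above (a
+generic sketch under our own assumptions, not the released FishMOT code), the core
+step can be written as:
+
+```python
+# Minimal IoU-matching association between existing tracks and new detections.
+# Assumes non-empty inputs; boxes are (x1, y1, x2, y2).
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def iou(a, b):
+    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+    area_a = (a[2] - a[0]) * (a[3] - a[1])
+    area_b = (b[2] - b[0]) * (b[3] - b[1])
+    return inter / (area_a + area_b - inter + 1e-9)
+
+def associate(tracks, detections, iou_threshold=0.3):
+    cost = np.array([[1.0 - iou(t, d) for d in detections] for t in tracks])
+    rows, cols = linear_sum_assignment(cost)          # Hungarian matching on 1 - IoU
+    return [(r, c) for r, c in zip(rows, cols)
+            if 1.0 - cost[r, c] >= iou_threshold]     # keep only confident matches
+```
+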
+
+
+
+
+ + ☆ Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction ICCV 2023 + + +
+ Reconstructing both objects and hands in 3D from a single RGB image is complex.
+Existing methods rely on manually defined hand-object constraints in Euclidean
+space, leading to suboptimal feature learning. Compared with Euclidean space,
+hyperbolic space better preserves the geometric properties of meshes thanks to its
+exponentially growing distances, which amplify the differences between features
+based on their similarity. In this work, we propose the first precise hand-object
+reconstruction method in hyperbolic space, namely the Dynamic Hyperbolic Attention
+Network (DHANet), which leverages intrinsic properties of hyperbolic space to learn
+representative features. Our method projects mesh and image features into a unified
+hyperbolic space and comprises two modules, i.e., dynamic hyperbolic graph
+convolution and image-attention hyperbolic graph convolution. With these two
+modules, our method learns mesh features with rich geometry-image multi-modal
+information and models hand-object interaction more effectively. Our method
+provides a promising alternative for fine hand-object reconstruction in hyperbolic
+space. Extensive experiments on three public datasets demonstrate that our method
+outperforms most state-of-the-art methods.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Hierarchical-level rain image generative model based on GAN + + +
+ Autonomous vehicles are exposed to various weather during operation, which is +likely to trigger the performance limitations of the perception system, leading +to the safety of the intended functionality (SOTIF) problems. To efficiently +generate data for testing the performance of visual perception algorithms under +various weather conditions, a hierarchical-level rain image generative model, +rain conditional CycleGAN (RCCycleGAN), is constructed. RCCycleGAN is based on +the generative adversarial network (GAN) and can generate images of light, +medium, and heavy rain. Different rain intensities are introduced as labels in +conditional GAN (CGAN). Meanwhile, the model structure is optimized and the +training strategy is adjusted to alleviate the problem of mode collapse. In +addition, natural rain images of different intensities are collected and +processed for model training and validation. Compared with the two baseline +models, CycleGAN and DerainCycleGAN, the peak signal-to-noise ratio (PSNR) of +RCCycleGAN on the test dataset is improved by 2.58 dB and 0.74 dB, and the +structural similarity (SSIM) is improved by 18% and 8%, respectively. The +ablation experiments are also carried out to validate the effectiveness of the +model tuning. + +
+
+
+
+
+ + ☆ Indoor Localization Using Radio, Vision and Audio Sensors: Real-Life + Data Validation and Discussion + + +
+ This paper investigates indoor localization methods using radio, vision, and +audio sensors, respectively, in the same environment. The evaluation is based +on state-of-the-art algorithms and uses a real-life dataset. More specifically, +we evaluate a machine learning algorithm for radio-based localization with +massive MIMO technology, an ORB-SLAM3 algorithm for vision-based localization +with an RGB-D camera, and an SFS2 algorithm for audio-based localization with +microphone arrays. Aspects including localization accuracy, reliability, +calibration requirements, and potential system complexity are discussed to +analyze the advantages and limitations of using different sensors for indoor +localization tasks. The results can serve as a guideline and basis for further +development of robust and high-precision multi-sensory localization systems, +e.g., through sensor fusion and context and environment-aware adaptation. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ A Non-Invasive Interpretable NAFLD Diagnostic Method Combining TCM + Tongue Features + + +
+ Non-alcoholic fatty liver disease (NAFLD) is a clinicopathological syndrome
+characterized by hepatic steatosis once alcohol and other identifiable
+liver-damaging factors have been excluded. It has emerged as a leading cause of
+chronic liver disease worldwide. Currently, conventional methods for NAFLD
+detection are expensive and not suitable for daily self-diagnosis. To address this
+issue, this study proposes a non-invasive and interpretable NAFLD diagnostic method
+that requires only the following user-provided indicators: gender, age, height,
+weight, waist circumference, hip circumference, and a tongue image. The method
+merges patients' physiological indicators with tongue features, which are then
+input into a fusion network named SelectorNet. SelectorNet combines attention
+mechanisms with feature selection mechanisms, enabling it to autonomously learn to
+select important features. The experimental results show that the proposed method
+achieves an accuracy of 77.22% using only non-invasive data, and it also provides
+compelling interpretability matrices. This study contributes to the early diagnosis
+of NAFLD and the intelligent advancement of TCM tongue diagnosis. The project in
+this paper is available at: https://github.com/cshan-github/SelectorNet.
+
+
+
+
+
+ + ☆ M3D-NCA: Robust 3D Segmentation with Built-in Quality Control + + +
+ Medical image segmentation relies heavily on large-scale deep learning models,
+such as UNet-based architectures. However, the real-world utility of such models is
+limited by their high computational requirements, which makes them impractical for
+resource-constrained environments such as primary care facilities and conflict
+zones. Furthermore, shifts in the imaging domain can render these models
+ineffective and even compromise patient safety if such errors go undetected. To
+address these challenges, we propose M3D-NCA, a novel methodology that leverages
+Neural Cellular Automata (NCA) segmentation for 3D medical images using n-level
+patchification. Moreover, we exploit the variance in M3D-NCA to develop a novel
+quality metric which can automatically detect errors in the segmentation process of
+NCAs. M3D-NCA outperforms UNet models that are two orders of magnitude larger in
+hippocampus and prostate segmentation by 2% Dice and can be run on a Raspberry Pi 4
+Model B (2GB RAM). This highlights the potential of M3D-NCA as an effective and
+efficient alternative for medical image segmentation in resource-constrained
+environments.
+
+
+
+
+
+ + ☆ Patched Line Segment Learning for Vector Road Mapping + + +
+ This paper presents a novel approach to computing vector road maps from +satellite remotely sensed images, building upon a well-defined Patched Line +Segment (PaLiS) representation for road graphs that holds geometric +significance. Unlike prevailing methods that derive road vector representations +from satellite images using binary masks or keypoints, our method employs line +segments. These segments not only convey road locations but also capture their +orientations, making them a robust choice for representation. More precisely, +given an input image, we divide it into non-overlapping patches and predict a +suitable line segment within each patch. This strategy enables us to capture +spatial and structural cues from these patch-based line segments, simplifying +the process of constructing the road network graph without the necessity of +additional neural networks for connectivity. In our experiments, we demonstrate +how an effective representation of a road graph significantly enhances the +performance of vector road mapping on established benchmarks, without requiring +extensive modifications to the neural network architecture. Furthermore, our +method achieves state-of-the-art performance with just 6 GPU hours of training, +leading to a substantial 32-fold reduction in training costs in terms of GPU +hours. + +
+
+
+
+
+ + ☆ Towards Efficient Training with Negative Samples in Visual Tracking + + +
+ Current state-of-the-art (SOTA) methods in visual object tracking often require
+extensive computational resources and vast amounts of training data, leading to a
+risk of overfitting. This study introduces a more efficient training strategy to
+mitigate overfitting and reduce computational requirements. We balance the training
+process with a mix of negative and positive samples from the outset, which we name
+Joint learning with Negative samples (JN). Negative samples refer to scenarios
+where the object from the template is not present in the search region, which helps
+prevent the model from simply memorizing the target and instead encourages it to
+use the template for object localization. To handle the negative samples
+effectively, we adopt a distribution-based head, which models the bounding box as a
+distribution of distances to express uncertainty about the target's location in the
+presence of negative samples, offering an efficient way to manage the mixed-sample
+training. Furthermore, our approach introduces a target-indicating token that
+encapsulates the target's precise location within the template image. This provides
+exact boundary details at negligible computational cost while improving
+performance. Our model, JN-256, exhibits superior performance on challenging
+benchmarks, achieving 75.8% AO on GOT-10k and 84.1% AUC on TrackingNet. Notably,
+JN-256 outperforms previous SOTA trackers that utilize larger models and higher
+input resolutions, even though it is trained with only half the number of training
+samples used in those works.
+
+
+
+
+
+ + ☆ A Unified Framework for Discovering Discrete Symmetries + + +
+ We consider the problem of learning a function respecting a symmetry from +among a class of symmetries. We develop a unified framework that enables +symmetry discovery across a broad range of subgroups including locally +symmetric, dihedral and cyclic subgroups. At the core of the framework is a +novel architecture composed of linear and tensor-valued functions that +expresses functions invariant to these subgroups in a principled manner. The +structure of the architecture enables us to leverage multi-armed bandit +algorithms and gradient descent to efficiently optimize over the linear and the +tensor-valued functions, respectively, and to infer the symmetry that is +ultimately learnt. We also discuss the necessity of the tensor-valued functions +in the architecture. Experiments on image-digit sum and polynomial regression +tasks demonstrate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ MAD: Modality Agnostic Distance Measure for Image Registration + + +
+ Multi-modal image registration is a crucial pre-processing step in many medical
+applications. However, it is a challenging task due to the complex intensity
+relationships between different imaging modalities, which can result in large
+discrepancies in image appearance. The success of multi-modal image registration,
+whether conventional or learning-based, is predicated upon the choice of an
+appropriate distance (or similarity) measure. In particular, deep learning
+registration algorithms lack accuracy or even fail completely when attempting to
+register data from an "unseen" modality. In this work, we present Modality Agnostic
+Distance (MAD), a deep image distance measure that uses random convolutions to
+learn the inherent geometry of the images while being robust to large appearance
+changes. Random convolutions are geometry-preserving modules which we use to
+simulate an infinite number of synthetic modalities, alleviating the need for
+aligned paired data during training. We can therefore train MAD on a mono-modal
+dataset and successfully apply it to a multi-modal dataset. We demonstrate that not
+only can MAD affinely register multi-modal images successfully, but it also has a
+larger capture range than traditional measures such as Mutual Information and
+Normalised Gradient Fields.
+
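+ The sketch below approximates the random-convolution idea behind MAD: the same
+randomly drawn, geometry-preserving filters are applied to both images and their
+responses are compared. The actual learned measure and its training are not shown,
+and the function is our own illustrative construction.
+
+```python
+# Random-convolution-based image dissimilarity (hedged sketch, not MAD itself).
+import torch
+import torch.nn.functional as F
+
+def random_conv_distance(img_a, img_b, n_filters=16, kernel_size=3, n_draws=8, seed=0):
+    """img_a, img_b: (B, C, H, W) tensors. Returns a scalar dissimilarity."""
+    g = torch.Generator().manual_seed(seed)
+    c = img_a.shape[1]
+    total = 0.0
+    for _ in range(n_draws):
+        # geometry-preserving random filters shared by both images
+        w = torch.randn(n_filters, c, kernel_size, kernel_size, generator=g)
+        fa = F.conv2d(img_a, w, padding=kernel_size // 2)
+        fb = F.conv2d(img_b, w, padding=kernel_size // 2)
+        total += (fa - fb).abs().mean()
+    return total / n_draws
+```
+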
+
+
+
+
+ + ☆ Image Aesthetics Assessment via Learnable Queries + + +
+ Image aesthetics assessment (IAA) aims to estimate the aesthetics of images. +Depending on the content of an image, diverse criteria need to be selected to +assess its aesthetics. Existing works utilize pre-trained vision backbones +based on content knowledge to learn image aesthetics. However, training those +backbones is time-consuming and suffers from attention dispersion. Inspired by +learnable queries in vision-language alignment, we propose the Image Aesthetics +Assessment via Learnable Queries (IAA-LQ) approach. It adapts learnable queries +to extract aesthetic features from pre-trained image features obtained from a +frozen image encoder. Extensive experiments on real-world data demonstrate the +advantages of IAA-LQ, beating the best state-of-the-art method by 2.2% and 2.1% +in terms of SRCC and PLCC, respectively. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Bandwidth-efficient Inference for Neural Image Compression ICASSP 2024 + + +
+ With neural networks growing deeper and feature maps growing larger, limited
+communication bandwidth to external memory (or DRAM) and power constraints become a
+bottleneck when implementing network inference on mobile and edge devices. In this
+paper, we propose an end-to-end differentiable, bandwidth-efficient neural
+inference method in which activations are compressed by a neural data compression
+method. Specifically, we propose a transform-quantization-entropy coding pipeline
+for activation compression with symmetric exponential Golomb coding and a
+data-dependent Gaussian entropy model for arithmetic coding. Optimized together
+with existing model quantization methods, the low-level task of image compression
+can achieve up to a 19x bandwidth reduction with a 6.21x energy saving.
+
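+ For readers unfamiliar with the entropy code mentioned above, here is a small
+sketch of a signed (symmetric around zero) order-0 exponential Golomb coder; the
+paper's exact variant may differ, so this is only an illustration of the coding
+principle.
+
+```python
+# Signed order-0 exponential Golomb coding (illustrative sketch).
+def exp_golomb_encode(value: int) -> str:
+    # map a signed value to a non-negative index symmetrically around zero
+    n = 2 * value - 1 if value > 0 else -2 * value
+    binary = bin(n + 1)[2:]                    # binary of n + 1
+    return "0" * (len(binary) - 1) + binary    # prefix of leading zeros
+
+def exp_golomb_decode(bits: str) -> int:
+    zeros = len(bits) - len(bits.lstrip("0"))
+    n = int(bits[zeros:zeros + zeros + 1], 2) - 1
+    return (n + 1) // 2 if n % 2 == 1 else -(n // 2)
+
+for v in (-3, -1, 0, 1, 4):
+    code = exp_golomb_encode(v)
+    assert exp_golomb_decode(code) == v
+    print(v, code)
+```
+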
+
+ comment: 9 pages, 6 figures, submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Knowledge Distillation Layer that Lets the Student Decide BMVC 2023 + + +
+ A typical technique in knowledge distillation (KD) is to regularize the learning
+of a limited-capacity model (the student) by pushing its responses to match those
+of a powerful model (the teacher). Albeit useful, especially in the penultimate
+layer and beyond, its action on the student's feature transform is rather implicit,
+limiting its practice in the intermediate layers. To explicitly embed the teacher's
+knowledge in the feature transform, we propose a learnable KD layer for the student
+which improves KD with two distinct abilities: i) learning how to leverage the
+teacher's knowledge, enabling the student to discard nuisance information, and ii)
+feeding the transferred knowledge deeper into the network. Thus, the student enjoys
+the teacher's knowledge during inference as well as training. Formally, we
+repurpose a 1x1-BN-ReLU-1x1 convolution block to assign a semantic vector to each
+local region according to the template (supervised by the teacher) that the
+corresponding region of the student matches. To facilitate template learning in the
+intermediate layers, we propose a novel form of supervision based on the teacher's
+decisions. Through rigorous experimentation, we demonstrate the effectiveness of
+our approach on 3 popular classification benchmarks. Code is available at:
+https://github.com/adagorgun/letKD-framework
+
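+ A minimal sketch of the repurposed 1x1-BN-ReLU-1x1 block described above, used as
+a learnable layer inside the student, is given below; dimensions and naming are
+illustrative assumptions rather than the released code.
+
+```python
+# 1x1-BN-ReLU-1x1 block as a learnable knowledge-distillation layer (sketch).
+import torch.nn as nn
+
+class KDLayer(nn.Module):
+    def __init__(self, channels, num_templates):
+        super().__init__()
+        self.block = nn.Sequential(
+            nn.Conv2d(channels, num_templates, kernel_size=1),  # match local regions to templates
+            nn.BatchNorm2d(num_templates),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(num_templates, channels, kernel_size=1),  # embed chosen semantics back
+        )
+
+    def forward(self, x):
+        return self.block(x)  # transferred knowledge is also fed forward at inference
+```
+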
+
+ comment: Accepted at the British Machine Vision Conference 2023 (BMVC 2023) +
+
+
+
+
+ + ☆ Adjacency-hopping de Bruijn Sequences for Non-repetitive Coding + + +
+ A special type of cyclic sequence, named adjacency-hopping de Bruijn sequences,
+is introduced in this paper. The existence of such sequences is proved
+theoretically, and their number is derived. These sequences guarantee that all
+neighboring codes are different while retaining the uniqueness of subsequences,
+which is a significant characteristic of original de Bruijn sequences in coding and
+matching. Finally, the adjacency-hopping de Bruijn sequences are applied to
+structured light coding, and a color fringe pattern coded by such a sequence is
+presented. In summary, the proposed sequences demonstrate significant advantages in
+structured light coding by virtue of the uniqueness of subsequences and the
+adjacency-hopping characteristic, and show potential for extension to other fields
+with similar requirements of non-repetitive coding and efficient matching.
+
+
+
+
+
+ + ☆ Image-Object-Specific Prompt Learning for Few-Shot Class-Incremental + Learning + + +
+ While many FSCIL studies have been undertaken, achieving satisfactory +performance, especially during incremental sessions, has remained challenging. +One prominent challenge is that the encoder, trained with an ample base session +training set, often underperforms in incremental sessions. In this study, we +introduce a novel training framework for FSCIL, capitalizing on the +generalizability of the Contrastive Language-Image Pre-training (CLIP) model to +unseen classes. We achieve this by formulating image-object-specific (IOS) +classifiers for the input images. Here, an IOS classifier refers to one that +targets specific attributes (like wings or wheels) of class objects rather than +the image's background. To create these IOS classifiers, we encode a bias +prompt into the classifiers using our specially designed module, which +harnesses key-prompt pairs to pinpoint the IOS features of classes in each +session. From an FSCIL standpoint, our framework is structured to retain +previous knowledge and swiftly adapt to new sessions without forgetting or +overfitting. This considers the updatability of modules in each session and +some tricks empirically found for fast convergence. Our approach consistently +demonstrates superior performance compared to state-of-the-art methods across +the miniImageNet, CIFAR100, and CUB200 datasets. Further, we provide additional +experiments to validate our learned model's ability to achieve IOS classifiers. +We also conduct ablation studies to analyze the impact of each module within +the architecture. + +
+
+ comment: 8 pages, 4 figures, 4 tables +
+
+
+
+
+ + ☆ 3D Trajectory Reconstruction of Drones using a Single Camera + + +
+ Drones have been widely utilized in various fields, but the number of drones +being used illegally and for hazardous purposes has increased recently. To +prevent those illegal drones, in this work, we propose a novel framework for +reconstructing 3D trajectories of drones using a single camera. By leveraging +calibrated cameras, we exploit the relationship between 2D and 3D spaces. We +automatically track the drones in 2D images using the drone tracker and +estimate their 2D rotations. By combining the estimated 2D drone positions with +their actual length information and camera parameters, we geometrically infer +the 3D trajectories of the drones. To address the lack of public drone +datasets, we also create synthetic 2D and 3D drone datasets. The experimental +results show that the proposed methods accurately reconstruct drone +trajectories in 3D space, and demonstrate the potential of our framework for +single camera-based surveillance systems. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Improving diagnosis and prognosis of lung cancer using vision + transformers: A scoping review + + +
+ Vision transformer-based methods are advancing the field of medical artificial
+intelligence and cancer imaging, including lung cancer applications. Recently, many
+researchers have developed vision transformer-based AI methods for lung cancer
+diagnosis and prognosis. This scoping review aims to identify recent developments
+in vision transformer-based AI methods for lung cancer imaging applications. It
+provides key insights into how vision transformers complement the performance of AI
+and deep learning methods for lung cancer. Furthermore, the review also identifies
+the datasets that contributed to advancing the field. Of the 314 retrieved studies,
+this review included 34 studies published from 2020 to 2022. The most commonly
+addressed task in these studies was the classification of lung cancer types, such
+as lung squamous cell carcinoma versus lung adenocarcinoma, and identifying benign
+versus malignant pulmonary nodules. Other applications included survival prediction
+of lung cancer patients and segmentation of lungs. The studies lacked clear
+strategies for clinical transformation. The Swin transformer was a popular choice
+among researchers; however, many other architectures were also reported in which a
+vision transformer was combined with convolutional neural networks or a UNet model.
+It can be concluded that vision transformer-based models are increasing in
+popularity for developing AI methods for lung cancer applications. However, their
+computational complexity and clinical relevance are important factors to be
+considered for future research work. This review provides valuable insights for
+researchers in the field of AI and healthcare to advance the state-of-the-art in
+lung cancer diagnosis and prognosis. We provide an interactive dashboard on
+lung-cancer.onrender.com/.
+
+
+ comment: submitted to BMC Medical Imaging journal +
+
+
+
+
+ + ☆ LightNeuS: Neural Surface Reconstruction in Endoscopy using Illumination + Decline MICCAI 2023 + + +
+ We propose a new approach to 3D reconstruction from sequences of images acquired
+by monocular endoscopes. It is based on two key insights. First, endoluminal
+cavities are watertight, a property naturally enforced by modeling them in terms of
+a signed distance function. Second, the scene illumination is variable: it comes
+from the endoscope's light sources and decays with the inverse of the squared
+distance to the surface. To exploit these insights, we build on NeuS, a neural
+implicit surface reconstruction technique with an outstanding capability to learn
+appearance and an SDF surface model from multiple views, but currently limited to
+scenes with static illumination. To remove this limitation and exploit the relation
+between pixel brightness and depth, we modify the NeuS architecture to explicitly
+account for it and introduce a calibrated photometric model of the endoscope's
+camera and light source. Our method is the first to produce watertight
+reconstructions of whole colon sections. We demonstrate excellent accuracy on
+phantom imagery. Remarkably, the watertight prior combined with illumination
+decline allows the reconstruction of unseen portions of the surface to be completed
+with acceptable accuracy, paving the way to automatic quality assessment of cancer
+screening explorations by measuring the global percentage of observed mucosa.
+
+
+ comment: 12 pages, 7 figures, 1 table, submitted to MICCAI 2023 +
+
+
+
+
+ + ☆ Diffusion Model is Secretly a Training-free Open Vocabulary Semantic + Segmenter + + +
+ Recent research has explored the utilization of pre-trained text-image +discriminative models, such as CLIP, to tackle the challenges associated with +open-vocabulary semantic segmentation. However, it is worth noting that the +alignment process based on contrastive learning employed by these models may +unintentionally result in the loss of crucial localization information and +object completeness, which are essential for achieving accurate semantic +segmentation. More recently, there has been an emerging interest in extending +the application of diffusion models beyond text-to-image generation tasks, +particularly in the domain of semantic segmentation. These approaches utilize +diffusion models either for generating annotated data or for extracting +features to facilitate semantic segmentation. This typically involves training +segmentation models by generating a considerable amount of synthetic data or +incorporating additional mask annotations. To this end, we uncover the +potential of generative text-to-image conditional diffusion models as highly +efficient open-vocabulary semantic segmenters, and introduce a novel +training-free approach named DiffSegmenter. Specifically, by feeding an input +image and candidate classes into an off-the-shelf pre-trained conditional +latent diffusion model, the cross-attention maps produced by the denoising +U-Net are directly used as segmentation scores, which are further refined and +completed by the followed self-attention maps. Additionally, we carefully +design effective textual prompts and a category filtering mechanism to further +enhance the segmentation results. Extensive experiments on three benchmark +datasets show that the proposed DiffSegmenter achieves impressive results for +open-vocabulary semantic segmentation. + +
+
+
+
+
+ + ☆ MLN-net: A multi-source medical image segmentation method for clustered + microcalcifications using multiple layer normalization + + +
+ Accurate segmentation of clustered microcalcifications in mammography is crucial
+for the diagnosis and treatment of breast cancer. Despite exhibiting expert-level
+accuracy, recent deep learning advancements in medical image segmentation
+contribute insufficiently to practical applications, due to the domain shift
+resulting from differences in patient postures, individual gland density,
+mammography imaging modalities, etc. In this paper, a novel framework named
+MLN-net, which can accurately segment multi-source images using only single-source
+images, is proposed for clustered microcalcification segmentation. We first propose
+a source-domain image augmentation method to generate multi-source images, leading
+to improved generalization. A structure of multiple layer normalization (LN) layers
+is then used to construct the segmentation network, which proves efficient for
+clustered microcalcification segmentation in different domains. Additionally, a
+branch selection strategy is designed to measure the similarity between the source
+domain data and the target domain data. To validate the proposed MLN-net, extensive
+analyses are performed, including ablation experiments and comparisons with 12
+baseline methods. Extensive experiments validate the effectiveness of MLN-net in
+segmenting clustered microcalcifications from different domains, and its
+segmentation accuracy surpasses state-of-the-art methods. Code will be available at
+https://github.com/yezanting/MLN-NET-VERSON1.
+
+
+ comment: 17 pages, 9 figures, 3 tables +
+
+
+
+
+ + ☆ DMKD: Improving Feature-based Knowledge Distillation for Object + Detection Via Dual Masking Augmentation + + +
+ Recent mainstream masked distillation methods function by reconstructing +selectively masked areas of a student network from the feature map of its +teacher counterpart. In these methods, the masked regions need to be properly +selected, such that reconstructed features encode sufficient discrimination and +representation capability like the teacher feature. However, previous masked +distillation methods only focus on spatial masking, making the resulting masked +areas biased towards spatial importance without encoding informative channel +clues. In this study, we devise a Dual Masked Knowledge Distillation (DMKD) +framework which can capture both spatially important and channel-wise +informative clues for comprehensive masked feature reconstruction. More +specifically, we employ dual attention mechanism for guiding the respective +masking branches, leading to reconstructed feature encoding dual significance. +Furthermore, fusing the reconstructed features is achieved by self-adjustable +weighting strategy for effective feature distillation. Our experiments on +object detection task demonstrate that the student networks achieve performance +gains of 4.1% and 4.3% with the help of our method when RetinaNet and Cascade +Mask R-CNN are respectively used as the teacher networks, while outperforming +the other state-of-the-art distillation methods. + +
+
+
+
+
+ + ☆ SlAction: Non-intrusive, Lightweight Obstructive Sleep Apnea Detection + using Infrared Video ICCV + + +
+ Obstructive sleep apnea (OSA) is a prevalent sleep disorder affecting +approximately one billion people world-wide. The current gold standard for +diagnosing OSA, Polysomnography (PSG), involves an overnight hospital stay with +multiple attached sensors, leading to potential inaccuracies due to the +first-night effect. To address this, we present SlAction, a non-intrusive OSA +detection system for daily sleep environments using infrared videos. +Recognizing that sleep videos exhibit minimal motion, this work investigates +the fundamental question: "Are respiratory events adequately reflected in human +motions during sleep?" Analyzing the largest sleep video dataset of 5,098 +hours, we establish correlations between OSA events and human motions during +sleep. Our approach uses a low frame rate (2.5 FPS), a large size (60 seconds) +and step (30 seconds) for sliding window analysis to capture slow and long-term +motions related to OSA. Furthermore, we utilize a lightweight deep neural +network for resource-constrained devices, ensuring all video streams are +processed locally without compromising privacy. Evaluations show that SlAction +achieves an average F1 score of 87.6% in detecting OSA across various +environments. Implementing SlAction on NVIDIA Jetson Nano enables real-time +inference (~3 seconds for a 60-second video clip), highlighting its potential +for early detection and personalized treatment of OSA. + +
+
+ comment: Accepted to ICCV CVAMD 2023, poster +
+
+
+
+
+ + ☆ Gene-induced Multimodal Pre-training for Image-omic Classification + + +
+ Histology analysis of the tumor micro-environment integrated with genomic assays
+is the gold standard for most cancers in modern medicine. This paper proposes a
+Gene-induced Multimodal Pre-training (GiMP) framework, which jointly incorporates
+genomics and Whole Slide Images (WSIs) for classification tasks. Our work aims to
+address the main challenges of multi-modality image-omic classification, namely (1)
+the difficulty of extracting patient-level features from gigapixel WSIs and tens of
+thousands of genes, and (2) effective fusion considering high-order relevance
+modeling. Concretely, we first propose a group multi-head self-attention gene
+encoder to capture global structured features in gene expression cohorts. We design
+a masked patch modeling paradigm (MPM) to capture the latent pathological
+characteristics of different tissues. The masking strategy randomly masks a
+fixed-length contiguous subsequence of the patch embeddings of a WSI. Finally, we
+combine the classification tokens of the paired modalities and propose a triplet
+learning module to learn high-order relevance and discriminative patient-level
+information. After pre-training, simple fine-tuning can be adopted to obtain the
+classification results. Experimental results on the TCGA dataset show the
+superiority of our network architecture and our pre-training framework, achieving
+99.47% accuracy for image-omic classification. The code is publicly available at
+https://github.com/huangwudiduan/GIMP.
+
+
+
+
+
+ + ☆ A Joint Study of Phrase Grounding and Task Performance in Vision and + Language Models + + +
+ Key to tasks that require reasoning about natural language in visual contexts is
+grounding words and phrases to image regions. However, observing this grounding in
+contemporary models is complex, even if it is generally expected to take place if
+the task is addressed in a way that is conducive to generalization. We propose a
+framework to jointly study task performance and phrase grounding, and propose three
+benchmarks to study the relation between the two. Our results show that
+contemporary models demonstrate inconsistency between their ability to ground
+phrases and to solve tasks. We show how this can be addressed through brute-force
+training on phrase grounding annotations, and analyze the dynamics it creates. Code
+and data are available at https://github.com/lil-lab/phrase_grounding.
+
+
+
+
+
+ + ☆ Improving Image Classification of Knee Radiographs: An Automated Image + Labeling Approach + + +
+ Large numbers of radiographic images are available in knee radiology +practices which could be used for training of deep learning models for +diagnosis of knee abnormalities. However, those images do not typically contain +readily available labels due to limitations of human annotations. The purpose +of our study was to develop an automated labeling approach that improves the +image classification model to distinguish normal knee images from those with +abnormalities or prior arthroplasty. The automated labeler was trained on a +small set of labeled data to automatically label a much larger set of unlabeled +data, further improving the image classification performance for knee +radiographic diagnosis. We developed our approach using 7,382 patients and +validated it on a separate set of 637 patients. The final image classification +model, trained using both manually labeled and pseudo-labeled data, had the +higher weighted average AUC (WAUC: 0.903) value and higher AUC-ROC values among +all classes (normal AUC-ROC: 0.894; abnormal AUC-ROC: 0.896, arthroplasty +AUC-ROC: 0.990) compared to the baseline model (WAUC=0.857; normal AUC-ROC: +0.842; abnormal AUC-ROC: 0.848, arthroplasty AUC-ROC: 0.987), trained using +only manually labeled data. DeLong tests show that the improvement is +significant on normal (p-value<0.002) and abnormal (p-value<0.001) images. Our +findings demonstrated that the proposed automated labeling approach +significantly improves the performance of image classification for radiographic +knee diagnosis, allowing for facilitating patient care and curation of large +knee datasets. + +
+
+ comment: This is the preprint version +
+
+
+
+
+ + ☆ Efficient Training for Visual Tracking with Deformable Transformer + + +
+ Recent Transformer-based visual tracking models have showcased superior
+performance. Nevertheless, prior works have been resource-intensive, requiring
+prolonged GPU training hours and incurring high GFLOPs during inference due to
+inefficient training methods and convolution-based target heads. This intensive
+resource use renders them unsuitable for real-world applications. In this paper, we
+present DETRack, a streamlined end-to-end visual object tracking framework. Our
+framework utilizes an efficient encoder-decoder structure in which a deformable
+transformer decoder acts as the target head and achieves higher sparsity than
+traditional convolution heads, resulting in decreased GFLOPs. For training, we
+introduce a novel one-to-many label assignment and an auxiliary denoising
+technique, significantly accelerating the model's convergence. Comprehensive
+experiments affirm the effectiveness and efficiency of our proposed method. For
+instance, DETRack achieves 72.9% AO on the challenging GOT-10k benchmark using only
+20% of the training epochs required by the baseline, and runs with lower GFLOPs
+than all the transformer-based trackers.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2303.16580 by other authors +
+
+
+
+
+ + ☆ Progressive Attention Guidance for Whole Slide Vulvovaginal Candidiasis + Screening MICCAI 2023 + + +
+ Vulvovaginal candidiasis (VVC) is the most prevalent human candidal infection,
+estimated to afflict approximately 75% of all women at least once in their
+lifetime. It leads to several symptoms including pruritus, vaginal soreness, and so
+on. Automatic whole slide image (WSI) classification is in high demand, given the
+huge burden of disease control and prevention. However, a WSI-based computer-aided
+VVC screening method is still lacking due to the scarcity of labeled data and the
+unique properties of candida. Candida in WSIs is challenging for conventional
+classification models to capture due to its distinctive elongated shape, the small
+proportion of its spatial distribution, and the style gap between WSIs. To make it
+easier for the model to focus on the candida, we propose an attention-guided
+method, which can obtain a robust diagnostic classification model. Specifically, we
+first use a pre-trained detection model as prior instruction to initialize the
+classification model. Then we design a Skip Self-Attention module to refine the
+attention onto the fine-grained features of candida. Finally, we use a contrastive
+learning method to alleviate the overfitting caused by the style gap of WSIs and
+suppress attention to false positive regions. Our experimental results demonstrate
+that our framework achieves state-of-the-art performance. Code and example data are
+available at https://github.com/cjdbehumble/MICCAI2023-VVC-Screening.
+
+
+ comment: Accepted in the main conference MICCAI 2023 +
+
+
+
+
+ + ☆ Fast and Resource-Efficient Object Tracking on Edge Devices: A + Measurement Study + + +
+ Object tracking is an important functionality of edge video analytic systems +and services. Multi-object tracking (MOT) detects the moving objects and tracks +their locations frame by frame as real scenes are being captured into a video. +However, it is well known that real time object tracking on the edge poses +critical technical challenges, especially with edge devices of heterogeneous +computing resources. This paper examines the performance issues and +edge-specific optimization opportunities for object tracking. We will show that +even the well trained and optimized MOT model may still suffer from random +frame dropping problems when edge devices have insufficient computation +resources. We present several edge specific performance optimization +strategies, collectively coined as EMO, to speed up the real time object +tracking, ranging from window-based optimization to similarity based +optimization. Extensive experiments on popular MOT benchmarks demonstrate that +our EMO approach is competitive with respect to the representative methods for +on-device object tracking techniques in terms of run-time performance and +tracking accuracy. EMO is released on Github at +https://github.com/git-disl/EMO. + +
+
+
+
+
+ + ☆ Multiclass Alignment of Confidence and Certainty for Network Calibration + + +
+ Deep neural networks (DNNs) have made great strides in pushing the +state-of-the-art in several challenging domains. Recent studies reveal that +they are prone to making overconfident predictions. This greatly reduces the +overall trust in model predictions, especially in safety-critical applications. +Early work in improving model calibration employs post-processing techniques +which rely on limited parameters and require a hold-out set. Some recent +train-time calibration methods, which involve all model parameters, can +outperform the postprocessing methods. To this end, we propose a new train-time +calibration method, which features a simple, plug-and-play auxiliary loss known +as multi-class alignment of predictive mean confidence and predictive certainty +(MACC). It is based on the observation that a model miscalibration is directly +related to its predictive certainty, so a higher gap between the mean +confidence and certainty amounts to a poor calibration both for in-distribution +and out-of-distribution predictions. Armed with this insight, our proposed loss +explicitly encourages a confident (or underconfident) model to also provide a +low (or high) spread in the presoftmax distribution. Extensive experiments on +ten challenging datasets, covering in-domain, out-domain, non-visual +recognition and medical image classification scenarios, show that our method +achieves state-of-the-art calibration performance for both in-domain and +out-domain predictions. Our code and models will be publicly released. + +
+
+ comment: Accepted at GCPR 2023 +
+
+
+
+
+ + ☆ Distribution-Aware Prompt Tuning for Vision-Language Models ICCV2023 + + +
+ Pre-trained vision-language models (VLMs) have shown impressive performance +on various downstream tasks by utilizing knowledge learned from large data. In +general, the performance of VLMs on target tasks can be further improved by +prompt tuning, which adds context to the input image or text. By leveraging +data from target tasks, various prompt-tuning methods have been studied in the +literature. A key to prompt tuning is the feature space alignment between two +modalities via learnable vectors with model parameters fixed. We observed that +the alignment becomes more effective when embeddings of each modality are +`well-arranged' in the latent space. Inspired by this observation, we proposed +distribution-aware prompt tuning (DAPT) for vision-language models, which is +simple yet effective. Specifically, the prompts are learned by maximizing +inter-dispersion, the distance between classes, as well as minimizing the +intra-dispersion measured by the distance between embeddings from the same +class. Our extensive experiments on 11 benchmark datasets demonstrate that our +method significantly improves generalizability. The code is available at +https://github.com/mlvlab/DAPT. + +
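+ One plausible reading of the inter-/intra-dispersion objective is sketched below
+on generic embeddings; the exact losses and weighting used by DAPT may differ, so
+this is an approximation of the idea rather than the method itself.
+
+```python
+# Dispersion-style regularizers on prompt-induced embeddings (illustrative sketch).
+import torch
+
+def dispersion_losses(embeddings, labels):
+    """embeddings: (N, D); labels: (N,) integer class ids."""
+    classes = labels.unique()
+    centers = torch.stack([embeddings[labels == c].mean(0) for c in classes])
+    # intra-dispersion: pull samples toward their class center (to be minimized)
+    intra = torch.stack([(embeddings[labels == c] - centers[i]).norm(dim=1).mean()
+                         for i, c in enumerate(classes)]).mean()
+    # inter-dispersion: push class centers apart (to be maximized, hence negated)
+    dists = torch.cdist(centers, centers)
+    k = len(classes)
+    inter = dists.sum() / (k * (k - 1)) if k > 1 else dists.new_zeros(())
+    return intra, -inter   # an auxiliary loss could combine them, e.g. intra - inter
+```
+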
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ Reasonable Anomaly Detection in Long Sequences + + +
+ Video anomaly detection is a challenging task due to the lack of approaches for
+representing samples. The visual representations of most existing approaches are
+limited to short-term sequences of observations, which cannot provide enough clues
+for reasonable detections. In this paper, we propose to represent the motion
+patterns of objects more completely by learning from long-term sequences. First, a
+Stacked State Machine (SSM) model is proposed to represent the temporal
+dependencies which are consistent across long-range observations. The SSM model
+then predicts future states based on past ones; the divergence between predictions
+following inherent normal patterns and the observed states identifies anomalies
+that violate normal motion patterns. Extensive experiments are carried out to
+evaluate the proposed approach alongside existing ones. Improvements over
+state-of-the-art methods can be observed. Our code is available at
+https://github.com/AllenYLJiang/Anomaly-Detection-in-Sequences.
+
+
+ comment: 8 pages, 1 figure +
+
+
+
+
+ + ☆ A novel method for iris recognition using BP neural network and parallel + computing by the aid of GPUs (Graphics Processing Units) + + +
+ In this paper, we present a new method for designing an iris recognition system.
+In this method, Haar wavelet features are first extracted from iris images. The
+advantage of using these features is their high extraction speed, as well as being
+unique to each iris. A back-propagation neural network (BPNN) is then used as the
+classifier. In this system, parallel BPNN algorithms implemented on GPUs with the
+aid of CUDA are used to speed up the learning process. Finally, we present the
+system's performance and the speedup achieved compared with running the algorithm
+serially.
+
+
+ comment: 8 pages, +
+
+
+
+
+ + ☆ Kidney abnormality segmentation in thorax-abdomen CT scans + + +
+ In this study, we introduce a deep learning approach for segmenting kidney +parenchyma and kidney abnormalities to support clinicians in identifying and +quantifying renal abnormalities such as cysts, lesions, masses, metastases, and +primary tumors. Our end-to-end segmentation method was trained on 215 +contrast-enhanced thoracic-abdominal CT scans, with half of these scans +containing one or more abnormalities. + We began by implementing our own version of the original 3D U-Net network and +incorporated four additional components: an end-to-end multi-resolution +approach, a set of task-specific data augmentations, a modified loss function +using top-$k$, and spatial dropout. Furthermore, we devised a tailored +post-processing strategy. Ablation studies demonstrated that each of the four +modifications enhanced kidney abnormality segmentation performance, while three +out of four improved kidney parenchyma segmentation. Subsequently, we trained +the nnUNet framework on our dataset. By ensembling the optimized 3D U-Net and +the nnUNet with our specialized post-processing, we achieved marginally +superior results. + Our best-performing model attained Dice scores of 0.965 and 0.947 for +segmenting kidney parenchyma in two test sets (20 scans without abnormalities +and 30 with abnormalities), outperforming an independent human observer who +scored 0.944 and 0.925, respectively. In segmenting kidney abnormalities within +the 30 test scans containing them, the top-performing method achieved a Dice +score of 0.585, while an independent second human observer reached a score of +0.664, suggesting potential for further improvement in computerized methods. + All training data is available to the research community under a CC-BY 4.0 +license on https://doi.org/10.5281/zenodo.8014289 + +
+
+
+
+
+ + ☆ Active shooter detection and robust tracking utilizing supplemental + synthetic data + + +
+ The increasing concern surrounding gun violence in the United States has led
+to a focus on developing systems to improve public safety. One approach to
+developing such a system is to detect and track shooters, which would help
+prevent or mitigate the impact of violent incidents. In this paper, we propose
+detecting shooters as a whole, rather than just guns, which would allow for
+improved tracking robustness, as obscuring the gun would no longer cause the
+system to lose sight of the threat. However, publicly available data on
+shooters is much more limited and challenging to create than a gun dataset
+alone. Therefore, we explore the use of domain randomization and transfer
+learning to improve the effectiveness of training with synthetic data obtained
+from Unreal Engine environments. This enables the model to be trained on a
+wider range of data, increasing its ability to generalize to different
+situations. Using these techniques with YOLOv8 and Deep OC-SORT, we implemented
+an initial version of a shooter tracking system capable of running on edge
+hardware, including both a Raspberry Pi and a Jetson Nano.
+
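+
+ A short sketch of the detection side of such a pipeline, assuming the
+ ultralytics YOLOv8 Python interface; the weight file name and the
+ update_tracker helper are hypothetical placeholders (a real system would
+ feed the boxes into a tracker such as Deep OC-SORT).
+
+```python
+import cv2
+from ultralytics import YOLO
+
+model = YOLO("shooter_yolov8n.pt")   # hypothetical weights fine-tuned on synthetic data
+
+def update_tracker(boxes, frame):
+    """Placeholder for a Deep OC-SORT style tracker update."""
+    pass
+
+cap = cv2.VideoCapture(0)
+while cap.isOpened():
+    ok, frame = cap.read()
+    if not ok:
+        break
+    results = model.predict(frame, imgsz=640, conf=0.5, verbose=False)
+    boxes = results[0].boxes.xyxy.cpu().numpy()   # whole-person shooter boxes
+    update_tracker(boxes, frame)                  # keep identities stable across frames
+```
+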
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Self-Supervised Masked Digital Elevation Models Encoding for + Low-Resource Downstream Tasks + + +
+ The lack of quality labeled data is one of the main bottlenecks for training
+Deep Learning models. As the task increases in complexity, there is a higher
+penalty for overfitting and unstable learning. The typical paradigm employed
+today is Self-Supervised learning, where the model attempts to learn from a
+large corpus of unstructured and unlabeled data and then transfer that
+knowledge to the required task. Some notable examples of self-supervision in
+other modalities are BERT for Large Language Models, Wav2Vec for Speech
+Recognition, and the Masked AutoEncoder for Vision, which all utilize
+Transformers to solve a masked prediction task. GeoAI is uniquely poised to
+take advantage of the self-supervised methodology due to the decades of data
+collected, little of which is precisely and dependably annotated. Our goal is
+to extract building and road segmentations from Digital Elevation Models (DEM)
+that provide a detailed topography of the earth's surface. The proposed
+architecture is the Masked Autoencoder pre-trained on ImageNet (with the
+limitation that there is a large domain discrepancy between ImageNet and DEM)
+with a UperNet head for decoding segmentations. We tested this model with only
+450 and 50 training images, utilizing roughly 5% and 0.5% of the original data
+respectively. On the building segmentation task, this model obtains an 82.1%
+Intersection over Union (IoU) with 450 images and 69.1% IoU with only 50
+images. On the more challenging road detection task, the model obtains an 82.7%
+IoU with 450 images and 73.2% IoU with only 50 images. Any hand-labeled dataset
+made today about the earth's surface will be immediately obsolete due to the
+constantly changing nature of the landscape. This motivates the clear necessity
+for data-efficient learners that can be used for a wide variety of downstream
+tasks.
+
+
+
+
+
+ + ☆ ViewMix: Augmentation for Robust Representation in Self-Supervised + Learning + + +
+ Joint Embedding Architecture-based self-supervised learning methods have
+attributed the composition of data augmentations as a crucial factor for their
+strong representation learning capabilities. While regional dropout strategies
+have proven to guide models to focus on lesser indicative parts of the objects
+in supervised methods, they have not been adopted by self-supervised methods
+for generating positive pairs. This is because regional dropout methods are not
+suitable for the input sampling process of the self-supervised methodology.
+Whereas dropping informative pixels from the positive pairs can result in
+inefficient training, replacing patches of a specific object with a different
+one can steer the model away from maximizing the agreement between different
+positive pairs. Moreover, joint embedding representation learning methods have
+not made robustness their primary training outcome. To this end, we propose the
+ViewMix augmentation policy, specially designed for self-supervised learning:
+upon generating different views of the same image, patches are cut and pasted
+from one view to another. By leveraging the different views created by this
+augmentation strategy, multiple joint embedding-based self-supervised
+methodologies obtained better localization capability and consistently
+outperformed their corresponding baseline methods. It is also demonstrated that
+incorporating the ViewMix augmentation policy promotes robustness of the
+representations in the state-of-the-art methods. Furthermore, our
+experimentation and analysis of compute times suggest that ViewMix augmentation
+does not introduce any additional overhead compared to other counterparts.
+
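+
+ A rough sketch of the cut-and-paste operation described above, written for a
+ joint-embedding (positive-pair) setting; the Beta-distributed box sampling is
+ an assumption borrowed from CutMix-style augmentations, not necessarily the
+ authors' exact scheme.
+
+```python
+import torch
+
+def viewmix(view_a, view_b, alpha=1.0):
+    """Paste a random rectangular patch from view_b into view_a.
+    Both views are augmented versions of the *same* images, so no label
+    mixing is needed for the positive-pair setting.  Inputs: (B, C, H, W)."""
+    B, C, H, W = view_a.shape
+    lam = torch.distributions.Beta(alpha, alpha).sample().item()
+    cut_h, cut_w = int(H * (1 - lam) ** 0.5), int(W * (1 - lam) ** 0.5)
+    cy, cx = torch.randint(H, (1,)).item(), torch.randint(W, (1,)).item()
+    y1, y2 = max(cy - cut_h // 2, 0), min(cy + cut_h // 2, H)
+    x1, x2 = max(cx - cut_w // 2, 0), min(cx + cut_w // 2, W)
+    mixed = view_a.clone()
+    mixed[:, :, y1:y2, x1:x2] = view_b[:, :, y1:y2, x1:x2]
+    return mixed
+```
+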
+
+
+
+
+ + ☆ Source Camera Identification and Detection in Digital Videos through + Blind Forensics + + +
+ Source camera identification in digital videos is the problem of associating
+an unknown digital video with its source device, within a closed set of
+possible devices. Existing techniques for source detection of digital videos
+try to find a fingerprint of the actual source in the video in the form of PRNU
+(Photo Response Non-Uniformity) and match it against the SPN (Sensor Pattern
+Noise) of each possible device; the highest correlation indicates the correct
+source. We investigate the problem of identifying a video source through a
+feature-based approach using machine learning. In this paper, we present a
+blind forensic technique for video source authentication and identification,
+based on feature extraction, feature selection and subsequent source
+classification. The main aim is to determine whether a claimed source for a
+video is actually its original source. If not, we identify its original source.
+Our experimental results demonstrate the efficiency of the proposed method
+compared to the traditional fingerprint-based technique.
+
+
+ comment: Submitted to IEEE for inclusion in Xplore Digital Library. Paper
+ presented at the International Conference on Recent Trends in Computational
+ Engineering & Technologies (ICRTCET 18) with Paper Id: ICRTCET-227
+
+
+
+
+ + ☆ Using Neural Networks for Fast SAR Roughness Estimation of High + Resolution Images + + +
+ The analysis of Synthetic Aperture Radar (SAR) imagery is an important step
+in remote sensing applications, and it is a challenging problem due to its
+inherent speckle noise. One typical solution is to model the data using the
+$G_I^0$ distribution and extract its roughness information, which in turn can
+be used in posterior imaging tasks, such as segmentation, classification and
+interpretation. This leads to the need for quick and reliable estimation of the
+roughness parameter from SAR data, especially with high resolution images.
+Unfortunately, traditional parameter estimation procedures are slow and prone
+to estimation failures. In this work, we propose a neural network-based
+estimation framework that first learns how to predict underlying parameters of
+$G_I^0$ samples and then can be used to estimate the roughness of unseen data.
+We show that this approach leads to an estimator that is quicker, yields less
+estimation error and is less prone to failures than the traditional estimation
+procedures for this problem, even when we use a simple network. More
+importantly, we show that this same methodology can be generalized to handle
+image inputs and, even if trained on purely synthetic data for a few seconds,
+is able to perform real time pixel-wise roughness estimation for high
+resolution real SAR imagery.
+
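+
+ A rough sketch of the "learn the estimator from synthetic samples" idea: the
+ $G_I^0$ sampler below is only a placeholder (Gamma speckle times inverse-Gamma
+ texture), and the summary statistics, network size, and training loop are all
+ illustrative assumptions rather than the paper's design.
+
+```python
+import torch
+import torch.nn as nn
+
+def sample_gi0(alpha, gamma, looks, n):
+    """Placeholder draw of n G_I^0-like intensities (alpha < 0): product of
+    Gamma-distributed speckle and an inverse-Gamma texture term."""
+    speckle = torch.distributions.Gamma(looks, looks).sample((n,))
+    texture = 1.0 / torch.distributions.Gamma(-alpha, gamma).sample((n,))
+    return speckle * texture
+
+class RoughnessNet(nn.Module):
+    def __init__(self, n_stats=3):
+        super().__init__()
+        self.net = nn.Sequential(nn.Linear(n_stats, 32), nn.ReLU(),
+                                 nn.Linear(32, 1))
+    def forward(self, x):            # x: per-patch summary statistics
+        return self.net(x)
+
+def make_example():
+    alpha = -(1.0 + 9.0 * torch.rand(1))          # roughness in (-10, -1)
+    z = sample_gi0(alpha.item(), 1.0, looks=4.0, n=1024)
+    stats = torch.stack([z.mean(), z.var(), z.log().mean()])
+    return stats, alpha
+
+net = RoughnessNet()
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+for _ in range(1000):                              # quick purely synthetic training
+    stats, alpha = make_example()
+    loss = (net(stats) - alpha).pow(2).mean()
+    opt.zero_grad(); loss.backward(); opt.step()
+```
+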
+
+
+
+
+ + ☆ SADIR: Shape-Aware Diffusion Models for 3D Image Reconstruction MICCAI 2023 + + +
+ 3D image reconstruction from a limited number of 2D images has been a +long-standing challenge in computer vision and image analysis. While deep +learning-based approaches have achieved impressive performance in this area, +existing deep networks often fail to effectively utilize the shape structures +of objects presented in images. As a result, the topology of reconstructed +objects may not be well preserved, leading to the presence of artifacts such as +discontinuities, holes, or mismatched connections between different parts. In +this paper, we propose a shape-aware network based on diffusion models for 3D +image reconstruction, named SADIR, to address these issues. In contrast to +previous methods that primarily rely on spatial correlations of image +intensities for 3D reconstruction, our model leverages shape priors learned +from the training data to guide the reconstruction process. To achieve this, we +develop a joint learning network that simultaneously learns a mean shape under +deformation models. Each reconstructed image is then considered as a deformed +variant of the mean shape. We validate our model, SADIR, on both brain and +cardiac magnetic resonance images (MRIs). Experimental results show that our +method outperforms the baselines with lower reconstruction error and better +preservation of the shape structure of objects within the images. + +
+
+ comment: ShapeMI MICCAI 2023: Workshop on Shape in Medical Imaging +
+
+
+
+
+ + ☆ Expert Uncertainty and Severity Aware Chest X-Ray Classification by + Multi-Relationship Graph Learning + + +
+ Patients undergoing chest X-rays (CXR) often endure multiple lung diseases.
+When evaluating a patient's condition, radiologists may remain uncertain even
+after long-term clinical training and professional guidance, due to the complex
+pathologies, the subtle texture changes of different lung lesions in images,
+and differences in patient condition; this introduces considerable noise when
+extracting disease labels from CXR reports. In this paper, we re-extract
+disease labels from CXR reports to make them more realistic by considering
+disease severity and uncertainty in classification. Our contributions are as
+follows: 1. We re-extracted the disease labels with severity and uncertainty by
+a rule-based approach with keywords discussed with clinical experts. 2. To
+further improve the explainability of chest X-ray diagnosis, we designed a
+multi-relationship graph learning method with an expert uncertainty-aware loss
+function. 3. Our multi-relationship graph learning method can also interpret
+the disease classification results. Our experimental results show that models
+considering disease severity and uncertainty outperform previous
+state-of-the-art methods.
+
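+
+ A minimal sketch of what an expert uncertainty-aware loss can look like; the
+ exact weighting used in the paper is not given here, so the down-weighting of
+ uncertain labels and the severity scaling below are illustrative assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def uncertainty_aware_bce(logits, labels, uncertainty, severity):
+    """logits/labels: (B, num_diseases); uncertainty in [0, 1] marks how
+    unsure the report-derived label is; severity >= 1 up-weights severe
+    findings.  Uncertain labels contribute less to the loss."""
+    per_label = F.binary_cross_entropy_with_logits(logits, labels, reduction="none")
+    weights = (1.0 - uncertainty) * severity
+    return (weights * per_label).mean()
+```
+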
+
+
+
+
+ + ☆ MEGANet: Multi-Scale Edge-Guided Attention Network for Weak Boundary + Polyp Segmentation + + +
+ Efficient polyp segmentation in healthcare plays a critical role in enabling
+early diagnosis of colorectal cancer. However, the segmentation of polyps
+presents numerous challenges, including the intricate distribution of
+backgrounds, variations in polyp sizes and shapes, and indistinct boundaries.
+Defining the boundary between the foreground (i.e., the polyp itself) and the
+background (the surrounding tissue) is difficult. To mitigate these challenges,
+we propose the Multi-Scale Edge-Guided Attention Network (MEGANet), tailored
+specifically for polyp segmentation within colonoscopy images. This network
+draws inspiration from the fusion of a classical edge detection technique with
+an attention mechanism. By combining these techniques, MEGANet effectively
+preserves high-frequency information, notably edges and boundaries, which tend
+to erode as neural networks deepen. MEGANet is designed as an end-to-end
+framework, encompassing three key modules: an encoder, which is responsible for
+capturing and abstracting the features from the input image; a decoder, which
+focuses on salient features; and the Edge-Guided Attention module (EGA), which
+employs the Laplacian operator to accentuate polyp boundaries. Extensive
+experiments, both qualitative and quantitative, on five benchmark datasets
+demonstrate that our MEGANet outperforms other existing SOTA methods under six
+evaluation metrics. Our code is available at
+\url{https://github.com/DinhHieuHoang/MEGANet}
+
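+
+ A small sketch of a Laplacian-based edge-guided attention block in the spirit
+ of the EGA module described above; the gating scheme and residual form are
+ simplifying assumptions, not the authors' exact design.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class EdgeGuidedAttention(nn.Module):
+    """A fixed Laplacian kernel extracts high-frequency (boundary) responses
+    from the feature map; the result gates the features so edges are kept."""
+    def __init__(self, channels):
+        super().__init__()
+        lap = torch.tensor([[0., 1., 0.], [1., -4., 1.], [0., 1., 0.]])
+        self.register_buffer("kernel", lap.view(1, 1, 3, 3).repeat(channels, 1, 1, 1))
+        self.channels = channels
+
+    def forward(self, feat):                          # feat: (B, C, H, W)
+        edges = F.conv2d(feat, self.kernel, padding=1, groups=self.channels)
+        gate = torch.sigmoid(edges.abs())             # strong edge response -> weight near 1
+        return feat + feat * gate                     # residual edge emphasis
+```
+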
+
+
+
+
+ + ☆ CoNeS: Conditional neural fields with shift modulation for + multi-sequence MRI translation + + +
+ Multi-sequence magnetic resonance imaging (MRI) has found wide applications +in both modern clinical studies and deep learning research. However, in +clinical practice, it frequently occurs that one or more of the MRI sequences +are missing due to different image acquisition protocols or contrast agent +contraindications of patients, limiting the utilization of deep learning models +trained on multi-sequence data. One promising approach is to leverage +generative models to synthesize the missing sequences, which can serve as a +surrogate acquisition. State-of-the-art methods tackling this problem are based +on convolutional neural networks (CNN) which usually suffer from spectral +biases, resulting in poor reconstruction of high-frequency fine details. In +this paper, we propose Conditional Neural fields with Shift modulation (CoNeS), +a model that takes voxel coordinates as input and learns a representation of +the target images for multi-sequence MRI translation. The proposed model uses a +multi-layer perceptron (MLP) instead of a CNN as the decoder for pixel-to-pixel +mapping. Hence, each target image is represented as a neural field that is +conditioned on the source image via shift modulation with a learned latent +code. Experiments on BraTS 2018 and an in-house clinical dataset of vestibular +schwannoma patients showed that the proposed method outperformed +state-of-the-art methods for multi-sequence MRI translation both visually and +quantitatively. Moreover, we conducted spectral analysis, showing that CoNeS +was able to overcome the spectral bias issue common in conventional CNN models. +To further evaluate the usage of synthesized images in clinical downstream +tasks, we tested a segmentation network using the synthesized images at +inference. + +
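+
+ A compact sketch of a shift-modulated coordinate MLP in the spirit of the
+ decoder described above: hidden layers receive additive shifts predicted from
+ a per-image latent code.  Layer counts, widths, and the single-channel output
+ are illustrative assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class ShiftModulatedMLP(nn.Module):
+    """Coordinate-based decoder: voxel coordinates pass through an MLP whose
+    hidden activations are shifted by projections of a latent code derived
+    from the source image, conditioning the neural field on that image."""
+    def __init__(self, coord_dim=3, latent_dim=64, hidden=128, depth=4):
+        super().__init__()
+        self.layers = nn.ModuleList(
+            [nn.Linear(coord_dim if i == 0 else hidden, hidden) for i in range(depth)])
+        self.shifts = nn.ModuleList(
+            [nn.Linear(latent_dim, hidden) for _ in range(depth)])
+        self.head = nn.Linear(hidden, 1)               # predicted target intensity
+
+    def forward(self, coords, latent):                 # coords: (N, 3), latent: (latent_dim,)
+        h = coords
+        for layer, shift in zip(self.layers, self.shifts):
+            h = torch.relu(layer(h) + shift(latent))   # shift modulation per layer
+        return self.head(h)
+```
+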
+
+
+
+
+ + ☆ Comparative Analysis of Deep-Fake Algorithms + + +
+ Due to the widespread use of smartphones with high-quality digital cameras +and easy access to a wide range of software apps for recording, editing, and +sharing videos and images, as well as the deep learning AI platforms, a new +phenomenon of 'faking' videos has emerged. Deepfake algorithms can create fake +images and videos that are virtually indistinguishable from authentic ones. +Therefore, technologies that can detect and assess the integrity of digital +visual media are crucial. Deepfakes, also known as deep learning-based fake +videos, have become a major concern in recent years due to their ability to +manipulate and alter images and videos in a way that is virtually +indistinguishable from the original. These deepfake videos can be used for +malicious purposes such as spreading misinformation, impersonating individuals, +and creating fake news. Deepfake detection technologies use various approaches +such as facial recognition, motion analysis, and audio-visual synchronization +to identify and flag fake videos. However, the rapid advancement of deepfake +technologies has made it increasingly difficult to detect these videos with +high accuracy. In this paper, we aim to provide a comprehensive review of the +current state of deepfake creation and detection technologies. We examine the +various deep learning-based approaches used for creating deepfakes, as well as +the techniques used for detecting them. Additionally, we analyze the +limitations and challenges of current deepfake detection methods and discuss +future research directions in this field. Overall, the paper highlights the +importance of continued research and development in deepfake detection +technologies in order to combat the negative impact of deepfakes on society and +ensure the integrity of digital visual media. + +
+
+ comment: 7 pages, 4 figures, 2 tables, Published with International Journal of + Computer Science Trends and Technology (IJCST) +
+
+
+
+
+ + ☆ Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields + + +
+ Neural Radiance Fields (NeRFs) have shown promise in applications like view +synthesis and depth estimation, but learning from multiview images faces +inherent uncertainties. Current methods to quantify them are either heuristic +or computationally demanding. We introduce BayesRays, a post-hoc framework to +evaluate uncertainty in any pre-trained NeRF without modifying the training +process. Our method establishes a volumetric uncertainty field using spatial +perturbations and a Bayesian Laplace approximation. We derive our algorithm +statistically and show its superior performance in key metrics and +applications. Additional results available at: https://bayesrays.github.io. + +
+
+
+
+
+ + ♻ ☆ MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table + + +
+ Neural radiance field (NeRF) has shown remarkable performance in generating
+photo-realistic novel views. Among recent NeRF-related research, the approaches
+that involve the utilization of explicit structures like grids to manage
+features achieve exceptionally fast training by reducing the complexity of
+multilayer perceptron (MLP) networks. However, storing features in dense grids
+demands a substantial amount of memory space, resulting in a notable memory
+bottleneck within computer systems. Consequently, it leads to a significant
+increase in training times without prior hyper-parameter tuning. To address
+this issue, in this work, we are the first to propose MF-NeRF, a
+memory-efficient NeRF framework that employs a Mixed-Feature hash table to
+improve memory efficiency and reduce training time while maintaining
+reconstruction quality. Specifically, we first design a mixed-feature hash
+encoding to adaptively mix part of multi-level feature grids and map it to a
+single hash table. Following that, in order to obtain the correct index of a
+grid point, we further develop an index transformation method that transforms
+indices of an arbitrary level grid to those of a canonical grid. Extensive
+experiments, benchmarking against state-of-the-art Instant-NGP, TensoRF, and
+DVGO, indicate that our MF-NeRF achieves the fastest training time on the same
+GPU hardware with similar or even higher reconstruction quality.
+
+
+
+
+
+ + ♻ ☆ EgoBlur: Responsible Innovation in Aria + + +
+ Project Aria pushes the frontiers of Egocentric AI with large-scale
+real-world data collection using purposely designed glasses with a privacy-first
+approach. To protect the privacy of bystanders being recorded by the glasses,
+our research protocols are designed to ensure that recorded video is processed
+by an AI anonymization model that removes bystander faces and vehicle license
+plates. Detected face and license plate regions are processed with a Gaussian
+blur such that these personally identifiable information (PII) regions are
+obscured. This process helps to ensure that anonymized versions of the video
+are retained for research purposes. In Project Aria, we have developed a
+state-of-the-art anonymization system, EgoBlur. In this paper, we present an
+extensive analysis of EgoBlur on challenging datasets, comparing its
+performance with other state-of-the-art systems from industry and academia,
+including an extensive Responsible AI analysis on the recently released Casual
+Conversations V2 dataset.
+
+
+
+
+
+ + ♻ ☆ Loss Functions and Metrics in Deep Learning + + +
+ One of the essential components of deep learning is the choice of the loss +function and performance metrics used to train and evaluate models. This paper +reviews the most prevalent loss functions and performance measurements in deep +learning. We examine the benefits and limits of each technique and illustrate +their application to various deep-learning problems. Our review aims to give a +comprehensive picture of the different loss functions and performance +indicators used in the most common deep learning tasks and help practitioners +choose the best method for their specific task. + +
+
+ comment: 53 pages, 5 figures, 7 tables, 86 equations +
+
+
+
+
+ + ♻ ☆ CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote + Aggregation + + +
+ Object pose estimation constitutes a critical area within the domain of 3D +vision. While contemporary state-of-the-art methods that leverage real-world +pose annotations have demonstrated commendable performance, the procurement of +such real-world training data incurs substantial costs. This paper focuses on a +specific setting wherein only 3D CAD models are utilized as a priori knowledge, +devoid of any background or clutter information. We introduce a novel method, +CPPF++, designed for sim-to-real pose estimation. This method builds upon the +foundational point-pair voting scheme of CPPF, reconceptualizing it through a +probabilistic lens. To address the challenge of voting collision, we model +voting uncertainty by estimating the probabilistic distribution of each point +pair within the canonical space. This approach is further augmented by +iterative noise filtering, employed to eradicate votes associated with +backgrounds or clutters. Additionally, we enhance the context provided by each +voting unit by introducing $N$-point tuples. In conjunction with this +methodological contribution, we present a new category-level pose estimation +dataset, DiversePose 300. This dataset is specifically crafted to facilitate a +more rigorous evaluation of current state-of-the-art methods, encompassing a +broader and more challenging array of real-world scenarios. Empirical results +substantiate the efficacy of our proposed method, revealing a significant +reduction in the disparity between simulation and real-world performance. + +
+
+
+
+
+ + ♻ ☆ Defense-Prefix for Preventing Typographic Attacks on CLIP ICCV2023 + + +
+ Vision-language pre-training models (VLPs) have exhibited revolutionary +improvements in various vision-language tasks. In VLP, some adversarial attacks +fool a model into false or absurd classifications. Previous studies addressed +these attacks by fine-tuning the model or changing its architecture. However, +these methods risk losing the original model's performance and are difficult to +apply to downstream tasks. In particular, their applicability to other tasks +has not been considered. In this study, we addressed the reduction of the +impact of typographic attacks on CLIP without changing the model parameters. To +achieve this, we expand the idea of "prefix learning" and introduce our simple +yet effective method: Defense-Prefix (DP), which inserts the DP token before a +class name to make words "robust" against typographic attacks. Our method can +be easily applied to downstream tasks, such as object detection, because the +proposed method is independent of the model parameters. Our method +significantly improves the accuracy of classification tasks for typographic +attack datasets, while maintaining the zero-shot capabilities of the model. In +addition, we leverage our proposed method for object detection, demonstrating +its high applicability and effectiveness. The codes and datasets are available +at https://github.com/azuma164/Defense-Prefix. + +
+
+ comment: ICCV2023 Workshop +
+
+
+
+
+ + ♻ ☆ Extraction of Visual Information to Predict Crowdfunding Success + + +
+ Researchers have increasingly turned to crowdfunding platforms to gain
+insights into entrepreneurial activity and dynamics. While previous studies
+have explored various factors influencing crowdfunding success, such as
+technology, communication, and marketing strategies, the role of visual
+elements that can be automatically extracted from images has received less
+attention. This is surprising, considering that crowdfunding platforms
+emphasize the importance of attention-grabbing and high-resolution images, and
+previous research has shown that image characteristics can significantly impact
+product evaluations. Indeed, we conducted a comprehensive review of empirical
+articles (n = 202) that utilized Kickstarter data, focusing on the
+incorporation of visual information in their analyses. Our findings reveal that
+only 29.70% controlled for the number of images, and less than 12% considered
+any image details. In this manuscript, we review the literature on image
+processing and its relevance to the business domain, highlighting two types of
+visual variables: visual counts (number of pictures and number of videos) and
+image details. Building upon previous work that discussed the role of color,
+composition and figure-ground relationships, we introduce visual scene elements
+that have not yet been explored in crowdfunding, including the number of faces,
+the number of concepts depicted, and the ease of identifying those concepts. To
+demonstrate the predictive value of visual counts and image details, we analyze
+Kickstarter data. Our results highlight that visual count features are two of
+the top three predictors of success. Our results also show that simple image
+detail features such as color matter a lot, and our proposed measures of visual
+scene elements can also be useful. We supplement our article with R and Python
+code that helps authors extract image details (https://osf.io/ujnzp/).
+
+
+ comment: 32 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Infinite Class Mixup BMVC 2023 + + +
+ Mixup is a widely adopted strategy for training deep networks, where +additional samples are augmented by interpolating inputs and labels of training +pairs. Mixup has shown to improve classification performance, network +calibration, and out-of-distribution generalisation. While effective, a +cornerstone of Mixup, namely that networks learn linear behaviour patterns +between classes, is only indirectly enforced since the output interpolation is +performed at the probability level. This paper seeks to address this limitation +by mixing the classifiers directly instead of mixing the labels for each mixed +pair. We propose to define the target of each augmented sample as a uniquely +new classifier, whose parameters are a linear interpolation of the classifier +vectors of the input pair. The space of all possible classifiers is continuous +and spans all interpolations between classifier pairs. To make optimisation +tractable, we propose a dual-contrastive Infinite Class Mixup loss, where we +contrast the classifier of a mixed pair to both the classifiers and the +predicted outputs of other mixed pairs in a batch. Infinite Class Mixup is +generic in nature and applies to many variants of Mixup. Empirically, we show +that it outperforms standard Mixup and variants such as RegMixup and Remix on +balanced, long-tailed, and data-constrained benchmarks, highlighting its broad +applicability. + +
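+
+ A simplified sketch of the classifier-mixing idea described above: each mixed
+ embedding is contrasted against a batch of mixed classifier vectors, with its
+ own interpolated classifier as the positive.  The paper's full
+ dual-contrastive loss is richer than this; the single cross-entropy term below
+ is only an illustrative reduction.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def infinite_class_mixup_loss(embeddings, classifier, labels, lam):
+    """embeddings: (B, D); classifier: (num_classes, D); labels: (B,).
+    Each mixed sample is paired with a *new* classifier vector, the
+    lam-interpolation of the two class vectors involved in its mix."""
+    perm = torch.randperm(embeddings.size(0))
+    mixed_emb = lam * embeddings + (1 - lam) * embeddings[perm]
+    mixed_cls = lam * classifier[labels] + (1 - lam) * classifier[labels[perm]]
+    logits = mixed_emb @ mixed_cls.t()                # (B, B) mixed-pair logits
+    targets = torch.arange(embeddings.size(0))        # the diagonal is the positive
+    return F.cross_entropy(logits, targets)
+```
+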
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Forecasting Future Instance Segmentation with Learned Optical Flow and + Warping + + +
+ For an autonomous vehicle it is essential to observe the ongoing dynamics of +a scene and consequently predict imminent future scenarios to ensure safety to +itself and others. This can be done using different sensors and modalities. In +this paper we investigate the usage of optical flow for predicting future +semantic segmentations. To do so we propose a model that forecasts flow fields +autoregressively. Such predictions are then used to guide the inference of a +learned warping function that moves instance segmentations on to future frames. +Results on the Cityscapes dataset demonstrate the effectiveness of optical-flow +methods. + +
+
+ comment: Paper published as Poster at ICIAP21 +
+
+
+
+
+ + ♻ ☆ Deep Metric Learning with Chance Constraints WACV + + +
+ Deep metric learning (DML) aims to minimize empirical expected loss of the +pairwise intra-/inter- class proximity violations in the embedding space. We +relate DML to feasibility problem of finite chance constraints. We show that +minimizer of proxy-based DML satisfies certain chance constraints, and that the +worst case generalization performance of the proxy-based methods can be +characterized by the radius of the smallest ball around a class proxy to cover +the entire domain of the corresponding class samples, suggesting multiple +proxies per class helps performance. To provide a scalable algorithm as well as +exploiting more proxies, we consider the chance constraints implied by the +minimizers of proxy-based DML instances and reformulate DML as finding a +feasible point in intersection of such constraints, resulting in a problem to +be approximately solved by iterative projections. Simply put, we repeatedly +train a regularized proxy-based loss and re-initialize the proxies with the +embeddings of the deliberately selected new samples. We applied our method with +4 well-accepted DML losses and show the effectiveness with extensive +evaluations on 4 popular DML benchmarks. Code is available at: +https://github.com/yetigurbuz/ccp-dml + +
+
+ comment: Accepted as a conference paper at IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV) 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Adversarial Attacks: The Similar Target Method + + +
+ Deep neural networks are vulnerable to adversarial examples, posing a threat +to the models' applications and raising security concerns. An intriguing +property of adversarial examples is their strong transferability. Several +methods have been proposed to enhance transferability, including ensemble +attacks which have demonstrated their efficacy. However, prior approaches +simply average logits, probabilities, or losses for model ensembling, lacking a +comprehensive analysis of how and why model ensembling significantly improves +transferability. In this paper, we propose a similar targeted attack method +named Similar Target~(ST). By promoting cosine similarity between the gradients +of each model, our method regularizes the optimization direction to +simultaneously attack all surrogate models. This strategy has been proven to +enhance generalization ability. Experimental results on ImageNet validate the +effectiveness of our approach in improving adversarial transferability. Our +method outperforms state-of-the-art attackers on 18 discriminative classifiers +and adversarially trained models. + +
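+
+ A rough sketch of one ascent step of an ensemble attack in the spirit of the
+ method above: the average loss is maximised while the pairwise cosine
+ similarity of the surrogate models' gradients is encouraged.  The step size,
+ regularisation weight, and the omission of the epsilon-ball projection are
+ simplifying assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def st_attack_step(models, x_adv, y, step=2 / 255, reg=1.0):
+    """One sign-gradient ascent step with a gradient-alignment regulariser."""
+    x_adv = x_adv.clone().detach().requires_grad_(True)
+    losses, grads = [], []
+    for m in models:
+        loss = F.cross_entropy(m(x_adv), y)
+        g, = torch.autograd.grad(loss, x_adv, create_graph=True)
+        losses.append(loss)
+        grads.append(g.flatten(1))
+    cos = x_adv.new_zeros(())
+    for i in range(len(grads)):
+        for j in range(i + 1, len(grads)):
+            cos = cos + F.cosine_similarity(grads[i], grads[j], dim=1).mean()
+    objective = torch.stack(losses).mean() + reg * cos
+    total_grad, = torch.autograd.grad(objective, x_adv)
+    return (x_adv + step * total_grad.sign()).detach()   # clipping to the budget omitted
+```
+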
+
+
+
+
+ + ♻ ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for the addition of multi-modal +information with an existing pipeline for continuous sign language recognition +and translation. In our procedure, we have incorporated optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we have used is very lightweight and doesn't +need to include a separate feature extractor for the new modality in an +end-to-end manner. We have applied the changes in both sign language +recognition and translation, improving the result in each case. We have +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. + +
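+
+ A lightweight sketch of what such a cross-modal plugin can look like: RGB
+ frame features attend to optical-flow features through a single
+ cross-attention layer and are fused residually, so the surrounding pipeline
+ stays unchanged.  The dimensions and the single-layer design are assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class CrossModalPlugin(nn.Module):
+    """Cross-attention fusion of per-frame RGB and optical-flow features."""
+    def __init__(self, dim=512, heads=8):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, rgb_feats, flow_feats):          # both: (B, T, dim)
+        fused, _ = self.attn(rgb_feats, flow_feats, flow_feats)
+        return self.norm(rgb_feats + fused)            # residual keeps the RGB path intact
+```
+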
+
+ comment: This version has some errors. Our schedule is packed, so we don't + have enough time to correct it. We will share another work when we have time + to fix this +
+
+
+
+
+ + ♻ ☆ H3WB: Human3.6M 3D WholeBody Dataset and Benchmark ICCV 2023 + + +
+ We present a benchmark for 3D human whole-body pose estimation, which
+involves identifying accurate 3D keypoints on the entire human body, including
+face, hands, body, and feet. Currently, the lack of a fully annotated and
+accurate 3D whole-body dataset results in deep networks being trained
+separately on specific body parts, which are combined during inference.
+Alternatively, they rely on pseudo-ground truth provided by parametric body
+models, which is not as accurate as detection-based methods. To overcome these
+issues, we introduce the Human3.6M 3D WholeBody (H3WB) dataset, which provides
+whole-body annotations for the Human3.6M dataset using the COCO Wholebody
+layout. H3WB comprises 133 whole-body keypoint annotations on 100K images, made
+possible by our new multi-view pipeline. We also propose three tasks: i) 3D
+whole-body pose lifting from 2D complete whole-body pose, ii) 3D whole-body
+pose lifting from 2D incomplete whole-body pose, and iii) 3D whole-body pose
+estimation from a single RGB image. Additionally, we report several baselines
+from popular methods for these tasks. Furthermore, we also provide automated 3D
+whole-body annotations of TotalCapture and experimentally show that, when used
+together with H3WB, they help to improve performance. Code and dataset are
+available at https://github.com/wholebody3d/wholebody3d
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Unifying Synergies between Self-supervised Learning and Dynamic + Computation + + +
+ Computationally expensive training strategies make self-supervised learning
+(SSL) impractical for resource-constrained industrial settings. Techniques like
+knowledge distillation (KD), dynamic computation (DC), and pruning are often
+used to obtain a lightweight model, which usually involves multiple epochs of
+fine-tuning (or distilling steps) of a large pre-trained model, making it more
+computationally challenging. In this work we present a novel perspective on the
+interplay between SSL and DC paradigms. In particular, we show that it is
+feasible to simultaneously learn a dense and gated sub-network from scratch in
+an SSL setting without any additional fine-tuning or pruning steps. The
+co-evolution during pre-training of both the dense and gated encoders offers a
+good accuracy-efficiency trade-off and therefore yields a generic and
+multi-purpose architecture for application-specific industrial settings.
+Extensive experiments on several image classification benchmarks, including
+CIFAR-10/100, STL-10 and ImageNet-100, demonstrate that the proposed training
+strategy provides a dense and corresponding gated sub-network that achieves
+on-par performance compared with the vanilla self-supervised setting, but at a
+significant reduction in computation in terms of FLOPs, under a range of target
+budgets ($t_d$).
+
+
+
+
+
+ + ♻ ☆ ARF-Plus: Controlling Perceptual Factors in Artistic Radiance Fields for + 3D Scene Stylization + + +
+ The radiance fields style transfer is an emerging field that has recently +gained popularity as a means of 3D scene stylization, thanks to the outstanding +performance of neural radiance fields in 3D reconstruction and view synthesis. +We highlight a research gap in radiance fields style transfer, the lack of +sufficient perceptual controllability, motivated by the existing concept in the +2D image style transfer. In this paper, we present ARF-Plus, a 3D neural style +transfer framework offering manageable control over perceptual factors, to +systematically explore the perceptual controllability in 3D scene stylization. +Four distinct types of controls - color preservation control, (style pattern) +scale control, spatial (selective stylization area) control, and depth +enhancement control - are proposed and integrated into this framework. Results +from real-world datasets, both quantitative and qualitative, show that the four +types of controls in our ARF-Plus framework successfully accomplish their +corresponding perceptual controls when stylizing 3D scenes. These techniques +work well for individual style inputs as well as for the simultaneous +application of multiple styles within a scene. This unlocks a realm of +limitless possibilities, allowing customized modifications of stylization +effects and flexible merging of the strengths of different styles, ultimately +enabling the creation of novel and eye-catching stylistic effects on 3D scenes. + +
+
+
+
+
+ + ♻ ☆ Robustifying Token Attention for Vision Transformers ICCV 2023 + + +
+ Despite the success of vision transformers (ViTs), they still suffer from +significant drops in accuracy in the presence of common corruptions, such as +noise or blur. Interestingly, we observe that the attention mechanism of ViTs +tends to rely on few important tokens, a phenomenon we call token overfocusing. +More critically, these tokens are not robust to corruptions, often leading to +highly diverging attention patterns. In this paper, we intend to alleviate this +overfocusing issue and make attention more stable through two general +techniques: First, our Token-aware Average Pooling (TAP) module encourages the +local neighborhood of each token to take part in the attention mechanism. +Specifically, TAP learns average pooling schemes for each token such that the +information of potentially important tokens in the neighborhood can adaptively +be taken into account. Second, we force the output tokens to aggregate +information from a diverse set of input tokens rather than focusing on just a +few by using our Attention Diversification Loss (ADL). We achieve this by +penalizing high cosine similarity between the attention vectors of different +tokens. In experiments, we apply our methods to a wide range of transformer +architectures and improve robustness significantly. For example, we improve +corruption robustness on ImageNet-C by 2.4% while improving accuracy by 0.4% +based on state-of-the-art robust architecture FAN. Also, when fine-tuning on +semantic segmentation tasks, we improve robustness on CityScapes-C by 2.4% and +ACDC by 3.0%. Our code is available at https://github.com/guoyongcs/TAPADL. + +
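+
+ A short sketch of an attention-diversification term in the spirit of the ADL
+ described above: pairwise cosine similarity between the attention vectors of
+ different output tokens is penalised.  Averaging over all entries (the zeroed
+ diagonal included) is a simplification.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def attention_diversification_loss(attn):
+    """attn: (B, heads, tokens, tokens) attention maps.  Discourage different
+    output tokens from focusing on the same few input tokens."""
+    B, H, N, _ = attn.shape
+    a = F.normalize(attn.reshape(B * H, N, N), dim=-1)
+    sim = a @ a.transpose(1, 2)                        # pairwise cosine, (B*H, N, N)
+    off_diag = sim - torch.diag_embed(torch.diagonal(sim, dim1=-2, dim2=-1))
+    return off_diag.mean()
+```
+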
+
+ comment: To appear in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Parameter and Computation Efficient Transfer Learning for + Vision-Language Pre-trained Models + + +
+ With ever increasing parameters and computation, vision-language pre-trained +(VLP) models exhibit prohibitive expenditure in downstream task adaption. +Recent endeavors mainly focus on parameter efficient transfer learning (PETL) +for VLP models by only updating a small number of parameters. However, +excessive computational overhead still plagues the application of VLPs. In this +paper, we aim at parameter and computation efficient transfer learning (PCETL) +for VLP models. In particular, PCETL not only needs to limit the number of +trainable parameters in VLP models, but also to reduce the computational +redundancy during inference, thus enabling a more efficient transfer. To +approach this target, we propose a novel dynamic architecture skipping (DAS) +approach towards effective PCETL. Instead of directly optimizing the intrinsic +architectures of VLP models, DAS first observes the significances of their +modules to downstream tasks via a reinforcement learning (RL) based process, +and then skips the redundant ones with lightweight networks, i.e., adapters, +according to the obtained rewards. In this case, the VLP model can well +maintain the scale of trainable parameters while speeding up its inference on +downstream tasks. To validate DAS, we apply it to two representative VLP +models, namely ViLT and METER, and conduct extensive experiments on a bunch of +VL tasks. The experimental results not only show the great advantages of DAS in +reducing computational complexity, e.g. -11.97% FLOPs of METER on VQA2.0, but +also confirm its competitiveness against existing PETL methods in terms of +parameter scale and performance. Our source code is given in our appendix. + +
+
+
+
+
+ + ♻ ☆ An Empirical Analysis for Zero-Shot Multi-Label Classification on + COVID-19 CT Scans and Uncurated Reports ICCV + + +
+ The pandemic resulted in vast repositories of unstructured data, including +radiology reports, due to increased medical examinations. Previous research on +automated diagnosis of COVID-19 primarily focuses on X-ray images, despite +their lower precision compared to computed tomography (CT) scans. In this work, +we leverage unstructured data from a hospital and harness the fine-grained +details offered by CT scans to perform zero-shot multi-label classification +based on contrastive visual language learning. In collaboration with human +experts, we investigate the effectiveness of multiple zero-shot models that aid +radiologists in detecting pulmonary embolisms and identifying intricate lung +details like ground glass opacities and consolidations. Our empirical analysis +provides an overview of the possible solutions to target such fine-grained +tasks, so far overlooked in the medical multimodal pretraining literature. Our +investigation promises future advancements in the medical image analysis +community by addressing some challenges associated with unstructured data and +fine-grained multi-label classification. + +
+
+ comment: Proceedings of the IEEE/CVF International Conference on Computer + Vision (ICCV) Workshops 2023 +
+
+
+
+
+ + ♻ ☆ UncLe-SLAM: Uncertainty Learning for Dense Neural SLAM ICCV 2023 + + +
+ We present an uncertainty learning framework for dense neural simultaneous +localization and mapping (SLAM). Estimating pixel-wise uncertainties for the +depth input of dense SLAM methods allows re-weighing the tracking and mapping +losses towards image regions that contain more suitable information that is +more reliable for SLAM. To this end, we propose an online framework for sensor +uncertainty estimation that can be trained in a self-supervised manner from +only 2D input data. We further discuss the advantages of the uncertainty +learning for the case of multi-sensor input. Extensive analysis, +experimentation, and ablations show that our proposed modeling paradigm +improves both mapping and tracking accuracy and often performs better than +alternatives that require ground truth depth or 3D. Our experiments show that +we achieve a 38\% and 27\% lower absolute trajectory tracking error (ATE) on +the 7-Scenes and TUM-RGBD datasets respectively. On the popular Replica dataset +using two types of depth sensors, we report an 11\% F1-score improvement on +RGBD SLAM compared to the recent state-of-the-art neural implicit approaches. +Source code: https://github.com/kev-in-ta/UncLe-SLAM. + +
+
+ comment: ICCV 2023 Workshop. 20 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Leveraging TCN and Transformer for effective visual-audio fusion in + continuous emotion recognition CVPR + + +
+ Human emotion recognition plays an important role in human-computer
+interaction. In this paper, we present our approach to the Valence-Arousal (VA)
+Estimation Challenge, Expression (Expr) Classification Challenge, and Action
+Unit (AU) Detection Challenge of the 5th Workshop and Competition on Affective
+Behavior Analysis in-the-wild (ABAW). Specifically, we propose a novel
+multi-modal fusion model that leverages Temporal Convolutional Networks (TCN)
+and Transformers to enhance the performance of continuous emotion recognition.
+Our model aims to effectively integrate visual and audio information for
+improved accuracy in recognizing emotions. Our model outperforms the baseline
+and ranks 3rd in the Expression Classification challenge.
+
+
+ comment: 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition + Workshops (CVPRW) +
+
+
+
+
+ + ♻ ☆ Watch Where You Head: A View-biased Domain Gap in Gait Recognition and + Unsupervised Adaptation + + +
+ Gait Recognition is a computer vision task aiming to identify people by their +walking patterns. Although existing methods often show high performance on +specific datasets, they lack the ability to generalize to unseen scenarios. +Unsupervised Domain Adaptation (UDA) tries to adapt a model, pre-trained in a +supervised manner on a source domain, to an unlabelled target domain. There are +only a few works on UDA for gait recognition proposing solutions to limited +scenarios. In this paper, we reveal a fundamental phenomenon in adaptation of +gait recognition models, caused by the bias in the target domain to viewing +angle or walking direction. We then suggest a remedy to reduce this bias with a +novel triplet selection strategy combined with curriculum learning. To this +end, we present Gait Orientation-based method for Unsupervised Domain +Adaptation (GOUDA). We provide extensive experiments on four widely-used gait +datasets, CASIA-B, OU-MVLP, GREW, and Gait3D, and on three backbones, GaitSet, +GaitPart, and GaitGL, justifying the view bias and showing the superiority of +our proposed method over prior UDA works. + +
+
+
+
+
+ + ♻ ☆ Large Separable Kernel Attention: Rethinking the Large Kernel Attention + Design in CNN + + +
+ Visual Attention Networks (VAN) with Large Kernel Attention (LKA) modules
+have been shown to provide remarkable performance that surpasses Vision
+Transformers (ViTs) on a range of vision-based tasks. However, the depth-wise
+convolutional layer in these LKA modules incurs a quadratic increase in the
+computational and memory footprints with increasing convolutional kernel size.
+To mitigate these problems and to enable the use of extremely large
+convolutional kernels in the attention modules of VAN, we propose a family of
+Large Separable Kernel Attention modules, termed LSKA. LSKA decomposes the 2D
+convolutional kernel of the depth-wise convolutional layer into cascaded
+horizontal and vertical 1-D kernels. In contrast to the standard LKA design,
+the proposed decomposition enables the direct use of the depth-wise
+convolutional layer with large kernels in the attention module, without
+requiring any extra blocks. We demonstrate that the proposed LSKA module in VAN
+can achieve comparable performance with the standard LKA module and incur lower
+computational complexity and memory footprints. We also find that the proposed
+LSKA design biases the VAN more toward the shape of the object than the texture
+with increasing kernel size. Additionally, we benchmark the robustness of the
+LKA and LSKA in VAN, ViTs, and the recent ConvNeXt on the five corrupted
+versions of the ImageNet dataset that are largely unexplored in previous works.
+Our extensive experimental results show that the proposed LSKA module in VAN
+provides a significant reduction in computational complexity and memory
+footprints with increasing kernel size, while outperforming ViTs and ConvNeXt
+and providing performance similar to the LKA module in VAN on object
+recognition, object detection, semantic segmentation, and robustness tests.
+
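+
+ A minimal sketch of the separable large-kernel idea: the k x k depth-wise
+ kernel is replaced by a 1 x k followed by a k x 1 depth-wise pair, so cost
+ grows linearly rather than quadratically with k.  The dilated components of
+ the full LKA/LSKA design are omitted here for brevity.
+
+```python
+import torch
+import torch.nn as nn
+
+class SeparableLargeKernelAttention(nn.Module):
+    """Depth-wise 1-D horizontal and vertical convolutions approximate a large
+    2-D depth-wise kernel; the result gates the input features."""
+    def __init__(self, dim, k=35):
+        super().__init__()
+        self.h = nn.Conv2d(dim, dim, (1, k), padding=(0, k // 2), groups=dim)
+        self.v = nn.Conv2d(dim, dim, (k, 1), padding=(k // 2, 0), groups=dim)
+        self.point = nn.Conv2d(dim, dim, 1)
+
+    def forward(self, x):
+        attn = self.point(self.v(self.h(x)))
+        return x * attn                                # attention as a gating map
+```
+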
+
+
+
+
+ + ♻ ☆ A Structure-Guided Diffusion Model for Large-Hole Image Completion BMVC2023 + + +
+ Image completion techniques have made significant progress in filling missing +regions (i.e., holes) in images. However, large-hole completion remains +challenging due to limited structural information. In this paper, we address +this problem by integrating explicit structural guidance into diffusion-based +image completion, forming our structure-guided diffusion model (SGDM). It +consists of two cascaded diffusion probabilistic models: structure and texture +generators. The structure generator generates an edge image representing +plausible structures within the holes, which is then used for guiding the +texture generation process. To train both generators jointly, we devise a novel +strategy that leverages optimal Bayesian denoising, which denoises the output +of the structure generator in a single step and thus allows backpropagation. +Our diffusion-based approach enables a diversity of plausible completions, +while the editable edges allow for editing parts of an image. Our experiments +on natural scene (Places) and face (CelebA-HQ) datasets demonstrate that our +method achieves a superior or comparable visual quality compared to +state-of-the-art approaches. The code is available for research purposes at +https://github.com/UdonDa/Structure_Guided_Diffusion_Model. + +
+
+ comment: BMVC2023. Code: + https://github.com/UdonDa/Structure_Guided_Diffusion_Model +
+
+
+
+
+ + ♻ ☆ Learning-based Spatial and Angular Information Separation for Light + Field Compression + + +
+ Light fields are a type of image data that capture both spatial and angular +scene information by recording light rays emitted by a scene from different +orientations. In this context, spatial information is defined as features that +remain static regardless of perspectives, while angular information refers to +features that vary between viewpoints. We propose a novel neural network that, +by design, can separate angular and spatial information of a light field. The +network represents spatial information using spatial kernels shared among all +Sub-Aperture Images (SAIs), and angular information using sets of angular +kernels for each SAI. To further improve the representation capability of the +network without increasing parameter number, we also introduce angular kernel +allocation and kernel tensor decomposition mechanisms. Extensive experiments +demonstrate the benefits of information separation: when applied to the +compression task, our network outperforms other state-of-the-art methods by a +large margin. And angular information can be easily transferred to other scenes +for rendering dense views, showing the successful separation and the potential +use case for the view synthesis task. We plan to release the code upon +acceptance of the paper to encourage further research on this topic. + +
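+
+ A toy sketch of the spatial/angular separation described above: one spatial
+ convolution is shared by every sub-aperture image, while a small per-SAI 1x1
+ convolution carries the view-dependent (angular) information.  Kernel sizes
+ and the 1x1 angular choice are illustrative assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class SpatialAngularConv(nn.Module):
+    """Shared spatial kernel across all SAIs plus per-SAI angular kernels."""
+    def __init__(self, channels, num_sais):
+        super().__init__()
+        self.spatial = nn.Conv2d(channels, channels, 3, padding=1)   # shared by all views
+        self.angular = nn.ModuleList(
+            [nn.Conv2d(channels, channels, 1) for _ in range(num_sais)])
+
+    def forward(self, sais):                 # sais: list of (B, C, H, W), one per view
+        return [ang(self.spatial(x)) for ang, x in zip(self.angular, sais)]
+```
+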
+
+ comment: The authors would like to withdraw this paper, as it has been + superseded by arXiv:2307.06143 +
+
+
+
+
+ + ♻ ☆ Delving into Ipsilateral Mammogram Assessment under Multi-View Network + + +
+ In recent years, multi-view mammogram analysis has been widely studied for
+AI-based cancer assessment. In this work, we aim to explore diverse fusion
+strategies (average and concatenate) and examine the model's learning behavior
+with varying individuals and fusion pathways, involving a Coarse Layer and a
+Fine Layer. The Ipsilateral Multi-View Network, comprising five fusion types
+(Pre, Early, Middle, Last, and Post Fusion) in ResNet-18, is employed. Notably,
+the Middle Fusion emerges as the most balanced and effective approach,
+enhancing deep-learning models' generalization performance by +2.06%
+(concatenate) and +5.29% (average) on the VinDr-Mammo dataset and +2.03%
+(concatenate) and +3% (average) on the CMMD dataset in macro F1-score. The
+paper emphasizes the crucial role of layer assignment in multi-view network
+extraction with various strategies.
+
+
+
+
+
+ + ♻ ☆ Attentive Contractive Flow with Lipschitz-constrained Self-Attention BMVC 2023 + + +
+ Normalizing flows provide an elegant method for obtaining tractable density
+estimates from distributions by using invertible transformations. The main
+challenge is to improve the expressivity of the models while keeping the
+invertibility constraints intact. We propose to do so via the incorporation of
+localized self-attention. However, conventional self-attention mechanisms do
+not satisfy the requirements to obtain invertible flows and cannot be naively
+incorporated into normalizing flows. To address this, we introduce a novel
+approach called Attentive Contractive Flow (ACF) which utilizes a special
+category of flow-based generative models - contractive flows. We demonstrate
+that ACF can be introduced into a variety of state-of-the-art flow models in a
+plug-and-play manner. This is demonstrated to not only improve the
+representation power of these models (improving on the bits per dim metric),
+but also to result in significantly faster convergence in training them.
+Qualitative results, including interpolations between test images, demonstrate
+that samples are more realistic and capture local correlations in the data
+well. We evaluate the results further by performing perturbation analysis using
+AWGN, demonstrating that ACF models (especially the dot-product variant) show
+better and more consistent resilience to additive noise.
+
+
+ comment: 10 pages, to be published at BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Mind The Edge: Refining Depth Edges in Sparsely-Supervised Monocular + Depth Estimation + + +
+ Monocular Depth Estimation (MDE) is a fundamental problem in computer vision
+with numerous applications. Recently, LIDAR-supervised methods have achieved
+remarkable per-pixel depth accuracy in outdoor scenes. However, significant
+errors are typically found in the proximity of depth discontinuities, i.e.,
+depth edges, which often hinder the performance of depth-dependent applications
+that are sensitive to such inaccuracies, e.g., novel view synthesis and
+augmented reality. Since direct supervision for the location of depth edges is
+typically unavailable in sparse LIDAR-based scenes, encouraging the MDE model
+to produce correct depth edges is not straightforward. To the best of our
+knowledge this paper is the first attempt to address the depth edges issue for
+LIDAR-supervised scenes. In this work we propose to learn to detect the
+location of depth edges from densely-supervised synthetic data, and use it to
+generate supervision for the depth edges in the MDE training. Despite the
+'domain gap' between synthetic and real data, we show that depth edges that are
+estimated directly are significantly more accurate than the ones that emerge
+indirectly from the MDE training. To quantitatively evaluate our approach, and
+due to the lack of depth edges ground truth in LIDAR-based scenes, we manually
+annotated subsets of the KITTI and the DDAD datasets with depth edges ground
+truth. We demonstrate significant gains in the accuracy of the depth edges with
+comparable per-pixel depth accuracy on several challenging datasets.
+
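+
+ A rough sketch of how edge supervision from a separately estimated edge map
+ could enter MDE training: Sobel gradients of the predicted depth are turned
+ into a soft edge map and pulled towards the target edges.  The tanh squashing
+ and BCE choice are assumptions, not the paper's exact loss.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def depth_edge_loss(pred_depth, edge_target, tau=1.0):
+    """pred_depth: (B, 1, H, W); edge_target: (B, 1, H, W) in [0, 1], e.g. the
+    output of an edge detector trained on densely supervised synthetic data."""
+    kx = torch.tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]]).view(1, 1, 3, 3)
+    ky = kx.transpose(2, 3)
+    gx = F.conv2d(pred_depth, kx.to(pred_depth.device), padding=1)
+    gy = F.conv2d(pred_depth, ky.to(pred_depth.device), padding=1)
+    edge_pred = torch.tanh(torch.sqrt(gx ** 2 + gy ** 2 + 1e-8) / tau)
+    return F.binary_cross_entropy(edge_pred, edge_target)
+```
+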
+
+
+
+
+ + ♻ ☆ OCTScenes: A Versatile Real-World Dataset of Tabletop Scenes for + Object-Centric Learning + + +
+ Humans possess the cognitive ability to comprehend scenes in a compositional +manner. To empower AI systems with similar capabilities, object-centric +learning aims to acquire representations of individual objects from visual +scenes without any supervision. Although recent advances in object-centric +learning have made remarkable progress on complex synthesis datasets, there is +a huge challenge for application to complex real-world scenes. One of the +essential reasons is the scarcity of real-world datasets specifically tailored +to object-centric learning. To address this problem, we propose a versatile +real-world dataset of tabletop scenes for object-centric learning called +OCTScenes, which is meticulously designed to serve as a benchmark for +comparing, evaluating, and analyzing object-centric learning methods. OCTScenes +contains 5000 tabletop scenes with a total of 15 objects. Each scene is +captured in 60 frames covering a 360-degree perspective. Consequently, +OCTScenes is a versatile benchmark dataset that can simultaneously satisfy the +evaluation of object-centric learning methods based on single-image, video, and +multi-view. Extensive experiments of representative object-centric learning +methods are conducted on OCTScenes. The results demonstrate the shortcomings of +state-of-the-art methods for learning meaningful representations from +real-world data, despite their impressive performance on complex synthesis +datasets. Furthermore, OCTScenes can serve as a catalyst for the advancement of +existing methods, inspiring them to adapt to real-world scenes. Dataset and +code are available at https://huggingface.co/datasets/Yinxuan/OCTScenes. + +
+
+
+
+
+ + ♻ ☆ Neural-IMLS: Self-supervised Implicit Moving Least-Squares Network for + Surface Reconstruction + + +
+ Surface reconstruction is very challenging when the input point clouds, +particularly real scans, are noisy and lack normals. Observing that the +Multilayer Perceptron (MLP) and the implicit moving least-square function +(IMLS) provide a dual representation of the underlying surface, we introduce +Neural-IMLS, a novel approach that directly learns the noise-resistant signed +distance function (SDF) from unoriented raw point clouds in a self-supervised +fashion. We use the IMLS to regularize the distance values reported by the MLP +while using the MLP to regularize the normals of the data points for running +the IMLS. We also prove that at the convergence, our neural network, benefiting +from the mutual learning mechanism between the MLP and the IMLS, produces a +faithful SDF whose zero-level set approximates the underlying surface. We +conducted extensive experiments on various benchmarks, including synthetic +scans and real scans. The experimental results show that {\em Neural-IMLS} can +reconstruct faithful shapes on various benchmarks with noise and missing parts. +The source code can be found at~\url{https://github.com/bearprin/Neural-IMLS}. + +
+
+
+
+
+ + ♻ ☆ Neural-Singular-Hessian: Implicit Neural Representation of Unoriented + Point Clouds by Enforcing Singular Hessian + + +
+ Neural implicit representation is a promising approach for reconstructing +surfaces from point clouds. Existing methods combine various regularization +terms, such as the Eikonal and Laplacian energy terms, to enforce the learned +neural function to possess the properties of a Signed Distance Function (SDF). +However, inferring the actual topology and geometry of the underlying surface +from poor-quality unoriented point clouds remains challenging. In accordance +with Differential Geometry, the Hessian of the SDF is singular for points +within the differential thin-shell space surrounding the surface. Our approach +enforces the Hessian of the neural implicit function to have a zero determinant +for points near the surface. This technique aligns the gradients for a +near-surface point and its on-surface projection point, producing a rough but +faithful shape within just a few iterations. By annealing the weight of the +singular-Hessian term, our approach ultimately produces a high-fidelity +reconstruction result. Extensive experimental results demonstrate that our +approach effectively suppresses ghost geometry and recovers details from +unoriented point clouds with better expressiveness than existing fitting-based +methods. + +
+
+
+
+
+ + ♻ ☆ NICE 2023 Zero-shot Image Captioning Challenge + + +
+ In this report, we introduce NICE +project\footnote{\url{https://nice.lgresearch.ai/}} and share the results and +outcomes of NICE challenge 2023. This project is designed to challenge the +computer vision community to develop robust image captioning models that +advance the state-of-the-art both in terms of accuracy and fairness. Through +the challenge, the image captioning models were tested using a new evaluation +dataset that includes a large variety of visual concepts from many domains. +There was no specific training data provided for the challenge, and therefore +the challenge entries were required to adapt to new types of image descriptions +that had not been seen during training. This report includes information on the +newly proposed NICE dataset, evaluation methods, challenge results, and +technical details of top-ranking entries. We expect that the outcomes of the +challenge will contribute to the improvement of AI models on various +vision-language tasks. + +
+
+ comment: Tech report, project page https://nice.lgresearch.ai/ +
+
+
+
+
+ + ♻ ☆ GCD-DDPM: A Generative Change Detection Model Based on + Difference-Feature Guided DDPM + + +
+ Deep learning (DL)-based methods have recently shown great promise in +bitemporal change detection (CD). However, most existing methods are +ineffective in simultaneously capturing long-range dependencies and exploiting +local spatial information, resulting in inaccurate CD maps with poorly discerned +edges. To overcome these obstacles, a novel Denoising Diffusion Probabilistic +Model (DDPM)-based generative CD approach called GCD-DDPM is proposed for +remote sensing data. More specifically, GCD-DDPM is designed to directly +generate CD maps by leveraging variational inference, which enables GCD-DDPM to +accurately distinguish subtle and irregular buildings or natural scenes from +the background. Furthermore, an adaptive calibration conditional difference +encoding technique is proposed for GCD-DDPM to enhance the CD map through +guided sampling of the differences among multi-level features. Finally, a noise +suppression-based semantic enhancer (NSSE) is devised to cope with the +high-frequency noise incurred in the CD map by capitalizing on the prior +knowledge derived from the current step. Extensive experiments on four CD +datasets, namely CDD, WHU, LEVIR, and GVLM, confirm the good performance of the +proposed GCD-DDPM. + +
+
+
+
+
+ + ♻ ☆ Bi-Mapper: Holistic BEV Semantic Mapping for Autonomous Driving + + +
+ A semantic map of the road scene, covering fundamental road elements, is an +essential ingredient in autonomous driving systems. It provides important +perception foundations for positioning and planning when rendered in the +Bird's-Eye-View (BEV). Currently, the prior knowledge of hypothetical depth can +guide the learning of translating front perspective views into BEV directly +with the help of calibration parameters. However, it suffers from geometric +distortions in the representation of distant objects. In addition, another +stream of methods without prior knowledge can learn the transformation between +front perspective views and BEV implicitly with a global view. Considering that +the fusion of different learning methods may bring surprising beneficial +effects, we propose a Bi-Mapper framework for top-down road-scene semantic +understanding, which incorporates a global view and local prior knowledge. To +enhance reliable interaction between them, an asynchronous mutual learning +strategy is proposed. At the same time, an Across-Space Loss (ASL) is designed +to mitigate the negative impact of geometric distortions. Extensive results on +nuScenes and Cam2BEV datasets verify the consistent effectiveness of each +module in the proposed Bi-Mapper framework. Compared with existing road mapping +networks, the proposed Bi-Mapper achieves 2.1% higher IoU on the nuScenes +dataset. Moreover, we verify the generalization performance of Bi-Mapper in a +real-world driving scenario. The source code is publicly available at +https://github.com/lynn-yu/Bi-Mapper. + +
+
+ comment: Accepted to IEEE Robotics and Automation Letters (RA-L). The source + code is publicly available at https://github.com/lynn-yu/Bi-Mapper +
+
+
+
+
+ + ♻ ☆ Learning in a Single Domain for Non-Stationary Multi-Texture Synthesis + + +
+ This paper aims for a new generation task: non-stationary multi-texture +synthesis, which unifies synthesizing multiple non-stationary textures in a +single model. Most non-stationary textures have large scale variance and can +hardly be synthesized through one model. To combat this, we propose a +multi-scale generator to capture structural patterns of various scales and +effectively synthesize textures at a minor cost. However, it is still hard to +handle textures of different categories with different texture patterns. +Therefore, we present a category-specific training strategy to focus on +learning the texture patterns of a specific domain. Interestingly, once trained, our +model is able to produce multi-pattern generations with dynamic variations +without the need to finetune the model for different styles. Moreover, an +objective evaluation metric is designed for evaluating the quality of texture +expansion and global structure consistency. To our knowledge, ours is the first +scheme for this challenging task, including model, training, and evaluation. +Experimental results demonstrate that the proposed method achieves superior +performance and time efficiency. The code will be available after +publication. + +
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality +for dental examination. However, PX only provides a flattened 2D image, lacking +a 3D view of the oral structure. In this paper, we propose a framework to +estimate 3D oral structures from real-world PX. Our framework tackles full 3D +reconstruction for varying subjects (patients) where each reconstruction is +based only on a single panoramic image. We create an intermediate +representation called simulated PX (SimPX) from 3D Cone-beam computed +tomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and +rotational principles of PX imaging. SimPX aims not only to truthfully +simulate PX, but also to facilitate the reverting process back to 3D data. We +propose a novel neural model based on ray tracing which exploits both global +and local input features to convert SimPX to 3D output. At inference, a real PX +image is translated to a SimPX-style image with semantic regularization, and +the translated image is processed by a generation module to produce high-quality +outputs. Experiments show that our method outperforms the prior state-of-the-art in +reconstruction tasks both quantitatively and qualitatively. Unlike prior +methods, our method does not require any prior information, such as the shape of +dental arches, nor a matched PX-CBCT dataset for training, which is difficult +to obtain in clinical practice. + +
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ TSTTC: A Large-Scale Dataset for Time-to-Contact Estimation in Driving + Scenarios + + +
+ Time-to-Contact (TTC) estimation is a critical task for assessing collision +risk and is widely used in various driver assistance and autonomous driving +systems. The past few decades have witnessed the development of related theories +and algorithms. The prevalent learning-based methods call for a large-scale TTC +dataset in real-world scenarios. In this work, we present a large-scale +object-oriented TTC dataset in the driving scene to promote TTC estimation with a +monocular camera. To collect valuable samples and make data with different TTC +values relatively balanced, we go through thousands of hours of driving data +and select over 200K sequences with a preset data distribution. To augment the +quantity of small TTC cases, we also generate clips using the latest neural +rendering methods. Additionally, we provide several simple yet effective TTC +estimation baselines and evaluate them extensively on the proposed dataset to +demonstrate their effectiveness. The proposed dataset is publicly available at +https://open-dataset.tusen.ai/TSTTC. + +
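The abstract does not spell out its baselines, but the classic monocular TTC relation gives a feel for the task: if an approaching object's image size grows by a factor s over dt seconds, then TTC is approximately dt / (s - 1). A tiny illustrative sketch of that relation, not the paper's code:

```python
def ttc_from_scale(box_height_prev, box_height_curr, dt):
    """Scale-based monocular TTC: the image of an approaching object grows over time."""
    s = box_height_curr / box_height_prev
    if s <= 1.0:
        return float("inf")   # object is not closing in
    return dt / (s - 1.0)

# A bounding box growing from 40 px to 44 px over 0.1 s gives roughly 1.0 s to contact.
print(ttc_from_scale(40.0, 44.0, 0.1))
```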
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Layout and Task Aware Instruction Prompt for Zero-shot Document Image + Question Answering + + +
+ The pre-training-fine-tuning paradigm based on layout-aware multimodal +pre-trained models has achieved significant progress on document image question +answering. However, domain pre-training and task fine-tuning for additional +visual, layout, and task modules prevent them from directly utilizing +off-the-shelf instruction-tuning language foundation models, which have +recently shown promising potential in zero-shot learning. Contrary to aligning +language models to the domain of document image question answering, we align +document image question answering to off-the-shelf instruction-tuning language +foundation models to utilize their zero-shot capability. Specifically, we +propose a layout- and task-aware instruction prompt called LATIN-Prompt, which +consists of layout-aware document content and task-aware descriptions. The +former recovers the layout information among text segments from OCR tools by +appropriate spaces and line breaks. The latter ensures that the model generates +answers that meet the requirements, especially format requirements, through a +detailed description of the task. Experimental results on three benchmarks show +that LATIN-Prompt can improve the zero-shot performance of instruction-tuning +language foundation models on document image question answering and help them +achieve comparable levels to SOTAs based on the pre-training-fine-tuning +paradigm. Quantitative analysis and qualitative analysis demonstrate the +effectiveness of LATIN-Prompt. We provide the code in the supplementary material and will +release it to facilitate future research. + +
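The layout-aware half of the prompt boils down to re-inserting the spatial structure that OCR flattens. A rough Python sketch of that idea, assuming OCR output as (text, x, y) tuples; the thresholds and heuristics are illustrative and not the released LATIN-Prompt code:

```python
def layout_aware_text(ocr_segments, char_width=10, line_gap=15):
    """Rebuild a rough page layout from OCR segments: line breaks between vertically
    separated segments, spaces proportional to horizontal gaps."""
    segments = sorted(ocr_segments, key=lambda s: (s[2], s[1]))   # sort by y, then x
    lines, current, prev_y = [], [], None
    for text, x, y in segments:
        if prev_y is not None and y - prev_y > line_gap:          # new visual line
            lines.append(current)
            current = []
        current.append((text, x))
        prev_y = y
    lines.append(current)
    out_lines = []
    for line in lines:
        cursor, parts = 0, []
        for text, x in line:
            parts.append(" " * max(1, int((x - cursor) / char_width)))  # horizontal gap as spaces
            parts.append(text)
            cursor = x + len(text) * char_width
        out_lines.append("".join(parts).rstrip())
    return "\n".join(out_lines)

print(layout_aware_text([("Invoice", 40, 10), ("Total:", 40, 40), ("$120", 200, 42)]))
```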
+
+ comment: Adds LATIN-Tuning for Alpaca. Code is available at + https://github.com/WenjinW/LATIN-Prompt +
+
+
+
+
+ + ♻ ☆ Holistically-Attracted Wireframe Parsing: From Supervised to + Self-Supervised Learning + + +
+ This article presents Holistically-Attracted Wireframe Parsing (HAWP), a +method for geometric analysis of 2D images containing wireframes formed by line +segments and junctions. HAWP utilizes a parsimonious Holistic Attraction (HAT) +field representation that encodes line segments using a closed-form 4D +geometric vector field. The proposed HAWP consists of three sequential +components empowered by end-to-end and HAT-driven designs: (1) generating a +dense set of line segments from HAT fields and endpoint proposals from +heatmaps, (2) binding the dense line segments to sparse endpoint proposals to +produce initial wireframes, and (3) filtering false positive proposals through +a novel endpoint-decoupled line-of-interest aligning (EPD LOIAlign) module that +captures the co-occurrence between endpoint proposals and HAT fields for better +verification. Thanks to our novel designs, HAWPv2 shows strong performance in +fully supervised learning, while HAWPv3 excels in self-supervised learning, +achieving superior repeatability scores and efficient training (24 GPU hours on +a single GPU). Furthermore, HAWPv3 exhibits a promising potential for wireframe +parsing in out-of-distribution images without providing ground truth labels of +wireframes. + +
+
+ comment: Journal extension of arXiv:2003.01663; Accepted by IEEE TPAMI; Code + is available at https://github.com/cherubicxn/hawp +
+
+
+
+
+ + ♻ ☆ Generative Action Description Prompts for Skeleton-based Action + Recognition ICCV23 + + +
+ Skeleton-based action recognition has recently received considerable +attention. Current approaches to skeleton-based action recognition are +typically formulated as one-hot classification tasks and do not fully exploit +the semantic relations between actions. For example, "make victory sign" and +"thumb up" are two actions of hand gestures, whose major difference lies in the +movement of hands. This information is not captured by the categorical one-hot +encoding of action classes but can be unveiled from the action description. +Therefore, utilizing action descriptions in training could potentially benefit +representation learning. In this work, we propose a Generative +Action-description Prompts (GAP) approach for skeleton-based action +recognition. More specifically, we employ a pre-trained large-scale language +model as the knowledge engine to automatically generate text descriptions for +the body-part movements of actions, and propose a multi-modal training scheme by +utilizing the text encoder to generate feature vectors for different body parts +and supervise the skeleton encoder for action representation learning. +Experiments show that our proposed GAP method achieves noticeable improvements +over various baseline models without extra computation cost at inference. GAP +achieves new state-of-the-art results on popular skeleton-based action recognition +benchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is +available at https://github.com/MartinXM/GAP. + +
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ♻ ☆ Refined Temporal Pyramidal Compression-and-Amplification Transformer for + 3D Human Pose Estimation + + +
+ Accurately estimating the 3D pose of humans in video sequences requires both +accuracy and a well-structured architecture. With the success of transformers, +we introduce the Refined Temporal Pyramidal Compression-and-Amplification +(RTPCA) transformer. Exploiting the temporal dimension, RTPCA extends +intra-block temporal modeling via its Temporal Pyramidal +Compression-and-Amplification (TPCA) structure and refines inter-block feature +interaction with a Cross-Layer Refinement (XLR) module. In particular, the TPCA +block exploits a temporal pyramid paradigm, reinforcing key and value +representation capabilities and seamlessly extracting spatial semantics from +motion sequences. We stitch these TPCA blocks with XLR, which promotes rich +semantic representation through continuous interaction of queries, keys, and +values. This strategy embodies early-stage information with current flows, +addressing typical deficits in detail and stability seen in other +transformer-based methods. We demonstrate the effectiveness of RTPCA by +achieving state-of-the-art results on Human3.6M, HumanEva-I, and MPI-INF-3DHP +benchmarks with minimal computational overhead. The source code is available at +https://github.com/hbing-l/RTPCA. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Instant Continual Learning of Neural Radiance Fields + + +
+ Neural radiance fields (NeRFs) have emerged as an effective method for +novel-view synthesis and 3D scene reconstruction. However, conventional +training methods require access to all training views during scene +optimization. This assumption may be prohibitive in continual learning +scenarios, where new data is acquired in a sequential manner and a continuous +update of the NeRF is desired, as in automotive or remote sensing applications. +When naively trained in such a continual setting, traditional scene +representation frameworks suffer from catastrophic forgetting, where previously +learned knowledge is corrupted after training on new data. Prior works in +alleviating forgetting with NeRFs suffer from low reconstruction quality and +high latency, making them impractical for real-world application. We propose a +continual learning framework for training NeRFs that leverages replay-based +methods combined with a hybrid explicit--implicit scene representation. Our +method outperforms previous methods in reconstruction quality when trained in a +continual setting, while having the additional benefit of being an order of +magnitude faster. + +
+
+ comment: For project page please visit https://ryanpo.com/icngp/ +
+
+
+
+
+ + ♻ ☆ Image Labels Are All You Need for Coarse Seagrass Segmentation + + +
+ Seagrass meadows serve as critical carbon sinks, but estimating the amount of +carbon they store requires knowledge of the seagrass species present. +Underwater and surface vehicles equipped with machine learning algorithms can +help to accurately estimate the composition and extent of seagrass meadows at +scale. However, previous approaches for seagrass detection and classification +have required supervision from patch-level labels. In this paper, we reframe +seagrass classification as a weakly supervised coarse segmentation problem +where image-level labels are used during training (25 times fewer labels +compared to patch-level labeling) and patch-level outputs are obtained at +inference time. To this end, we introduce SeaFeats, an architecture that uses +unsupervised contrastive pre-training and feature similarity, and SeaCLIP, a +model that showcases the effectiveness of large language models as a +supervisory signal in domain-specific applications. We demonstrate that an +ensemble of SeaFeats and SeaCLIP leads to highly robust performance. Our method +outperforms previous approaches that require patch-level labels on the +multi-species 'DeepSeagrass' dataset by 6.8% (absolute) for the class-weighted +F1 score, and by 12.1% (absolute) for the seagrass presence/absence F1 score on +the 'Global Wetlands' dataset. We also present two case studies for real-world +deployment: outlier detection on the Global Wetlands dataset, and application +of our method on imagery collected by the FloatyBoat autonomous surface +vehicle. + +
+
+ comment: 10 pages, 4 figures, additional 3 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ 3D Neural Embedding Likelihood: Probabilistic Inverse Graphics for + Robust 6D Pose Estimation ICCV 2023 + + +
+ The ability to perceive and understand 3D scenes is crucial for many +applications in computer vision and robotics. Inverse graphics is an appealing +approach to 3D scene understanding that aims to infer the 3D scene structure +from 2D images. In this paper, we introduce probabilistic modeling to the +inverse graphics framework to quantify uncertainty and achieve robustness in 6D +pose estimation tasks. Specifically, we propose 3D Neural Embedding Likelihood +(3DNEL) as a unified probabilistic model over RGB-D images, and develop +efficient inference procedures on 3D scene descriptions. 3DNEL effectively +combines learned neural embeddings from RGB with depth information to improve +robustness in sim-to-real 6D object pose estimation from RGB-D images. +Performance on the YCB-Video dataset is on par with state-of-the-art yet is +much more robust in challenging regimes. In contrast to discriminative +approaches, 3DNEL's probabilistic generative formulation jointly models +multiple objects in a scene, quantifies uncertainty in a principled way, and +handles object pose tracking under heavy occlusion. Finally, 3DNEL provides a +principled framework for incorporating prior knowledge about the scene and +objects, which allows natural extension to additional tasks like camera pose +tracking from video. + +
+
+ comment: ICCV 2023 camera ready +
+
+
+
+
+ + ♻ ☆ Event-based Human Pose Tracking by Spiking Spatiotemporal Transformer + + +
+ The event camera, as an emerging biologically-inspired vision sensor for +capturing motion dynamics, presents new potential for 3D human pose tracking, +or video-based 3D human pose estimation. However, existing works in pose +tracking either require the presence of additional gray-scale images to +establish a solid starting pose, or ignore the temporal dependencies altogether +by collapsing segments of event streams to form static event frames. +Meanwhile, although the effectiveness of Artificial Neural Networks (ANNs, +a.k.a. dense deep learning) has been showcased in many event-based tasks, the +use of ANNs tends to neglect the fact that, compared to dense frame-based +image sequences, the occurrence of events from an event camera is +spatiotemporally much sparser. Motivated by the above-mentioned issues, we +present in this paper a dedicated end-to-end sparse deep learning approach for +event-based pose tracking: 1) to our knowledge this is the first time that 3D +human pose tracking is obtained from events only, thus eliminating the need to +access any frame-based images as part of the input; 2) our approach is based +entirely upon the framework of Spiking Neural Networks (SNNs), which consists +of Spike-Element-Wise (SEW) ResNet and a novel Spiking Spatiotemporal +Transformer; 3) a large-scale synthetic dataset, named SynEventHPD, is constructed that features a +broad and diverse set of annotated 3D human motions, as well as longer hours of +event stream data. Empirical experiments demonstrate that, +with superior performance over the state-of-the-art (SOTA) ANN counterparts, +our approach also achieves a significant computation reduction of 80% in FLOPS. +Furthermore, our proposed method also outperforms SOTA SNNs in the regression +task of human pose tracking. Our implementation is available at +https://github.com/JimmyZou/HumanPoseTracking_SNN and the dataset will be released +upon paper acceptance. + +
+
+
+
+
+ + ♻ ☆ Event-based Stereo Visual Odometry with Native Temporal Resolution via + Continuous-time Gaussian Process Regression + + +
+ Event-based cameras asynchronously capture individual visual changes in a +scene. This makes them more robust than traditional frame-based cameras to +highly dynamic motions and poor illumination. It also means that every +measurement in a scene can occur at a unique time. + Handling these different measurement times is a major challenge of using +event-based cameras. It is often addressed in visual odometry (VO) pipelines by +approximating temporally close measurements as occurring at one common time. +This grouping simplifies the estimation problem but, absent additional sensors, +sacrifices the inherent temporal resolution of event-based cameras. + This paper instead presents a complete stereo VO pipeline that estimates +directly with individual event-measurement times without requiring any grouping +or approximation in the estimation state. It uses continuous-time trajectory +estimation to maintain the temporal fidelity and asynchronous nature of +event-based cameras through Gaussian process regression with a physically +motivated prior. Its performance is evaluated on the MVSEC dataset, where it +achieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences, +outperforming the existing publicly available event-based stereo VO pipeline by +two and four times, respectively. + +
+
+ comment: To appear in IEEE Robotics and Automation Letters (RA-L). 8 pages, 4 + figures. DOI: 10.1109/LRA.2023.3311374 +
+
+
+
+
+ + ♻ ☆ A Sparse Graph Formulation for Efficient Spectral Image Segmentation + + +
+ Spectral Clustering is one of the most traditional methods to solve +segmentation problems. Based on Normalized Cuts, it aims at partitioning an +image using an objective function defined by a graph. Despite their +mathematical attractiveness, spectral approaches are traditionally neglected by +the scientific community due to their practical issues and underperformance. In +this paper, we adopt a sparse graph formulation based on the addition of extra +nodes to a simple grid graph. While the grid encodes the pixel spatial +disposition, the extra nodes account for the pixel color data. Applying the +original Normalized Cuts algorithm to this graph leads to a simple and scalable +method for spectral image segmentation, with an interpretable solution. Our +experiments also demonstrate that our proposed methodology outperforms both +traditional and modern unsupervised algorithms for segmentation on both real +and synthetic data. + +
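The construction is simple enough to sketch: pixels form a 4-connected grid, a handful of extra nodes (here one per quantized intensity level, a simplification of the paper's color nodes) connect each pixel to the level it resembles, and normalized-cuts-style spectral clustering runs on the resulting sparse graph. A minimal illustration with scikit-learn, assuming a grayscale image in [0, 1]; parameters are illustrative, not the paper's:

```python
import numpy as np
from scipy.sparse import lil_matrix
from sklearn.cluster import SpectralClustering

def segment(image, n_color_nodes=16, n_segments=4):
    h, w = image.shape
    n_pix = h * w
    A = lil_matrix((n_pix + n_color_nodes, n_pix + n_color_nodes))
    idx = lambda r, c: r * w + c
    for r in range(h):                       # plain loops for clarity, not speed
        for c in range(w):
            if c + 1 < w:                    # 4-connected grid edges
                A[idx(r, c), idx(r, c + 1)] = A[idx(r, c + 1), idx(r, c)] = 1.0
            if r + 1 < h:
                A[idx(r, c), idx(r + 1, c)] = A[idx(r + 1, c), idx(r, c)] = 1.0
            k = n_pix + int(image[r, c] * (n_color_nodes - 1))   # nearest intensity node
            A[idx(r, c), k] = A[k, idx(r, c)] = 1.0
    A.setdiag(1.0)                           # keep unused color nodes from having zero degree
    labels = SpectralClustering(n_clusters=n_segments, affinity="precomputed",
                                assign_labels="discretize",
                                random_state=0).fit_predict(A.toarray())
    return labels[:n_pix].reshape(h, w)

print(segment(np.random.rand(16, 16)))
```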
+
+
+
+
+ + ♻ ☆ Learning Representations that Support Extrapolation ICML 2020 + + +
+ Extrapolation -- the ability to make inferences that go beyond the scope of +one's experiences -- is a hallmark of human intelligence. By contrast, the +generalization exhibited by contemporary neural network algorithms is largely +limited to interpolation between data points in their training corpora. In this +paper, we consider the challenge of learning representations that support +extrapolation. We introduce a novel visual analogy benchmark that allows the +graded evaluation of extrapolation as a function of distance from the convex +domain defined by the training data. We also introduce a simple technique, +temporal context normalization, that encourages representations that emphasize +the relations between objects. We find that this technique enables a +significant improvement in the ability to extrapolate, considerably +outperforming a number of competitive techniques. + +
+
+ comment: ICML 2020 +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Impression-Informed Multi-Behavior Recommender System: A Hierarchical + Graph Attention Approach + + +
+ While recommender systems have significantly benefited from implicit +feedback, they have often missed the nuances of multi-behavior interactions +between users and items. Historically, these systems either amalgamated all +behaviors, such as \textit{impression} (formerly \textit{view}), +\textit{add-to-cart}, and \textit{buy}, under a singular 'interaction' label, +or prioritized only the target behavior, often the \textit{buy} action, +discarding valuable auxiliary signals. Although recent advancements tried +addressing this simplification, they primarily gravitated towards optimizing +the target behavior alone, battling with data scarcity. Additionally, they +tended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these +gaps, we introduce the \textbf{H}ierarchical \textbf{M}ulti-behavior +\textbf{G}raph Attention \textbf{N}etwork (HMGN). This pioneering framework +leverages attention mechanisms to discern information from both inter and +intra-behaviors while employing a multi-task Hierarchical Bayesian Personalized +Ranking (HBPR) for optimization. Recognizing the need for scalability, our +approach integrates a specialized multi-behavior sub-graph sampling technique. +Moreover, the adaptability of HMGN allows for the seamless inclusion of +knowledge metadata and time-series data. Empirical results attest to our +model's prowess, registering a notable performance boost of up to 64\% in +NDCG@100 metrics over conventional graph neural network methods. + +
+
+
+
+
+ + ☆ Helper Recommendation with seniority control in Online Health Community + + +
+ Online health communities (OHCs) are forums where patients with similar +conditions communicate their experiences and provide moral support. Social +support in OHCs plays a crucial role in easing and rehabilitating patients. +However, many time-sensitive questions from patients often remain unanswered +due to the multitude of threads and the random nature of patient visits in +OHCs. To address this issue, it is imperative to propose a recommender system +that assists solution seekers in finding appropriate problem helpers. +Nevertheless, developing a recommendation algorithm to enhance social support +in OHCs remains an under-explored area. Traditional recommender systems cannot +be directly adapted due to the following obstacles. First, unlike user-item +links in traditional recommender systems, it is hard to model the social +support behind helper-seeker links in OHCs since they are formed based on +various heterogeneous reasons. Second, it is difficult to distinguish the +impact of historical activities in characterizing patients. Third, it is +significantly challenging to ensure that the recommended helpers possess +sufficient expertise to assist the seekers. To tackle the aforementioned +challenges, we develop a Monotonically regularIzed diseNTangled Variational +Autoencoders (MINT) model to strengthen social support in OHCs. + +
+
+
+
+
+ + ☆ Prompt-based Effective Input Reformulation for Legal Case Retrieval + + +
+ Legal case retrieval plays an important role for legal practitioners to +effectively retrieve relevant cases given a query case. Most existing neural +legal case retrieval models directly encode the whole legal text of a case to +generate a case representation, which is then utilised to conduct a nearest +neighbour search for retrieval. Although these straightforward methods have +achieved improvement over conventional statistical methods in retrieval +accuracy, two significant challenges are identified in this paper: (1) Legal +feature alignment: the usage of the whole case text as the input will generally +incorporate redundant and noisy information because, from the legal +perspective, the determining factor of relevant cases is the alignment of key +legal features instead of whole text matching; (2) Legal context preservation: +furthermore, since the existing text encoding models usually have an input +length limit shorter than the case, the whole case text needs to be truncated +or divided into paragraphs, which leads to the loss of the global context of +legal information. In this paper, a novel legal case retrieval framework, +PromptCase, is proposed to tackle these challenges. Firstly, legal facts and +legal issues are identified and formally defined as the key features +facilitating legal case retrieval based on a thorough study of the definition +of relevant cases from a legal perspective. Secondly, with the determining +legal features, a prompt-based encoding scheme is designed to conduct an +effective encoding with language models. Extensive zero-shot experiments have +been conducted on two benchmark datasets in legal case retrieval, which +demonstrate the superior retrieval effectiveness of the proposed PromptCase. +The code has been released on https://github.com/yanran-tang/PromptCase. + +
+
+
+
+
+ + ♻ ☆ DynED: Dynamic Ensemble Diversification in Data Stream Classification CIKM '23 + + +
+ Ensemble methods are commonly used in classification due to their remarkable +performance. Achieving high accuracy in a data stream environment is a +challenging task considering disruptive changes in the data distribution, also +known as concept drift. A greater diversity of ensemble components is known to +enhance prediction accuracy in such settings. Despite the diversity of +components within an ensemble, not all contribute as expected to its overall +performance. This necessitates a method for selecting components that exhibit +high performance and diversity. We present a novel ensemble construction and +maintenance approach based on MMR (Maximal Marginal Relevance) that dynamically +combines the diversity and prediction accuracy of components during the process +of structuring an ensemble. The experimental results on four real and 11 +synthetic datasets demonstrate that the proposed approach (DynED) provides a +higher average mean accuracy compared to five state-of-the-art baselines. + +
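MMR-style selection is easy to sketch: each new component is chosen to balance its own accuracy against its similarity to the components already kept. A minimal illustration, not DynED's actual code, with hypothetical `accuracy` scores and a `similarity` matrix:

```python
import numpy as np

def mmr_select(accuracy, similarity, k, lam=0.7):
    """Pick k components, trading accuracy against similarity to those already kept."""
    selected = [int(np.argmax(accuracy))]
    candidates = set(range(len(accuracy))) - set(selected)
    while len(selected) < k and candidates:
        score = lambda c: lam * accuracy[c] - (1 - lam) * max(similarity[c][s] for s in selected)
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

acc = np.array([0.90, 0.88, 0.87, 0.70])
sim = np.array([[1.0, 0.9, 0.2, 0.1],
                [0.9, 1.0, 0.3, 0.2],
                [0.2, 0.3, 1.0, 0.4],
                [0.1, 0.2, 0.4, 1.0]])
print(mmr_select(acc, sim, k=3))   # prefers accurate but mutually dissimilar components
```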
+
+ comment: Proceedings of the 32nd ACM International Conference on Information + and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United + Kingdom +
+
+
+
+
+ + ♻ ☆ A Diffusion model for POI recommendation + + +
+ Next Point-of-Interest (POI) recommendation is a critical task in +location-based services that aim to provide personalized suggestions for the +user's next destination. Previous works on POI recommendation have focused +on modeling the user's spatial preference. However, existing works that +leverage spatial information are only based on the aggregation of users' +previously visited positions, which discourages the model from recommending POIs +in novel areas. This trait of position-based methods will harm the model's +performance in many situations. Additionally, incorporating sequential +information into the user's spatial preference remains a challenge. In this +paper, we propose Diff-POI: a Diffusion-based model that samples the user's +spatial preference for the next POI recommendation. Inspired by the wide +application of diffusion algorithms in sampling from distributions, Diff-POI +encodes the user's visiting sequence and spatial character with two +tailor-designed graph encoding modules, followed by a diffusion-based sampling +strategy to explore the user's spatial visiting trends. We leverage the +diffusion process and its reversed form to sample from the posterior +distribution and optimize the corresponding score function. We design a joint +training and inference framework to optimize and evaluate the proposed +Diff-POI. Extensive experiments on four real-world POI recommendation datasets +demonstrate the superiority of our Diff-POI over state-of-the-art baseline +methods. Further ablation and parameter studies on Diff-POI reveal the +functionality and effectiveness of the proposed diffusion-based sampling +strategy for addressing the limitations of existing methods. + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Exploratory Learning-Aided Community Detection + in Networks with Unknown Topology CIKM 2022 + + +
+ In social networks, the discovery of community structures has received +considerable attention as a fundamental problem in various network analysis +tasks. However, due to privacy concerns or access restrictions, the network +structure is often unknown, thereby rendering established community detection +approaches ineffective without costly network topology acquisition. To tackle +this challenge, we present META-CODE, a unified framework for detecting +overlapping communities in networks with unknown topology via exploratory +learning aided by easy-to-collect node metadata. Specifically, META-CODE +consists of three iterative steps in addition to the initial network inference +step: 1) node-level community-affiliation embeddings based on graph neural +networks (GNNs) trained by our new reconstruction loss, 2) network exploration +via community-affiliation-based node queries, and 3) network inference using an +edge connectivity-based Siamese neural network model from the explored network. +Through extensive experiments on five real-world datasets including two large +networks, we demonstrated: (a) the superiority of META-CODE over benchmark +community detection methods, achieving remarkable gains up to 151.27% compared +to the best existing competitor, (b) the impact of each module in META-CODE, +(c) the effectiveness of node queries in META-CODE based on empirical +evaluations and theoretical findings, (d) the convergence of the inferred +network, and (e) the computational efficiency of META-CODE. + +
+
+ comment: 16 pages, 9 figures, 6 tables; its conference version was presented + at the ACM International Conference on Information and Knowledge Management + (CIKM 2022) +
+
+
+
+
+ + ♻ ☆ Ducho: A Unified Framework for the Extraction of Multimodal Features in + Recommendation + + +
+ In multimodal-aware recommendation, the extraction of meaningful multimodal +features is the basis of high-quality recommendations. Generally, each +recommendation framework implements its multimodal extraction procedures with +specific strategies and tools. This is limiting for two reasons: (i) different +extraction strategies do not ease the interdependence among multimodal +recommendation frameworks; thus, they cannot be efficiently and fairly +compared; (ii) given the plethora of pre-trained deep learning models +made available by different open source tools, model designers do not have +access to shared interfaces to extract features. Motivated by the outlined +aspects, we propose Ducho, a unified framework for the extraction of +multimodal features in recommendation. By integrating three widely-adopted deep +learning libraries as backends, namely, TensorFlow, PyTorch, and Transformers, +we provide a shared interface to extract and process features where each +backend's specific methods are abstracted to the end user. Notably, the +extraction pipeline is easily configurable with a YAML-based file where the +user can specify, for each modality, the list of models (and their specific +backends/parameters) to perform the extraction. Finally, to make Ducho +accessible to the community, we build a public Docker image equipped with a +ready-to-use CUDA environment and propose three demos to test its +functionalities for different scenarios and tasks. The GitHub repository and +the documentation are accessible at this link: +https://github.com/sisinflab/Ducho. + +
+
+
+
+
+ + ♻ ☆ A Survey of Multimodal Information Fusion for Smart Healthcare: Mapping + the Journey from Data to Wisdom + + +
+ Multimodal medical data fusion has emerged as a transformative approach in +smart healthcare, enabling a comprehensive understanding of patient health and +personalized treatment plans. In this paper, a journey from data to information +to knowledge to wisdom (DIKW) is explored through multimodal fusion for smart +healthcare. We present a comprehensive review of multimodal medical data fusion +focused on the integration of various data modalities. The review explores +different approaches such as feature selection, rule-based systems, machine +learning, deep learning, and natural language processing, for fusing and +analyzing multimodal data. This paper also highlights the challenges associated +with multimodal fusion in healthcare. By synthesizing the reviewed frameworks +and theories, it proposes a generic framework for multimodal medical data +fusion that aligns with the DIKW model. Moreover, it discusses future +directions related to the four pillars of healthcare: Predictive, Preventive, +Personalized, and Participatory approaches. The components of the comprehensive +survey presented in this paper form the foundation for more successful +implementation of multimodal fusion in smart healthcare. Our findings can guide +researchers and practitioners in leveraging the power of multimodal fusion with +the state-of-the-art approaches to revolutionize healthcare and improve patient +outcomes. + +
+
+ comment: This work has been submitted to the ELSEVIER for possible + publication. Copyright may be transferred without notice, after which this + version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ MvFS: Multi-view Feature Selection for Recommender System CIKM 2023 + + +
+ Feature selection, which is a technique to select key features in recommender +systems, has received increasing research attention. Recently, Adaptive Feature +Selection (AdaFS) has shown remarkable performance by adaptively selecting +features for each data instance, considering that the importance of a given +feature field can vary significantly across data. However, this method still +has limitations in that its selection process could be easily biased to major +features that frequently occur. To address these problems, we propose +Multi-view Feature Selection (MvFS), which selects informative features for +each instance more effectively. Most importantly, MvFS employs a multi-view +network consisting of multiple sub-networks, each of which learns to measure +the feature importance of a part of data with different feature patterns. By +doing so, MvFS mitigates the bias problem towards dominant patterns and +promotes a more balanced feature selection process. Moreover, MvFS adopts an +effective importance score modeling strategy which is applied independently to +each field without incurring dependency among features. Experimental results on +real-world datasets demonstrate the effectiveness of MvFS compared to +state-of-the-art baselines. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 149 + +
+
+
+ + ☆ Matcha-TTS: A fast TTS architecture with conditional flow matching ICASSP 2024 + + +
+ We introduce Matcha-TTS, a new encoder-decoder architecture for speedy TTS +acoustic modelling, trained using optimal-transport conditional flow matching +(OT-CFM). This yields an ODE-based decoder capable of high output quality in +fewer synthesis steps than models trained using score matching. Careful design +choices additionally ensure each synthesis step is fast to run. The method is +probabilistic, non-autoregressive, and learns to speak from scratch without +external alignments. Compared to strong pre-trained baseline models, the +Matcha-TTS system has the smallest memory footprint, rivals the speed of the +fastest models on long utterances, and attains the highest mean opinion score +in a listening test. Please see https://shivammehta25.github.io/Matcha-TTS/ for +audio examples, code, and pre-trained models. + +
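For readers unfamiliar with OT-CFM, the training objective is compact: sample a noise point x0 and a time t, move along the straight (optimal-transport) path toward the data x1, and regress the network onto the constant target velocity. Below is a minimal PyTorch sketch of that loss as it is commonly formulated; the `vector_field` signature and conditioning are assumptions for illustration, not Matcha-TTS's actual API.

```python
import torch

def ot_cfm_loss(vector_field, x1, cond, sigma_min=1e-4):
    """One OT-CFM training step: regress the model onto the straight-path velocity."""
    x0 = torch.randn_like(x1)                                       # noise sample
    t = torch.rand(x1.shape[0], *([1] * (x1.dim() - 1)), device=x1.device)
    x_t = (1.0 - (1.0 - sigma_min) * t) * x0 + t * x1               # point on the OT path
    u_t = x1 - (1.0 - sigma_min) * x0                               # constant target velocity
    v_pred = vector_field(x_t, t, cond)                             # network prediction
    return torch.mean((v_pred - u_t) ** 2)
```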
+
+ comment: 5 pages, 3 figures. Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Blink: Link Local Differential Privacy in Graph Neural Networks via + Bayesian Estimation CCS 2023 + + +
+ Graph neural networks (GNNs) have gained an increasing amount of popularity +due to their superior capability in learning node embeddings for various graph +inference tasks, but training them can raise privacy concerns. To address this, +we propose using link local differential privacy over decentralized nodes, +enabling collaboration with an untrusted server to train GNNs without revealing +the existence of any link. Our approach spends the privacy budget separately on +links and degrees of the graph for the server to better denoise the graph +topology using Bayesian estimation, alleviating the negative impact of LDP on +the accuracy of the trained GNNs. We bound the mean absolute error of the +inferred link probabilities against the ground truth graph topology. We then +propose two variants of our LDP mechanism complementing each other in different +privacy settings, one of which estimates fewer links under lower privacy +budgets to avoid false positive link estimates when the uncertainty is high, +while the other utilizes more information and performs better given relatively +higher privacy budgets. Furthermore, we propose a hybrid variant that combines +both strategies and is able to perform better across different privacy budgets. +Extensive experiments show that our approach outperforms existing methods in +terms of accuracy under varying privacy budgets. + +
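The client-side half of such a scheme can be sketched with standard LDP primitives: randomized response on a node's adjacency bits plus a Laplace-noised degree, with the privacy budget split between the two. The snippet below is illustrative only; the paper's actual mechanism and its Bayesian server-side denoising are more involved.

```python
import numpy as np

def perturb_adjacency(adj_row, eps_links, eps_degree, seed=0):
    """Client-side sketch: randomized response on one node's adjacency bits
    plus a Laplace-noised degree (degree has sensitivity 1)."""
    rng = np.random.default_rng(seed)
    p_truth = np.exp(eps_links) / (np.exp(eps_links) + 1.0)   # prob. of reporting a bit honestly
    flips = rng.random(adj_row.shape) > p_truth
    noisy_row = np.where(flips, 1 - adj_row, adj_row)
    noisy_degree = adj_row.sum() + rng.laplace(scale=1.0 / eps_degree)
    return noisy_row, noisy_degree

# Example: a node with 5 neighbours out of 100, budget split evenly between links and degree.
row = np.zeros(100, dtype=int)
row[:5] = 1
print(perturb_adjacency(row, eps_links=1.0, eps_degree=1.0))
```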
+
+ comment: 17 pages, accepted by ACM CCS 2023 as a conference paper +
+
+
+
+
+ + ☆ SLiMe: Segment Like Me + + +
+ Significant strides have been made using large vision-language models, like +Stable Diffusion (SD), for a variety of downstream tasks, including image +editing, image correspondence, and 3D shape generation. Inspired by these +advancements, we explore leveraging these extensive vision-language models for +segmenting images at any desired granularity using as few as one annotated +sample by proposing SLiMe. SLiMe frames this problem as an optimization task. +Specifically, given a single training image and its segmentation mask, we first +extract attention maps, including our novel "weighted accumulated +self-attention map", from the SD prior. Then, using the extracted attention +maps, the text embeddings of Stable Diffusion are optimized such that each of +them learns about a single segmented region from the training image. These +learned embeddings then highlight the segmented region in the attention maps, +which in turn can then be used to derive the segmentation map. This enables +SLiMe to segment any real-world image during inference with the granularity of +the segmented region in the training image, using just one example. Moreover, +leveraging additional training data when available, i.e., the few-shot setting, improves the +performance of SLiMe. We carried out a knowledge-rich set of experiments +examining various design factors and showed that SLiMe outperforms other +existing one-shot and few-shot segmentation methods. + +
+
+
+
+
+ + ☆ 3D Object Positioning Using Differentiable Multimodal Learning + + +
+ This article describes a multi-modal method using simulated Lidar data via +ray tracing and image pixel loss with differentiable rendering to optimize an +object's position with respect to an observer or some referential objects in a +computer graphics scene. Object position optimization is completed using +gradient descent, with the loss function being influenced by both modalities. +Typical object placement optimization is done using image pixel loss with +differentiable rendering only; this work shows that the use of a second modality +(Lidar) leads to faster convergence. This method of fusing sensor input +is potentially useful for autonomous vehicles, as these methods can +be used to establish the locations of multiple actors in a scene. This article +also presents a method for the simulation of multiple types of data to be used +in the training of autonomous vehicles. + +
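The optimization loop itself is ordinary gradient descent over the object's position with a weighted sum of the two modality losses. A hedged PyTorch sketch, where `render_image` and `render_lidar` stand in for a differentiable renderer and a differentiable ray tracer (assumed callables, not a real API):

```python
import torch

def optimize_position(render_image, render_lidar, target_image, target_lidar,
                      init_pos, steps=200, lr=1e-2, w_img=1.0, w_lidar=1.0):
    """Gradient descent on an object's position using a weighted image + Lidar loss."""
    pos = init_pos.clone().requires_grad_(True)
    opt = torch.optim.Adam([pos], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        loss = (w_img * torch.mean((render_image(pos) - target_image) ** 2) +
                w_lidar * torch.mean((render_lidar(pos) - target_lidar) ** 2))
        loss.backward()      # both renderers must be differentiable w.r.t. pos
        opt.step()
    return pos.detach()
```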
+
+ comment: 7 pages, 8 figures +
+
+
+
+
+ + ☆ GPT-InvestAR: Enhancing Stock Investment Strategies through Annual + Report Analysis with Large Language Models + + +
+ Annual Reports of publicly listed companies contain vital information about +their financial health which can help assess the potential impact on the firm's stock +price. These reports are comprehensive in nature, going up to, and +sometimes exceeding, 100 pages. Analysing these reports is cumbersome even for +a single firm, let alone the whole universe of firms that exist. Over the +years, financial experts have become proficient in extracting valuable +information from these documents relatively quickly. However, this requires +years of practice and experience. This paper aims to simplify the process of +assessing Annual Reports of all the firms by leveraging the capabilities of +Large Language Models (LLMs). The insights generated by the LLM are compiled in +a quant-styled dataset and augmented by historical stock price data. A Machine +Learning model is then trained with LLM outputs as features. The walk-forward +test results show promising outperformance with respect to S&P 500 returns. This paper +intends to provide a framework for future work in this direction. To facilitate +this, the code has been released as open source. + +
+
+
+
+
+ + ☆ Impression-Informed Multi-Behavior Recommender System: A Hierarchical + Graph Attention Approach + + +
+ While recommender systems have significantly benefited from implicit +feedback, they have often missed the nuances of multi-behavior interactions +between users and items. Historically, these systems either amalgamated all +behaviors, such as \textit{impression} (formerly \textit{view}), +\textit{add-to-cart}, and \textit{buy}, under a singular 'interaction' label, +or prioritized only the target behavior, often the \textit{buy} action, +discarding valuable auxiliary signals. Although recent advancements tried +addressing this simplification, they primarily gravitated towards optimizing +the target behavior alone, battling with data scarcity. Additionally, they +tended to bypass the nuanced hierarchy intrinsic to behaviors. To bridge these +gaps, we introduce the \textbf{H}ierarchical \textbf{M}ulti-behavior +\textbf{G}raph Attention \textbf{N}etwork (HMGN). This pioneering framework +leverages attention mechanisms to discern information from both inter and +intra-behaviors while employing a multi-task Hierarchical Bayesian Personalized +Ranking (HBPR) for optimization. Recognizing the need for scalability, our +approach integrates a specialized multi-behavior sub-graph sampling technique. +Moreover, the adaptability of HMGN allows for the seamless inclusion of +knowledge metadata and time-series data. Empirical results attest to our +model's prowess, registering a notable performance boost of up to 64\% in +NDCG@100 metrics over conventional graph neural network methods. + +
+
+
+
+
+ + ☆ Split-Boost Neural Networks + + +
+ The calibration and training of a neural network is a complex and +time-consuming procedure that requires significant computational resources to +achieve satisfactory results. Key obstacles are a large number of +hyperparameters to select and the onset of overfitting in the face of a small +amount of data. In this framework, we propose an innovative training strategy +for feed-forward architectures - called split-boost - that improves performance +and automatically includes a regularizing behaviour without modeling it +explicitly. Such a novel approach ultimately allows us to avoid explicitly +modeling the regularization term, decreasing the total number of +hyperparameters and speeding up the tuning phase. The proposed strategy is +tested on a real-world (anonymized) dataset within a benchmark medical +insurance design problem. + +
+
+
+
+
+ + ☆ Learning to Recharge: UAV Coverage Path Planning through Deep + Reinforcement Learning + + +
+ Coverage path planning (CPP) is a critical problem in robotics, where the +goal is to find an efficient path that covers every point in an area of +interest. This work addresses the power-constrained CPP problem with recharge +for battery-limited unmanned aerial vehicles (UAVs). In this problem, a notable +challenge emerges from integrating recharge journeys into the overall coverage +strategy, highlighting the intricate task of making strategic, long-term +decisions. We propose a novel proximal policy optimization (PPO)-based deep +reinforcement learning (DRL) approach with map-based observations, utilizing +action masking and discount factor scheduling to optimize coverage trajectories +over the entire mission horizon. We further provide the agent with a position +history to handle emergent state loops caused by the recharge capability. Our +approach outperforms a baseline heuristic, generalizes to different target +zones and maps, with limited generalization to unseen maps. We offer valuable +insights into DRL algorithm design for long-horizon problems and provide a +publicly available software framework for the CPP problem. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Data-Driven Neural Polar Codes for Unknown Channels With and Without + Memory + + +
+ In this work, a novel data-driven methodology for designing polar codes for +channels with and without memory is proposed. The methodology is suitable for +the case where the channel is given as a "black-box" and the designer has +access to the channel for generating observations of its inputs and outputs, +but does not have access to the explicit channel model. The proposed method +leverages the structure of the successive cancellation (SC) decoder to devise a +neural SC (NSC) decoder. The NSC decoder uses neural networks (NNs) to replace +the core elements of the original SC decoder: the check-node, the bit-node and +the soft decision. Along with the NSC, we devise an additional NN that embeds the +channel outputs into the input space of the SC decoder. The proposed method is +supported by theoretical guarantees that include the consistency of the NSC. +Also, the NSC has computational complexity that does not grow with the channel +memory size. This sets its main advantage over the successive cancellation trellis +(SCT) decoder for finite state channels (FSCs), which has complexity +$O(|\mathcal{S}|^3 N\log N)$, where $|\mathcal{S}|$ denotes the number of +channel states. We demonstrate the performance of the proposed algorithms on +memoryless channels and on channels with memory. The empirical results are +compared with the optimal polar decoder, given by the SC and SCT decoders. We +further show that our algorithms are applicable in cases where the SC and +SCT decoders are not applicable. + +
+
+
+
+
+ + ☆ The Best Arm Evades: Near-optimal Multi-pass Streaming Lower Bounds for + Pure Exploration in Multi-armed Bandits + + +
+ We give a near-optimal sample-pass trade-off for pure exploration in +multi-armed bandits (MABs) via multi-pass streaming algorithms: any streaming +algorithm with sublinear memory that uses the optimal sample complexity of +$O(\frac{n}{\Delta^2})$ requires +$\Omega(\frac{\log{(1/\Delta)}}{\log\log{(1/\Delta)}})$ passes. Here, $n$ is +the number of arms and $\Delta$ is the reward gap between the best and the +second-best arms. Our result matches the $O(\log(\frac{1}{\Delta}))$-pass +algorithm of Jin et al. [ICML'21] (up to lower order terms) that only uses +$O(1)$ memory and answers an open question posed by Assadi and Wang [STOC'20]. + +
+
+
+
+
+ + ☆ Using Multiple Vector Channels Improves E(n)-Equivariant Graph Neural + Networks + + +
+ We present a natural extension to E(n)-equivariant graph neural networks that +uses multiple equivariant vectors per node. We formulate the extension and show +that it improves performance across different physical systems benchmark tasks, +with minimal differences in runtime or number of parameters. The proposed +multi-channel EGNN outperforms the standard single-channel EGNN on N-body charged +particle dynamics, molecular property predictions, and predicting the +trajectories of solar system bodies. Given the additional benefits and minimal +additional cost of the multi-channel EGNN, we suggest that this extension may be of +practical use to researchers working in machine learning for the physical +sciences. + +
+
+
+
+
+ + ☆ Detecting Manufacturing Defects in PCBs via Data-Centric Machine + Learning on Solder Paste Inspection Features + + +
+ Automated detection of defects in Printed Circuit Board (PCB) manufacturing +using Solder Paste Inspection (SPI) and Automated Optical Inspection (AOI) +machines can help improve operational efficiency and significantly reduce the +need for manual intervention. In this paper, using SPI-extracted features of 6 +million pins, we demonstrate a data-centric approach to train Machine Learning +(ML) models to detect PCB defects at three stages of PCB manufacturing. The 6 +million PCB pins correspond to 2 million components that belong to 15,387 PCBs. +Using a base extreme gradient boosting (XGBoost) ML model, we iterate on the +data pre-processing step to improve detection performance. Combining pin-level +SPI features using component and PCB IDs, we developed training instances also +at the component and PCB level. This allows the ML model to capture any +inter-pin, inter-component, or spatial effects that may not be apparent at the +pin level. Models are trained at the pin, component, and PCB levels, and the +detection results from the different models are combined to identify defective +components. + +
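The data-centric step described here, rolling pin-level SPI features up to the component level before training XGBoost, can be sketched as follows; the column names and synthetic data are purely illustrative, not the paper's features.

```python
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

# Synthetic stand-in for pin-level SPI measurements.
rng = np.random.default_rng(0)
n_pins = 1000
pins = pd.DataFrame({
    "component_id": rng.integers(0, 200, n_pins),
    "volume": rng.normal(100, 10, n_pins),
    "height": rng.normal(150, 15, n_pins),
    "offset_x": rng.normal(0, 5, n_pins),
    "offset_y": rng.normal(0, 5, n_pins),
    "defect": (rng.random(n_pins) < 0.05).astype(int),
})

# Roll pin-level features up to the component level so the model can pick up
# inter-pin effects that a purely pin-level classifier would miss.
agg = pins.groupby("component_id").agg(
    {"volume": ["mean", "std"], "height": ["mean", "std"],
     "offset_x": ["mean", "max"], "offset_y": ["mean", "max"], "defect": ["max"]})
agg.columns = ["_".join(c) for c in agg.columns]   # flatten the MultiIndex columns

X, y = agg.drop(columns=["defect_max"]), agg["defect_max"]   # component defective if any pin is
model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X, y)
```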
+
+
+
+
+ + ☆ ContrastWSD: Enhancing Metaphor Detection with Word Sense Disambiguation + Following the Metaphor Identification Procedure + + +
+ This paper presents ContrastWSD, a RoBERTa-based metaphor detection model +that integrates the Metaphor Identification Procedure (MIP) and Word Sense +Disambiguation (WSD) to extract and contrast the contextual meaning with the +basic meaning of a word to determine whether it is used metaphorically in a +sentence. By utilizing the word senses derived from a WSD model, our model +enhances the metaphor detection process and outperforms other methods that rely +solely on contextual embeddings or integrate only the basic definitions and +other external knowledge. We evaluate our approach on various benchmark +datasets and compare it with strong baselines, indicating the effectiveness in +advancing metaphor detection. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ ORL-AUDITOR: Dataset Auditing in Offline Deep Reinforcement Learning NDSS + + +
+ Data is a critical asset in AI, as high-quality datasets can significantly +improve the performance of machine learning models. In safety-critical domains +such as autonomous vehicles, offline deep reinforcement learning (offline DRL) +is frequently used to train models on pre-collected datasets, as opposed to +training these models by interacting with the real-world environment as the +online DRL. To support the development of these models, many institutions make +datasets publicly available with opensource licenses, but these datasets are at +risk of potential misuse or infringement. Injecting watermarks to the dataset +may protect the intellectual property of the data, but it cannot handle +datasets that have already been published and is infeasible to be altered +afterward. Other existing solutions, such as dataset inference and membership +inference, do not work well in the offline DRL scenario due to the diverse +model behavior characteristics and offline setting constraints. In this paper, +we advocate a new paradigm by leveraging the fact that cumulative rewards can +act as a unique identifier that distinguishes DRL models trained on a specific +dataset. To this end, we propose ORL-AUDITOR, which is the first +trajectory-level dataset auditing mechanism for offline RL scenarios. Our +experiments on multiple offline DRL models and tasks reveal the efficacy of +ORL-AUDITOR, with auditing accuracy over 95% and false positive rates less than +2.88%. We also provide valuable insights into the practical implementation of +ORL-AUDITOR by studying various parameter settings. Furthermore, we demonstrate +the auditing capability of ORL-AUDITOR on open-source datasets from Google and +DeepMind, highlighting its effectiveness in auditing published datasets. +ORL-AUDITOR is open-sourced at https://github.com/link-zju/ORL-Auditor. + +
+
+ comment: To appear in the Network and Distributed System Security Symposium + (NDSS) 2024, San Diego, CA, USA +
+
+
+
+
+ + ☆ Parameterizing pressure-temperature profiles of exoplanet atmospheres + with neural networks + + +
+ Atmospheric retrievals (AR) of exoplanets typically rely on a combination of +a Bayesian inference technique and a forward simulator to estimate atmospheric +properties from an observed spectrum. A key component in simulating spectra is +the pressure-temperature (PT) profile, which describes the thermal structure of +the atmosphere. Current AR pipelines commonly use ad hoc fitting functions here +that limit the retrieved PT profiles to simple approximations, but still use a +relatively large number of parameters. In this work, we introduce a +conceptually new, data-driven parameterization scheme for physically consistent +PT profiles that does not require explicit assumptions about the functional +form of the PT profiles and uses fewer parameters than existing methods. Our +approach consists of a latent variable model (based on a neural network) that +learns a distribution over functions (PT profiles). Each profile is represented +by a low-dimensional vector that can be used to condition a decoder network +that maps $P$ to $T$. When training and evaluating our method on two publicly +available datasets of self-consistent PT profiles, we find that our method +achieves, on average, better fit quality than existing baseline methods, +despite using fewer parameters. In an AR based on existing literature, our +model (using two parameters) produces a tighter, more accurate posterior for +the PT profile than the five-parameter polynomial baseline, while also speeding +up the retrieval by more than a factor of three. By providing parametric access +to physically consistent PT profiles, and by reducing the number of parameters +required to describe a PT profile (thereby reducing computational cost or +freeing resources for additional parameters of interest), our method can help +improve AR and thus our understanding of exoplanet atmospheres and their +habitability. + +
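+
+ A minimal PyTorch sketch (illustrative only, not the authors' architecture; layer sizes are assumptions) of the central idea: a decoder conditioned on a low-dimensional latent code z maps pressure to temperature, so each PT profile is summarised by z alone:
+
+import torch
+import torch.nn as nn
+
+class PTDecoder(nn.Module):
+    """Maps (latent code z, log-pressure) to temperature."""
+    def __init__(self, latent_dim: int = 2, hidden: int = 128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(latent_dim + 1, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+    def forward(self, z, log_p):
+        # z: (batch, latent_dim); log_p: (batch, n_levels)
+        z_rep = z.unsqueeze(1).expand(-1, log_p.shape[1], -1)
+        x = torch.cat([z_rep, log_p.unsqueeze(-1)], dim=-1)
+        return self.net(x).squeeze(-1)  # temperatures, shape (batch, n_levels)
+
+decoder = PTDecoder()
+z = torch.randn(4, 2)                           # one two-parameter code per profile
+log_p = torch.linspace(-6, 2, 50).repeat(4, 1)  # pressure grid in log space
+temps = decoder(z, log_p)
+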
+
+ comment: Accepted for publication in Astronomy & Astrophysics +
+
+
+
+
+ + ☆ Character Queries: A Transformer-based Approach to On-Line Handwritten + Character Segmentation ICDAR 2023 + + +
+ On-line handwritten character segmentation is often associated with +handwriting recognition and even though recognition models include mechanisms +to locate relevant positions during the recognition process, it is typically +insufficient to produce a precise segmentation. Decoupling the segmentation +from the recognition unlocks the potential to further utilize the result of the +recognition. We specifically focus on the scenario where the transcription is +known beforehand, in which case the character segmentation becomes an +assignment problem between sampling points of the stylus trajectory and +characters in the text. Inspired by the $k$-means clustering algorithm, we view +it from the perspective of cluster assignment and present a Transformer-based +architecture where each cluster is formed based on a learned character query in +the Transformer decoder block. In order to assess the quality of our approach, +we create character segmentation ground truths for two popular on-line +handwriting datasets, IAM-OnDB and HANDS-VNOnDB, and evaluate multiple methods +on them, demonstrating that our approach achieves the overall best results. + +
+
+ comment: ICDAR 2023 Best Student Paper Award. Code available at + https://github.com/jungomi/character-queries +
+
+
+
+
+ + ☆ Learning Active Subspaces for Effective and Scalable Uncertainty + Quantification in Deep Neural Networks + + +
+ Bayesian inference for neural networks, or Bayesian deep learning, has the +potential to provide well-calibrated predictions with quantified uncertainty +and robustness. However, the main hurdle for Bayesian deep learning is its +computational complexity due to the high dimensionality of the parameter space. +In this work, we propose a novel scheme that addresses this limitation by +constructing a low-dimensional subspace of the neural network +parameters, referred to as an active subspace, by identifying the parameter +directions that have the most significant influence on the output of the neural +network. We demonstrate that the significantly reduced active subspace enables +effective and scalable Bayesian inference via either Monte Carlo (MC) sampling +methods, otherwise computationally intractable, or variational inference. +Empirically, our approach provides reliable predictions with robust uncertainty +estimates for various regression tasks.
+
+

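+
+ A small sketch of one way such an active subspace can be identified (an illustration under assumptions, not the paper's procedure): collect loss gradients with respect to the flattened parameters and keep the top eigenvectors of their empirical second-moment matrix:
+
+import numpy as np
+import torch
+
+def active_subspace(model, loss_fn, data_loader, k=20):
+    """Top-k eigenvectors of C = E[g g^T] with g = d(loss)/d(theta), flattened.
+    Forming C explicitly is only feasible for small parameter counts; a
+    randomized SVD of the stacked gradients would be used otherwise."""
+    grads = []
+    for x, y in data_loader:
+        model.zero_grad()
+        loss_fn(model(x), y).backward()
+        g = torch.cat([p.grad.flatten() for p in model.parameters()])
+        grads.append(g.detach().cpu().numpy())
+    G = np.stack(grads)                    # (n_batches, n_params)
+    C = G.T @ G / G.shape[0]               # Monte Carlo estimate of E[g g^T]
+    eigvals, eigvecs = np.linalg.eigh(C)   # eigenvalues in ascending order
+    return eigvecs[:, -k:]                 # directions spanning the active subspace
+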
+
+
+
+
+ + ☆ CoLA: Exploiting Compositional Structure for Automatic and Efficient + Numerical Linear Algebra + + +
+ Many areas of machine learning and science involve large linear algebra +problems, such as eigendecompositions, solving linear systems, computing matrix +exponentials, and trace estimation. The matrices involved often have Kronecker, +convolutional, block diagonal, sum, or product structure. In this paper, we +propose a simple but general framework for large-scale linear algebra problems +in machine learning, named CoLA (Compositional Linear Algebra). By combining a +linear operator abstraction with compositional dispatch rules, CoLA +automatically constructs memory and runtime efficient numerical algorithms. +Moreover, CoLA provides memory efficient automatic differentiation, low +precision computation, and GPU acceleration in both JAX and PyTorch, while also +accommodating new objects, operations, and rules in downstream packages via +multiple dispatch. CoLA can accelerate many algebraic operations, while making +it easy to prototype matrix structures and algorithms, providing an appealing +drop-in tool for virtually any computational effort that requires linear +algebra. We showcase its efficacy across a broad range of applications, +including partial differential equations, Gaussian processes, equivariant model +construction, and unsupervised learning. + +
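+
+ The kind of structure exploitation that CoLA automates can be illustrated with the Kronecker identity (A ⊗ B) vec(X) = vec(B X A^T); the plain NumPy sketch below (conceptual only, not the CoLA API) compares the dense product against the structured one that never materialises the Kronecker matrix:
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+A, B = rng.normal(size=(50, 50)), rng.normal(size=(60, 60))
+X = rng.normal(size=(60, 50))
+x = X.flatten(order="F")                        # column-stacking vec(X)
+
+dense = np.kron(A, B) @ x                       # builds a 3000 x 3000 matrix
+structured = (B @ X @ A.T).flatten(order="F")   # same result, no Kronecker product formed
+
+print(np.allclose(dense, structured))           # True
+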
+
+ comment: Code available at https://github.com/wilson-labs/cola +
+
+
+
+
+ + ☆ Automated CVE Analysis for Threat Prioritization and Impact Prediction + + +
+ Common Vulnerabilities and Exposures (CVE) records provide pivotal information for +proactive cybersecurity measures, including service patching, security +hardening, and more. However, CVEs typically offer low-level, product-oriented +descriptions of publicly disclosed cybersecurity vulnerabilities, often lacking +the essential attack semantic information required for comprehensive weakness +characterization and threat impact estimation. This insight is +essential for CVE prioritization and the identification of potential +countermeasures, particularly when dealing with a large number of CVEs. Current +industry practices involve manual evaluation of CVEs to assess their attack +severities using the Common Vulnerability Scoring System (CVSS) and mapping +them to Common Weakness Enumeration (CWE) for potential mitigation +identification. Unfortunately, this manual analysis presents a major bottleneck +in the vulnerability analysis process, leading to slowdowns in proactive +cybersecurity efforts and the potential for inaccuracies due to human errors. +In this research, we introduce our novel predictive model and tool (called +CVEDrill) which revolutionizes CVE analysis and threat prioritization. CVEDrill +accurately estimates the CVSS vector for precise threat mitigation and priority +ranking and seamlessly automates the classification of CVEs into the +appropriate CWE hierarchy classes. By harnessing CVEDrill, organizations can +now implement cybersecurity countermeasure mitigation with unparalleled +accuracy and timeliness, surpassing the capabilities of state-of-the-art tools +such as ChatGPT in this domain.
+
+

+
+
+
+
+ + ☆ Deep Learning for Polycystic Kidney Disease: Utilizing Neural Networks + for Accurate and Early Detection through Gene Expression Analysis + + +
+ With Polycystic Kidney Disease (PKD) potentially leading to fatal +complications in patients due to the formation of cysts in the kidneys, early +detection of PKD is crucial for effective management of the condition. However, +the various patient-specific factors that play a role in the diagnosis make it +an intricate puzzle for clinicians to solve. Therefore, in this study, we aim +to utilize a deep learning-based approach for early disease detection. The +devised neural network can achieve accurate and robust predictions for possible +PKD in patients by analyzing patient gene expressions. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Universal Preprocessing Operators for Embedding Knowledge Graphs with + Literals ISWC 2023 + + +
+ Knowledge graph embeddings are dense numerical representations of entities in +a knowledge graph (KG). While the majority of approaches concentrate only on +relational information, i.e., relations between entities, fewer approaches +exist which also take information about literal values (e.g., textual +descriptions or numerical information) into account. Those which exist are +typically tailored towards a particular modality of literal and a particular +embedding method. In this paper, we propose a set of universal preprocessing +operators which can be used to transform KGs with literals for numerical, +temporal, textual, and image information, so that the transformed KGs can be +embedded with any method. The results on the kgbench dataset with three +different embedding methods show promising results. + +
+
+ comment: Accepted for DL4KG Workshop at ISWC 2023 +
+
+
+
+
+ + ☆ Amortised Inference in Bayesian Neural Networks + + +
+ Meta-learning is a framework in which machine learning models train over a +set of datasets in order to produce predictions on new datasets at test time. +Probabilistic meta-learning has received an abundance of attention from the +research community in recent years, but a problem shared by many existing +probabilistic meta-models is that they require a very large number of datasets +in order to produce high-quality predictions with well-calibrated uncertainty +estimates. In many applications, however, such quantities of data are simply +not available. + In this dissertation we present a significantly more data-efficient approach +to probabilistic meta-learning through per-datapoint amortisation of inference +in Bayesian neural networks, introducing the Amortised Pseudo-Observation +Variational Inference Bayesian Neural Network (APOVI-BNN). First, we show that +the approximate posteriors obtained under our amortised scheme are of similar +or better quality to those obtained through traditional variational inference, +despite the fact that the amortised inference is performed in a single forward +pass. We then discuss how the APOVI-BNN may be viewed as a new member of the +neural process family, motivating the use of neural process training objectives +for potentially better predictive performance on complex problems as a result. +Finally, we assess the predictive performance of the APOVI-BNN against other +probabilistic meta-models in both a one-dimensional regression problem and in a +significantly more complex image completion setting. In both cases, when the +amount of training data is limited, our model is the best in its class. + +
+
+ comment: This thesis served as the author's final project report for the + University of Cambridge part IIB Engineering Tripos. 37 pages, 7 figures +
+
+
+
+
+ + ☆ SymED: Adaptive and Online Symbolic Representation of Data on the Edge + + +
+ The edge computing paradigm helps handle the Internet of Things (IoT) +generated data in proximity to its source. Challenges occur in transferring, +storing, and processing this rapidly growing amount of data on +resource-constrained edge devices. Symbolic Representation (SR) algorithms are +promising solutions to reduce the data size by converting actual raw data into +symbols. Also, they allow data analytics (e.g., anomaly detection and trend +prediction) directly on symbols, benefiting large classes of edge applications. +However, existing SR algorithms are centralized in design and work offline with +batch data, which is infeasible for real-time cases. We propose SymED - +Symbolic Edge Data representation method, i.e., an online, adaptive, and +distributed approach for symbolic representation of data on edge. SymED is +based on the Adaptive Brownian Bridge-based Aggregation (ABBA), where we assume +low-powered IoT devices do initial data compression (senders) and the more +robust edge devices do the symbolic conversion (receivers). We evaluate SymED +by measuring compression performance, reconstruction accuracy through Dynamic +Time Warping (DTW) distance, and computational latency. The results show that +SymED is able to (i) reduce the raw data with an average compression rate of +9.5%; (ii) keep a low reconstruction error of 13.25 in the DTW space; (iii) +simultaneously provide real-time adaptability for online streaming IoT data at +typical latencies of 42ms per symbol, reducing the overall network traffic. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Theoretical Explanation of Activation Sparsity through Flat Minima and + Adversarial Robustness + + +
+ A recent empirical observation of activation sparsity in MLP layers offers an +opportunity to drastically reduce computation costs for free. Despite several +works attributing it to training dynamics, the theoretical explanation of +activation sparsity's emergence is restricted to shallow networks, small +training steps, as well as modified training, even though the sparsity has been +found in deep models trained by vanilla protocols for large steps. To fill the +three gaps, we propose the notion of gradient sparsity as the source of +activation sparsity and a theoretical explanation based on it that explains +gradient sparsity and then activation sparsity as necessary steps to +adversarial robustness w.r.t. hidden features and parameters, which is +approximately the flatness of minima for well-learned models. The theory +applies to standardly trained LayerNorm-ed pure MLPs, and further to +Transformers or other architectures if noise is added to the weights during +training. To eliminate other sources of flatness when arguing sparsities' +necessity, we discover the phenomenon of spectral concentration, i.e., the +ratio between the largest and the smallest non-zero singular values of weight +matrices is small. We utilize random matrix theory (RMT) as a powerful +theoretical tool to analyze stochastic gradient noises and discuss the +emergence of spectral concentration. With these insights, we propose two +plug-and-play modules for both training from scratch and sparsity finetuning, +as well as one radical modification that only applies to from-scratch training. +Another under-testing module for both sparsity and flatness is also immediate +from our theories. Validation experiments are conducted to verify our +explanation. Further experiments demonstrate that the proposed modifications +improve sparsity, indicating further theoretical cost reduction in both +training and inference.
+
+

+
+
+
+
+ + ☆ Natural and Robust Walking using Reinforcement Learning without + Demonstrations in High-Dimensional Musculoskeletal Models + + +
+ Humans excel at robust bipedal walking in complex natural environments. In +each step, they adequately tune the interaction of biomechanical muscle +dynamics and neuronal signals to be robust against uncertainties in ground +conditions. However, it is still not fully understood how the nervous system +resolves the musculoskeletal redundancy to solve the multi-objective control +problem considering stability, robustness, and energy efficiency. In computer +simulations, energy minimization has been shown to be a successful optimization +target, reproducing natural walking with trajectory optimization or +reflex-based control methods. However, these methods focus on particular +motions at a time and the resulting controllers are limited when compensating +for perturbations. In robotics, reinforcement learning~(RL) methods recently +achieved highly stable (and efficient) locomotion on quadruped systems, but the +generation of human-like walking with bipedal biomechanical models has required +extensive use of expert data sets. This strong reliance on demonstrations often +results in brittle policies and limits the application to new behaviors, +especially considering the potential variety of movements for high-dimensional +musculoskeletal models in 3D. Achieving natural locomotion with RL without +sacrificing its incredible robustness might pave the way for a novel approach +to studying human walking in complex natural environments. + +
+
+
+
+
+ + ☆ On the Impact of Feeding Cost Risk in Aquaculture Valuation and Decision + Making + + +
+ We study the effect of stochastic feeding costs on animal-based commodities +with particular focus on aquaculture. More specifically, we use soybean futures +to infer the stochastic behaviour of salmon feed, which we assume to follow +a Schwartz-2-factor model. We compare the decision of harvesting salmon using a +decision rule assuming either deterministic or stochastic feeding costs, i.e. +including feeding cost risk. We identify cases where accounting for stochastic +feeding costs leads to significant improvements as well as cases where +deterministic feeding costs are a good enough proxy. Nevertheless, in all of +these cases, the newly derived rules show superior performance, while the +additional computational costs are negligible. From a methodological point of +view, we demonstrate how to use Deep-Neural-Networks to infer the decision +boundary that determines harvesting or continuation, improving on more +classical regression-based and curve-fitting methods. To achieve this we use a +deep classifier, which not only improves on previous results but also scales +well for higher dimensional problems, and in addition mitigates effects due to +model uncertainty, which we identify in this article.
+
+

+
+
+
+
+ + ☆ CR-VAE: Contrastive Regularization on Variational Autoencoders for + Preventing Posterior Collapse + + +
+ The Variational Autoencoder (VAE) is known to suffer from the phenomenon of +\textit{posterior collapse}, where the latent representations generated by the +model become independent of the inputs. This leads to degenerated +representations of the input, which is attributed to the limitations of the +VAE's objective function. In this work, we propose a novel solution to this +issue, the Contrastive Regularization for Variational Autoencoders (CR-VAE). +The core of our approach is to augment the original VAE with a contrastive +objective that maximizes the mutual information between the representations of +similar visual inputs. This strategy ensures that the information flow between +the input and its latent representation is maximized, effectively avoiding +posterior collapse. We evaluate our method on a series of visual datasets and +demonstrate, that CR-VAE outperforms state-of-the-art approaches in preventing +posterior collapse. + +
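+
+ A minimal sketch of the idea (illustrative; the weighting, similarity measure, and augmentation pipeline are assumptions, not the authors' implementation): the usual VAE objective is augmented with an InfoNCE-style term that pulls together the latent codes of two views of the same input:
+
+import torch
+import torch.nn.functional as F
+
+def vae_loss(x, x_recon, mu, logvar):
+    recon = F.mse_loss(x_recon, x, reduction="sum")
+    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
+    return recon + kl
+
+def info_nce(z1, z2, temperature=0.1):
+    """Contrastive term: matching views of the same input are the positives."""
+    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
+    logits = z1 @ z2.t() / temperature                  # (batch, batch) similarities
+    targets = torch.arange(z1.size(0), device=z1.device)
+    return F.cross_entropy(logits, targets)
+
+# total objective, with lambda_c weighting the contrastive regularizer:
+# loss = vae_loss(x, x_recon, mu, logvar) + lambda_c * info_nce(z_view1, z_view2)
+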
+
+
+
+
+ + ☆ M3D-NCA: Robust 3D Segmentation with Built-in Quality Control + + +
+ Medical image segmentation relies heavily on large-scale deep learning +models, such as UNet-based architectures. However, the real-world utility of +such models is limited by their high computational requirements, which makes +them impractical for resource-constrained environments such as primary care +facilities and conflict zones. Furthermore, shifts in the imaging domain can +render these models ineffective and even compromise patient safety if such +errors go undetected. To address these challenges, we propose M3D-NCA, a novel +methodology that leverages Neural Cellular Automata (NCA) segmentation for 3D +medical images using n-level patchification. Moreover, we exploit the variance +in M3D-NCA to develop a novel quality metric which can automatically detect +errors in the segmentation process of NCAs. M3D-NCA outperforms UNet models +that are two orders of magnitude larger in hippocampus and prostate segmentation +by 2% Dice and can be run on a Raspberry Pi 4 Model B (2GB RAM). This highlights +the potential of M3D-NCA as an effective and efficient alternative for medical +image segmentation in resource-constrained environments.
+
+

+
+
+
+
+ + ☆ Estimating irregular water demands with physics-informed machine + learning to inform leakage detection + + +
+ Leakages in drinking water distribution networks pose significant challenges +to water utilities, leading to infrastructure failure, operational disruptions, +environmental hazards, property damage, and economic losses. The timely +identification and accurate localisation of such leakages is paramount for +utilities to mitigate these unwanted effects. However, implementation of +algorithms for leakage detection is limited in practice by requirements of +either hydraulic models or large amounts of training data. Physics-informed +machine learning can utilise hydraulic information, thereby circumventing both +limitations. In this work, we present a physics-informed machine learning +algorithm that analyses pressure data and therefrom estimates unknown irregular +water demands via a fully connected neural network, ultimately leveraging the +Bernoulli equation and effectively linearising the leakage detection problem. +Our algorithm is tested on data from the L-Town benchmark network, and results +indicate a good capability for estimating most irregular demands, with R2 +larger than 0.8. Identification results for leakages under the presence of +irregular demands could be improved by a factor of 5.3 for abrupt leaks and a +factor of 3.0 for incipient leaks when compared to results that disregard +irregular demands.
+
+

+
+ comment: submitted to Water Research on July 17th, 2023 +
+
+
+
+
+ + ☆ GroupEnc: encoder with group loss for global structure preservation + + +
+ Recent advances in dimensionality reduction have achieved more accurate +lower-dimensional embeddings of high-dimensional data. In addition to +visualisation purposes, these embeddings can be used for downstream processing, +including batch effect normalisation, clustering, community detection or +trajectory inference. We use the notion of structure preservation at both local +and global levels to create a deep learning model, based on a variational +autoencoder (VAE) and the stochastic quartet loss from the SQuadMDS algorithm. +Our encoder model, called GroupEnc, uses a 'group loss' function to create +embeddings with less global structure distortion than VAEs do, while keeping +the model parametric and the architecture flexible. We validate our approach +using publicly available biological single-cell transcriptomic datasets, +employing RNX curves for evaluation. + +
+
+ comment: Submitted to BNAIC/BeNeLearn 2023 +
+
+
+
+
+ + ☆ Persona-aware Generative Model for Code-mixed Language + + +
+ Code-mixing and script-mixing are prevalent across online social networks and +multilingual societies. However, a user's preference toward code-mixing depends +on the socioeconomic status, demographics of the user, and the local context, +which existing generative models mostly ignore while generating code-mixed +texts. In this work, we make a pioneering attempt to develop a persona-aware +generative model to generate texts resembling real-life code-mixed texts of +individuals. We propose a Persona-aware Generative Model for Code-mixed +Generation, PARADOX, a novel Transformer-based encoder-decoder model that +encodes an utterance conditioned on a user's persona and generates code-mixed +texts without monolingual reference data. We propose an alignment module that +re-calibrates the generated sequence to resemble real-life code-mixed texts. +PARADOX generates code-mixed texts that are semantically more meaningful and +linguistically more valid. To evaluate the personification capabilities of +PARADOX, we propose four new metrics -- CM BLEU, CM Rouge-1, CM Rouge-L and CM +KS. On average, PARADOX achieves 1.6 points better CM BLEU, 47% better +perplexity and 32% better semantic coherence than the non-persona-based +counterparts. + +
+
+ comment: 4 tables, 4 figures +
+
+
+
+
+ + ☆ Ensemble DNN for Age-of-Information Minimization in UAV-assisted + Networks + + +
+ This paper addresses the problem of Age-of-Information (AoI) in UAV-assisted +networks. Our objective is to minimize the expected AoI across devices by +optimizing UAVs' stopping locations and device selection probabilities. To +tackle this problem, we first derive a closed-form expression of the expected +AoI that involves the probabilities of selection of devices. Then, we formulate +the problem as a non-convex minimization subject to quality of service +constraints. Since the problem is challenging to solve, we propose an Ensemble +Deep Neural Network (EDNN) based approach which takes advantage of the dual +formulation of the studied problem. Specifically, the Deep Neural Networks +(DNNs) in the ensemble are trained in an unsupervised manner using the +Lagrangian function of the studied problem. Our experiments show that the +proposed EDNN method outperforms traditional DNNs in reducing the expected AoI, +achieving a remarkable reduction of $29.5\%$. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ A Multimodal Learning Framework for Comprehensive 3D Mineral + Prospectivity Modeling with Jointly Learned Structure-Fluid Relationships + + +
+ This study presents a novel multimodal fusion model for three-dimensional +mineral prospectivity mapping (3D MPM), effectively integrating structural and +fluid information through a deep network architecture. Leveraging Convolutional +Neural Networks (CNN) and Multilayer Perceptrons (MLP), the model employs +canonical correlation analysis (CCA) to align and fuse multimodal features. +Rigorous evaluation on the Jiaojia gold deposit dataset demonstrates the +model's superior performance in distinguishing ore-bearing instances and +predicting mineral prospectivity, outperforming other models in result +analyses. Ablation studies further reveal the benefits of joint feature +utilization and CCA incorporation. This research not only advances mineral +prospectivity modeling but also highlights the pivotal role of data integration +and feature alignment for enhanced exploration decision-making. + +
+
+
+
+
+ + ☆ DECODE: Data-driven Energy Consumption Prediction leveraging Historical + Data and Environmental Factors in Buildings + + +
+ Energy prediction in buildings plays a crucial role in effective energy +management. Precise predictions are essential for achieving optimal energy +consumption and distribution within the grid. This paper introduces a Long +Short-Term Memory (LSTM) model designed to forecast building energy consumption +using historical energy data, occupancy patterns, and weather conditions. The +LSTM model provides accurate short, medium, and long-term energy predictions +for residential and commercial buildings compared to existing prediction +models. We compare our LSTM model with established prediction methods, +including linear regression, decision trees, and random forest. Encouragingly, +the proposed LSTM model emerges as the superior performer across all metrics. +It demonstrates exceptional prediction accuracy, boasting the highest R2 score +of 0.97 and the most favorable mean absolute error (MAE) of 0.007. An +additional advantage of our developed model is its capacity to achieve +efficient energy consumption forecasts even when trained on a limited dataset. +We address concerns about overfitting (variance) and underfitting (bias) +through rigorous training and evaluation on real-world data. In summary, our +research contributes to energy prediction by offering a robust LSTM model that +outperforms alternative methods and operates with remarkable efficiency, +generalizability, and reliability. + +
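+
+ A generic PyTorch sketch of such a forecaster (not the paper's exact configuration; feature set, window length, and layer sizes are placeholders): an LSTM maps a window of past consumption, occupancy, and weather features to the next energy value:
+
+import torch
+import torch.nn as nn
+
+class EnergyLSTM(nn.Module):
+    def __init__(self, n_features, hidden=64):
+        super().__init__()
+        self.lstm = nn.LSTM(n_features, hidden, num_layers=2, batch_first=True)
+        self.head = nn.Linear(hidden, 1)
+
+    def forward(self, x):                  # x: (batch, window, n_features)
+        out, _ = self.lstm(x)
+        return self.head(out[:, -1])       # next-step consumption
+
+model = EnergyLSTM(n_features=5)
+window = torch.randn(32, 24, 5)            # e.g. 24 hourly steps of 5 features
+pred = model(window)                       # shape (32, 1)
+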
+
+ comment: 11 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ A Unified Framework for Discovering Discrete Symmetries + + +
+ We consider the problem of learning a function respecting a symmetry from +among a class of symmetries. We develop a unified framework that enables +symmetry discovery across a broad range of subgroups including locally +symmetric, dihedral and cyclic subgroups. At the core of the framework is a +novel architecture composed of linear and tensor-valued functions that +expresses functions invariant to these subgroups in a principled manner. The +structure of the architecture enables us to leverage multi-armed bandit +algorithms and gradient descent to efficiently optimize over the linear and the +tensor-valued functions, respectively, and to infer the symmetry that is +ultimately learnt. We also discuss the necessity of the tensor-valued functions +in the architecture. Experiments on image-digit sum and polynomial regression +tasks demonstrate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ Non-Clashing Teaching Maps for Balls in Graphs + + +
+ Recently, Kirkpatrick et al. [ALT 2019] and Fallat et al. [JMLR 2023] +introduced non-clashing teaching and showed it to be the most efficient machine +teaching model satisfying the benchmark for collusion-avoidance set by Goldman +and Mathias. A teaching map $T$ for a concept class $\cal{C}$ assigns a +(teaching) set $T(C)$ of examples to each concept $C \in \cal{C}$. A teaching +map is non-clashing if no pair of concepts are consistent with the union of +their teaching sets. The size of a non-clashing teaching map (NCTM) $T$ is the +maximum size of a $T(C)$, $C \in \cal{C}$. The non-clashing teaching dimension +NCTD$(\cal{C})$ of $\cal{C}$ is the minimum size of an NCTM for $\cal{C}$. +NCTM$^+$ and NCTD$^+(\cal{C})$ are defined analogously, except the teacher may +only use positive examples. + We study NCTMs and NCTM$^+$s for the concept class $\mathcal{B}(G)$ +consisting of all balls of a graph $G$. We show that the associated decision +problem {\sc B-NCTD$^+$} for NCTD$^+$ is NP-complete in split, co-bipartite, +and bipartite graphs. Surprisingly, we even prove that, unless the ETH fails, +{\sc B-NCTD$^+$} does not admit an algorithm running in time +$2^{2^{o(vc)}}\cdot n^{O(1)}$, nor a kernelization algorithm outputting a +kernel with $2^{o(vc)}$ vertices, where vc is the vertex cover number of $G$. +These are extremely rare results: it is only the second (fourth, resp.) problem +in NP to admit a double-exponential lower bound parameterized by vc (treewidth, +resp.), and only one of very few problems to admit an ETH-based conditional +lower bound on the number of vertices in a kernel. We complement these lower +bounds with matching upper bounds. For trees, interval graphs, cycles, and +trees of cycles, we derive NCTM$^+$s or NCTMs for $\mathcal{B}(G)$ of size +proportional to its VC-dimension. For Gromov-hyperbolic graphs, we design an +approximate NCTM$^+$ for $\mathcal{B}(G)$ of size 2. + +
+
+ comment: Shortened abstract due to character limit +
+
+
+
+
+ + ☆ Learning Hybrid Dynamics Models With Simulator-Informed Latent States + + +
+ Dynamics model learning deals with the task of inferring unknown dynamics +from measurement data and predicting the future behavior of the system. A +typical approach to address this problem is to train recurrent models. However, +predictions with these models are often not physically meaningful. Further, +they suffer from deteriorated behavior over time due to accumulating errors. +Often, simulators built on first principles are available and are physically +meaningful by design. However, modeling simplifications typically cause +inaccuracies in these models. Consequently, hybrid modeling is an emerging +trend that aims to combine the best of both worlds. In this paper, we propose a +new approach to hybrid modeling, where we inform the latent states of a learned +model via a black-box simulator. This allows us to control the predictions via +the simulator, preventing them from accumulating errors. This is especially +challenging since, in contrast to previous approaches, access to the +simulator's latent states is not available. We tackle the task by leveraging +observers, a well-known concept from control theory, inferring unknown latent +states from observations and dynamics over time. In our learning-based setting, +we jointly learn the dynamics and an observer that infers the latent states via +the simulator. Thus, the simulator constantly corrects the latent states, +compensating for modeling mismatch caused by learning. To maintain flexibility, +we train an RNN-based residuum for the latent states that cannot be informed by +the simulator.
+
+

+
+
+
+
+ + ☆ Rethinking Momentum Knowledge Distillation in Online Continual Learning + + +
+ Online Continual Learning (OCL) addresses the problem of training neural +networks on a continuous data stream where multiple classification tasks emerge +in sequence. In contrast to offline Continual Learning, data can be seen only +once in OCL. In this context, replay-based strategies have achieved impressive +results and most state-of-the-art approaches depend heavily on them. +While Knowledge Distillation (KD) has been extensively used in offline +Continual Learning, it remains under-exploited in OCL, despite its potential. +In this paper, we theoretically analyze the challenges in applying KD to OCL. +We introduce a direct yet effective methodology for applying Momentum Knowledge +Distillation (MKD) to many flagship OCL methods and demonstrate its +capability to enhance existing approaches. In addition to improving +state-of-the-art accuracy by more than 10 percentage points on ImageNet100, we +shed light on MKD's internal mechanics and impacts during training in OCL. We +argue that, similar to replay, MKD should be considered a central component of OCL.
+
+

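+
+ A minimal sketch of momentum knowledge distillation (hyper-parameters are placeholders; this illustrates the general mechanism rather than the paper's exact method): the teacher is an exponential moving average of the student and supplies soft targets through a temperature-scaled KL term:
+
+import torch
+import torch.nn.functional as F
+
+def ema_update(teacher, student, momentum=0.999):
+    """Teacher parameters track the student as an exponential moving average."""
+    with torch.no_grad():
+        for pt, ps in zip(teacher.parameters(), student.parameters()):
+            pt.mul_(momentum).add_(ps, alpha=1.0 - momentum)
+
+def mkd_loss(student_logits, teacher_logits, labels, alpha=1.0, tau=2.0):
+    ce = F.cross_entropy(student_logits, labels)
+    kd = F.kl_div(F.log_softmax(student_logits / tau, dim=1),
+                  F.softmax(teacher_logits / tau, dim=1),
+                  reduction="batchmean") * tau * tau
+    return ce + alpha * kd
+
+# The teacher starts as a deep copy of the student and is refreshed with
+# ema_update(teacher, student) after every optimizer step on the stream.
+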
+
+ comment: Under Review +
+
+
+
+
+ + ☆ On Reducing Undesirable Behavior in Deep Reinforcement Learning Models + + +
+ Deep reinforcement learning (DRL) has proven extremely useful in a large +variety of application domains. However, even successful DRL-based software can +exhibit highly undesirable behavior. This is due to DRL training being based on +maximizing a reward function, which typically captures general trends but +cannot precisely capture, or rule out, certain behaviors of the system. In this +paper, we propose a novel framework aimed at drastically reducing the +undesirable behavior of DRL-based software, while maintaining its excellent +performance. In addition, our framework can assist in providing engineers with +a comprehensible characterization of such undesirable behavior. Under the hood, +our approach is based on extracting decision tree classifiers from erroneous +state-action pairs, and then integrating these trees into the DRL training +loop, penalizing the system whenever it performs an error. We provide a +proof-of-concept implementation of our approach, and use it to evaluate the +technique on three significant case studies. We find that our approach can +extend existing frameworks in a straightforward manner, and incurs only a +slight overhead in training time. Further, it incurs only a very slight hit to +performance, or even in some cases - improves it, while significantly reducing +the frequency of undesirable behavior. + +
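+
+ A minimal sketch of the core mechanism described above (illustrative; the data, dimensions, and penalty value are placeholder assumptions): fit a decision tree on logged erroneous state-action pairs and subtract a penalty from the reward whenever the tree flags the current pair:
+
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+
+rng = np.random.default_rng(0)
+# Placeholder data: (state, action) pairs labelled 1 if logged as erroneous.
+pairs = rng.normal(size=(1000, 6))           # e.g. 4 state dims + 2 action dims
+is_error = (pairs[:, 0] > 1.0).astype(int)   # stand-in for a real error log
+
+error_tree = DecisionTreeClassifier(max_depth=6).fit(pairs, is_error)
+
+def shaped_reward(state, action, env_reward, penalty=1.0):
+    """Penalise the agent when the tree classifies (state, action) as erroneous."""
+    pair = np.concatenate([state, action]).reshape(1, -1)
+    return env_reward - penalty if error_tree.predict(pair)[0] == 1 else env_reward
+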
+
+
+
+
+ + ☆ Enhancing Event Sequence Modeling with Contrastive Relational Inference + + +
+ Neural temporal point processes (TPPs) have shown promise for modeling +continuous-time event sequences. However, capturing the interactions between +events is challenging yet critical for performing inference tasks like +forecasting on event sequence data. Existing TPP models have focused on +parameterizing the conditional distribution of future events but struggle to +model event interactions. In this paper, we propose a novel approach that +leverages Neural Relational Inference (NRI) to learn a relation graph that +infers interactions while simultaneously learning the dynamics patterns from +observational data. Our approach, the Contrastive Relational Inference-based +Hawkes Process (CRIHP), reasons about event interactions under a variational +inference framework. It utilizes intensity-based learning to search for +prototype paths to contrast relationship constraints. Extensive experiments on +three real-world datasets demonstrate the effectiveness of our model in +capturing event interactions for event sequence modeling tasks.
+
+

+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ Generalised Mutual Information: a Framework for Discriminative + Clustering NeurIPS + 2022 + + +
+ Over the last decade, successes in deep clustering have largely involved +Mutual Information (MI) as an unsupervised objective for training neural +networks with increasing regularisations. While the quality of the +regularisations has been widely discussed for improvements, little attention +has been dedicated to the relevance of MI as a clustering objective. In this +paper, we first highlight how the maximisation of MI does not lead to +satisfying clusters. We identify the Kullback-Leibler divergence as the main +reason for this behaviour. Hence, we generalise the mutual information by +changing its core distance, introducing the Generalised Mutual Information +(GEMINI): a set of metrics for unsupervised neural network training. Unlike MI, +some GEMINIs do not require regularisations when training as they are +geometry-aware thanks to distances or kernels in the data space. Finally, we +highlight that GEMINIs can automatically select a relevant number of clusters, +a property that has been little studied in the deep discriminative clustering +context, where the number of clusters is a priori unknown.
+
+

+
+ comment: Submitted for review at the IEEE Transactions on Pattern Analysis and + Machine Intelligence. This article is an extension of an original NeurIPS + 2022 article [arXiv:2210.06300] +
+
+
+
+
+ + ☆ A Critical Review of Common Log Data Sets Used for Evaluation of + Sequence-based Anomaly Detection Techniques + + +
+ Log data store event execution patterns that correspond to underlying +workflows of systems or applications. While most logs are informative, log data +also include artifacts that indicate failures or incidents. Accordingly, log +data are often used to evaluate anomaly detection techniques that aim to +automatically disclose unexpected or otherwise relevant system behavior +patterns. Recently, detection approaches leveraging deep learning have +increasingly focused on anomalies that manifest as changes of sequential +patterns within otherwise normal event traces. Several publicly available data +sets, such as HDFS, BGL, Thunderbird, OpenStack, and Hadoop, have since become +standards for evaluating these anomaly detection techniques; however, the +appropriateness of these data sets has not been closely investigated in the +past. In this paper, we therefore analyze six publicly available log data sets +with a focus on the manifestations of anomalies and simple techniques for their +detection. Our findings suggest that most anomalies are not directly related to +sequential manifestations and that advanced detection techniques are not +required to achieve high detection rates on these data sets.
+
+

+
+
+
+
+ + ☆ Random postprocessing for combinatorial Bayesian optimization + + +
+ Model-based sequential approaches to discrete "black-box" optimization, +including Bayesian optimization techniques, often access the same points +multiple times for a given objective function of interest, resulting in many +steps to find the global optimum. Here, we numerically study the effect of a +postprocessing method on Bayesian optimization that strictly prohibits +duplicated samples in the dataset. We find the postprocessing method +significantly reduces the number of sequential steps to find the global +optimum, especially when the acquisition function is of maximum a posteriori +estimation. Our results provide a simple but general strategy to mitigate the +slow convergence of Bayesian optimization for high-dimensional problems.
+
+

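+
+ A minimal sketch of the postprocessing idea (illustration only): before evaluating the acquisition-maximising candidate, discard any candidate already present in the dataset and fall back to the next-best point:
+
+import numpy as np
+
+def next_unique_candidate(candidates, acquisition_values, evaluated):
+    """Return the highest-acquisition candidate not already in the dataset."""
+    seen = {tuple(x) for x in evaluated}
+    for idx in np.argsort(acquisition_values)[::-1]:   # best candidates first
+        x = tuple(candidates[idx])
+        if x not in seen:
+            return candidates[idx]
+    raise RuntimeError("all candidates have already been evaluated")
+
+# Example with binary inputs, typical of combinatorial problems:
+cands = np.array([[0, 1, 1], [1, 1, 0], [0, 0, 1]])
+acq = np.array([0.9, 0.7, 0.2])
+history = np.array([[0, 1, 1]])                    # the best point was already sampled
+print(next_unique_candidate(cands, acq, history))  # -> [1 1 0]
+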
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ BigVSAN: Enhancing GAN-based Neural Vocoders with Slicing Adversarial + Network ICASSP 2024 + + +
+ Generative adversarial network (GAN)-based vocoders have been intensively +studied because they can synthesize high-fidelity audio waveforms faster than +real-time. However, it has been reported that most GANs fail to obtain the +optimal projection for discriminating between real and fake data in the feature +space. In the literature, it has been demonstrated that slicing adversarial +network (SAN), an improved GAN training framework that can find the optimal +projection, is effective in the image generation task. In this paper, we +investigate the effectiveness of SAN in the vocoding task. For this purpose, we +propose a scheme to modify least-squares GAN, which most GAN-based vocoders +adopt, so that their loss functions satisfy the requirements of SAN. Through +our experiments, we demonstrate that SAN can improve the performance of +GAN-based vocoders, including BigVGAN, with small modifications. Our code is +available at https://github.com/sony/bigvsan. + +
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Roulette: A Semantic Privacy-Preserving Device-Edge Collaborative + Inference Framework for Deep Learning Classification Tasks + + +
+ Deep learning classifiers are crucial in the age of artificial intelligence. +Device-edge-based collaborative inference has been widely adopted as an +efficient framework for promoting their application in IoT and 5G/6G networks. +However, it suffers from accuracy degradation under non-i.i.d. data +distribution and privacy disclosure. For accuracy degradation, direct use of +transfer learning and split learning is costly, and privacy issues remain. +For privacy disclosure, cryptography-based approaches lead to a huge overhead. +Other lightweight methods assume that the ground truth is non-sensitive and can +be exposed. But for many applications, the ground truth is the user's crucial +privacy-sensitive information. In this paper, we propose Roulette, a +task-oriented, semantic privacy-preserving collaborative +inference framework for deep learning classifiers. Beyond the input data, we +treat the ground truth of the data as private information. We develop a novel +paradigm of split learning where the back-end DNN is frozen and the front-end +DNN is retrained to be both a feature extractor and an encryptor. Moreover, we +provide a differential privacy guarantee and analyze the hardness of ground +truth inference attacks. To validate the proposed Roulette, we conduct +extensive performance evaluations using realistic datasets, which demonstrate +that Roulette can effectively defend against various attacks and meanwhile +achieve good model accuracy. In situations where the non-i.i.d. characteristic +is very severe, Roulette improves the inference accuracy by 21\% averaged over +benchmarks, while making the accuracy of discrimination attacks almost +equivalent to random guessing.
+
+

+
+
+
+
+ + ☆ Combining Thermodynamics-based Model of the Centrifugal Compressors and + Active Machine Learning for Enhanced Industrial Design Optimization ICML + + +
+ The design process of centrifugal compressors requires applying an +optimization process which is computationally expensive due to complex +analytical equations underlying the compressor's dynamical equations. Although +regression surrogate models could drastically reduce the computational cost +of such a process, the major challenge is the scarcity of data for training the +surrogate model. Aiming to strategically exploit the labeled samples, we +propose the Active-CompDesign framework in which we combine a +thermodynamics-based compressor model (i.e., our internal software for +compressor design) and a Gaussian Process-based surrogate model within a +deployable Active Learning (AL) setting. We first conduct experiments in an +offline setting and then extend the approach to an online AL framework, where +real-time interaction with the thermodynamics-based compressor model allows +deployment in production. Active-CompDesign shows a significant performance +improvement in surrogate modeling by leveraging an uncertainty-based query +function of samples within the AL framework, compared to random +selection of data points. Moreover, in production, our framework reduces the +total computational time of the compressor design optimization by around 46% +compared to relying solely on the internal thermodynamics-based simulator, while +achieving the same performance.
+
+

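+
+ A minimal scikit-learn sketch of the uncertainty-based query step (the simulator below is a stand-in function, not the internal thermodynamics software; kernel and loop sizes are assumptions): fit a Gaussian-process surrogate on labelled designs and query the candidate with the largest predictive standard deviation:
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.gaussian_process.kernels import Matern
+
+def simulator(x):                        # placeholder for the expensive simulator
+    return np.sin(3 * x[:, 0]) + 0.5 * x[:, 1] ** 2
+
+rng = np.random.default_rng(0)
+X_lab = rng.uniform(size=(10, 2))        # initially labelled designs
+y_lab = simulator(X_lab)
+pool = rng.uniform(size=(500, 2))        # unlabelled candidate designs
+
+gp = GaussianProcessRegressor(kernel=Matern(nu=2.5), normalize_y=True)
+for _ in range(20):                      # active-learning loop
+    gp.fit(X_lab, y_lab)
+    _, std = gp.predict(pool, return_std=True)
+    pick = np.argmax(std)                # most uncertain design is queried next
+    X_lab = np.vstack([X_lab, pool[pick]])
+    y_lab = np.append(y_lab, simulator(pool[pick:pick + 1]))
+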
+
+ comment: Accepted after peer-review at the 1st workshop on Synergy of + Scientific and Machine Learning Modeling, SynS & ML ICML, Honolulu, Hawaii, + USA. July, 2023. Copyright 2023 by the author(s) +
+
+
+
+
+ + ☆ Introducing Thermodynamics-Informed Symbolic Regression -- A Tool for + Thermodynamic Equations of State Development + + +
+ Thermodynamic equations of state (EOS) are essential for many industries as +well as in academia. Even leaving aside the expensive and extensive measurement +campaigns required for the data acquisition, the development of EOS is an +intensely time-consuming process, which does often still heavily rely on expert +knowledge and iterative fine-tuning. To improve upon and accelerate the EOS +development process, we introduce thermodynamics-informed symbolic regression +(TiSR), a symbolic regression (SR) tool aimed at thermodynamic EOS modeling. +TiSR is already a capable SR tool, which was used in the research of +https://doi.org/10.1007/s10765-023-03197-z. It aims to combine an SR base with +the extensions required to work with often strongly scattered experimental +data, different residual pre- and post-processing options, and additional +features required to consider thermodynamic EOS development. Although TiSR is +not ready for end users yet, this paper is intended to report on its current +state, showcase the progress, and discuss (distant and not so distant) future +directions. TiSR is available at https://github.com/scoop-group/TiSR and can be +cited as https://doi.org/10.5281/zenodo.8317547. + +
+
+
+
+
+ + ☆ Dynamic Encoding and Decoding of Information for Split Learning in + Mobile-Edge Computing: Leveraging Information Bottleneck Theory + + +
+ Split learning is a privacy-preserving distributed learning paradigm in which +an ML model (e.g., a neural network) is split into two parts (i.e., an encoder +and a decoder). The encoder shares so-called latent representation, rather than +raw data, for model training. In mobile-edge computing, network functions (such +as traffic forecasting) can be trained via split learning where an encoder +resides in a user equipment (UE) and a decoder resides in the edge network. +Based on the data processing inequality and the information bottleneck (IB) +theory, we present a new framework and training mechanism to enable a dynamic +balancing of the transmission resource consumption with the informativeness of +the shared latent representations, which directly impacts the predictive +performance. The proposed training mechanism offers an encoder-decoder neural +network architecture featuring multiple modes of complexity-relevance +tradeoffs, enabling tunable performance. The adaptability can accommodate +varying real-time network conditions and application requirements, potentially +reducing operational expenditure and enhancing network agility. As a proof of +concept, we apply the training mechanism to a millimeter-wave (mmWave)-enabled +throughput prediction problem. We also offer new insights and highlight some +challenges related to recurrent neural networks from the perspective of the IB +theory. Interestingly, we find a compression phenomenon across the temporal +domain of the sequential model, in addition to the compression phase that +occurs with the number of training epochs. + +
+
+ comment: Accepted to Proc. IEEE Globecom 2023 +
+
+
+
+
+ + ☆ CVE-driven Attack Technique Prediction with Semantic Information + Extraction and a Domain-specific Language Model + + +
+ This paper addresses a critical challenge in cybersecurity: the gap between +vulnerability information represented by Common Vulnerabilities and Exposures +(CVEs) and the resulting cyberattack actions. CVEs provide insights into +vulnerabilities, but often lack details on potential threat actions (tactics, +techniques, and procedures, or TTPs) within the ATT&CK framework. This gap +hinders accurate CVE categorization and proactive countermeasure initiation. +The paper introduces the TTPpredictor tool, which uses innovative techniques to +analyze CVE descriptions and infer plausible TTP attacks resulting from CVE +exploitation. TTPpredictor overcomes challenges posed by limited labeled data +and semantic disparities between CVE and TTP descriptions. It initially +extracts threat actions from unstructured cyber threat reports using Semantic +Role Labeling (SRL) techniques. These actions, along with their contextual +attributes, are correlated with MITRE's attack functionality classes. This +automated correlation facilitates the creation of labeled data, essential for +categorizing novel threat actions into threat functionality classes and TTPs. +The paper presents an empirical assessment, demonstrating TTPpredictor's +effectiveness with accuracy rates of approximately 98% and F1-scores ranging +from 95% to 98% in precise CVE classification to ATT&CK techniques. +TTPpredictor outperforms state-of-the-art language model tools like ChatGPT. +Overall, this paper offers a robust solution for linking CVEs to potential +attack techniques, enhancing cybersecurity practitioners' ability to +proactively identify and mitigate threats. + +
+
+
+
+
+ + ☆ Norm Tweaking: High-performance Low-bit Quantization of Large Language + Models + + +
+ As the size of large language models (LLMs) continues to grow, model +compression without sacrificing accuracy has become a crucial challenge for +deployment. While some quantization methods, such as GPTQ, have made progress +in achieving acceptable 4-bit weight-only quantization, attempts at lower bit +quantization often result in severe performance degradation. In this paper, we +introduce a technique called norm tweaking, which can be used as a plugin in +current PTQ methods to achieve high precision while being cost-efficient. Our +approach is inspired by the observation that rectifying the quantized +activation distribution to match its float counterpart can readily restore +accuracy for LLMs. To achieve this, we carefully design a tweaking strategy +that includes calibration data generation and channel-wise distance constraint +to update the weights of normalization layers for better generalization. We +conduct extensive experiments on various datasets using several open-sourced +LLMs. Our method demonstrates significant improvements in both weight-only +quantization and joint quantization of weights and activations, surpassing +existing PTQ methods. On GLM-130B and OPT-66B, our method even achieves the +same level of accuracy at 2-bit quantization as their float ones. Our simple +and effective approach makes it more practical for real-world applications. + +
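+
+ A conceptual sketch of the norm-tweaking idea (a simplification under assumptions, not the paper's exact procedure; a plain MSE stands in for the channel-wise distance constraint): after weight quantization, only the normalization layers' affine parameters are updated so the quantized block matches the float block's activations on calibration data:
+
+import torch
+import torch.nn.functional as F
+
+def tweak_norms(float_block, quant_block, calib_batches, steps=100, lr=1e-4):
+    """Train only the LayerNorm weights/biases of the quantized block."""
+    norm_params = [p for m in quant_block.modules()
+                   if isinstance(m, torch.nn.LayerNorm) for p in m.parameters()]
+    for p in quant_block.parameters():
+        p.requires_grad_(False)
+    for p in norm_params:
+        p.requires_grad_(True)
+    opt = torch.optim.Adam(norm_params, lr=lr)
+    for _ in range(steps):
+        for x in calib_batches:
+            with torch.no_grad():
+                target = float_block(x)          # float activation distribution
+            loss = F.mse_loss(quant_block(x), target)
+            opt.zero_grad()
+            loss.backward()
+            opt.step()
+    return quant_block
+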
+
+
+
+
+ + ☆ Improving diagnosis and prognosis of lung cancer using vision + transformers: A scoping review + + +
+ Vision transformer-based methods are advancing the field of medical +artificial intelligence and cancer imaging, including lung cancer applications. +Recently, many researchers have developed vision transformer-based AI methods +for lung cancer diagnosis and prognosis. This scoping review aims to identify +recent developments in vision transformer-based AI methods for lung cancer +imaging applications. It provides key insights into how vision transformers +complemented the performance of AI and deep learning methods for lung cancer. +Furthermore, the review also identifies the datasets that contributed to +advancing the field. Of the 314 retrieved studies, this review included 34 +studies published from 2020 to 2022. The most commonly addressed task in these +studies was the classification of lung cancer types, such as lung squamous cell +carcinoma versus lung adenocarcinoma, and identifying benign versus malignant +pulmonary nodules. Other applications included survival prediction of lung +cancer patients and segmentation of lungs. The studies lacked clear strategies +for clinical translation. The Swin transformer was a popular choice among +researchers; however, many other architectures were also reported in which +vision transformers were combined with convolutional neural networks or a UNet +model. It can be concluded that vision transformer-based models are increasingly +popular for developing AI methods for lung cancer applications. However, +their computational complexity and clinical relevance are important factors to +be considered for future research work. This review provides valuable insights +for researchers in the field of AI and healthcare to advance the +state-of-the-art in lung cancer diagnosis and prognosis. We provide an +interactive dashboard on lung-cancer.onrender.com/.
+
+

+
+ comment: submitted to BMC Medical Imaging journal +
+
+
+
+
+ + ☆ On the Effects of Heterogeneous Errors on Multi-fidelity Bayesian + Optimization + + +
+ Bayesian optimization (BO) is a sequential optimization strategy that is +increasingly employed in a wide range of areas including materials design. In +real world applications, acquiring high-fidelity (HF) data through physical +experiments or HF simulations is the major cost component of BO. To alleviate +this bottleneck, multi-fidelity (MF) methods are used to forgo the sole +reliance on the expensive HF data and reduce the sampling costs by querying +inexpensive low-fidelity (LF) sources whose data are correlated with HF +samples. However, existing multi-fidelity BO (MFBO) methods operate under the +following two assumptions that rarely hold in practical applications: (1) LF +sources provide data that are well correlated with the HF data on a global +scale, and (2) a single random process can model the noise in the fused data. +These assumptions dramatically reduce the performance of MFBO when LF sources +are only locally correlated with the HF source or when the noise variance +varies across the data sources. In this paper, we dispense with these incorrect +assumptions by proposing an MF emulation method that (1) learns a noise model +for each data source, and (2) enables MFBO to leverage highly biased LF sources +which are only locally correlated with the HF source. We illustrate the +performance of our method through analytical examples and engineering problems +on materials design. + +
+
+
+
+
+ + ☆ Unifying over-smoothing and over-squashing in graph neural networks: A + physics informed approach and beyond + + +
+ Graph Neural Networks (GNNs) have emerged as one of the leading approaches +for machine learning on graph-structured data. Despite their great success, +critical computational challenges such as over-smoothing, over-squashing, and +limited expressive power continue to impact the performance of GNNs. In this +study, inspired by the time-reversal principle commonly utilized in classical +and quantum physics, we reverse the time direction of the graph heat equation. +The resulting reversal process yields a class of high-pass filtering functions +that enhance the sharpness of graph node features. Leveraging this concept, we +introduce the Multi-Scaled Heat Kernel based GNN (MHKG) by amalgamating diverse +filtering functions' effects on node features. To explore more flexible +filtering conditions, we further generalize MHKG into a model termed G-MHKG and +thoroughly show the roles of each element in controlling over-smoothing, +over-squashing and expressive power. Notably, we illustrate that all +aforementioned issues can be characterized and analyzed via the properties of +the filtering functions, and uncover a trade-off between over-smoothing and +over-squashing: enhancing node feature sharpness will make the model suffer more +from over-squashing, and vice versa. Furthermore, we manipulate the time again +to show how G-MHKG can handle both issues under mild conditions. Our +concluding experiments highlight the effectiveness of the proposed models, which +surpass several GNN baseline models in performance across graph datasets +characterized by both homophily and heterophily.
+
+

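+
+ A small NumPy illustration of the time-reversal idea (a toy example, not the full MHKG model): on a graph Laplacian L, the forward heat kernel exp(-tL) attenuates high graph frequencies and smooths node features, while the reversed kernel exp(+tL) amplifies them and sharpens the features:
+
+import numpy as np
+
+A = np.array([[0, 1, 0, 0],              # a small path graph
+              [1, 0, 1, 0],
+              [0, 1, 0, 1],
+              [0, 0, 1, 0]], dtype=float)
+L = np.diag(A.sum(1)) - A                # combinatorial graph Laplacian
+
+lam, U = np.linalg.eigh(L)               # graph frequencies and Fourier basis
+t = 0.5
+X = np.random.default_rng(0).normal(size=(4, 3))
+
+smooth = U @ np.diag(np.exp(-t * lam)) @ U.T @ X   # low-pass: heat diffusion
+sharp = U @ np.diag(np.exp(+t * lam)) @ U.T @ X    # high-pass: reversed heat equation
+
+# Deviation from the per-feature node average shrinks under smoothing and grows
+# under sharpening.
+print(np.linalg.norm(smooth - smooth.mean(0)),
+      np.linalg.norm(X - X.mean(0)),
+      np.linalg.norm(sharp - sharp.mean(0)))
+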
+
+
+
+
+ + ☆ Towards Unsupervised Graph Completion Learning on Graphs with Features + and Structure Missing ICDM + 2023 + + +
+ In recent years, graph neural networks (GNN) have achieved significant
+developments in a variety of graph analytical tasks. Nevertheless, GNN's
+superior performance will suffer serious damage when the collected node
+features or structure relationships are partially missing owing to numerous
+unpredictable factors. Recently emerged graph completion learning (GCL) has
+received increasing attention, which aims to reconstruct the missing node
+features or structure relationships under the guidance of a specifically
+supervised task. Although these GCL methods have achieved great success, they
+still suffer from the following problems: reliance on labels, and bias in the
+reconstructed node features and structure relationships. Besides, the
+generalization ability of existing GCL still faces a huge challenge when both
+the collected node features and structure relationships are partially missing
+at the same time. To solve the above issues, we propose a more general GCL
+framework with the aid of self-supervised learning for improving the task
+performance of existing GNN variants on graphs with missing features and
+structure, termed unsupervised GCL (UGCL). Specifically, to avoid the mismatch
+between missing node features and structure during the message-passing process
+of GNN, we separate feature reconstruction from structure reconstruction and
+design a dedicated model for each in turn. Then, a dual contrastive loss on the
+structure level and feature level is introduced to maximize the mutual
+information of node representations from the feature reconstruction and
+structure reconstruction paths to provide more supervision signals. Finally,
+the reconstructed node features and structure can be applied to the downstream
+node classification task. Extensive experiments on eight datasets, three GNN
+variants and five missing rates demonstrate the effectiveness of our proposed
+method.
+
+
+ comment: Accepted by 23rd IEEE International Conference on Data Mining (ICDM + 2023) +
+
+
+
+
+ + ☆ SWAP: Exploiting Second-Ranked Logits for Adversarial Attacks on Time + Series + + +
+ Time series classification (TSC) has emerged as a critical task in various
+domains, and deep neural models have shown superior performance in TSC tasks.
+However, these models are vulnerable to adversarial attacks, where subtle
+perturbations can significantly impact the prediction results. Existing
+adversarial methods often suffer from over-parameterization or random logit
+perturbation, hindering their effectiveness. Additionally, increasing the
+attack success rate (ASR) typically involves generating more noise, making the
+attack more easily detectable. To address these limitations, we propose SWAP, a
+novel attack method for TSC models. SWAP focuses on enhancing the confidence
+of the second-ranked logits while minimizing the manipulation of other logits.
+This is achieved by minimizing the Kullback-Leibler divergence between the
+target logit distribution and the predictive logit distribution. Experimental
+results demonstrate that SWAP achieves state-of-the-art performance, with an
+ASR exceeding 50% and an 18% increase compared to existing methods.
+
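+ A minimal sketch of such a KL-based objective is given below. The exact
+construction of the target distribution in SWAP is not specified in the
+abstract, so the swap of the top-1 and top-2 probability mass used here is an
+assumption made for illustration; the perturbation itself would then be
+optimized on the input series to minimize this loss.
+
+    import torch
+    import torch.nn.functional as F
+
+    def swap_style_kl_loss(logits_adv, logits_clean):
+        """KL divergence between the prediction on the perturbed series and a
+        target distribution built by swapping the top-1 and top-2 probability
+        mass of the clean prediction (an illustrative reading of the objective)."""
+        p_clean = F.softmax(logits_clean, dim=-1)
+        top2 = p_clean.topk(2, dim=-1).indices            # (batch, 2)
+        b = torch.arange(p_clean.size(0))
+        target = p_clean.clone()
+        target[b, top2[:, 0]] = p_clean[b, top2[:, 1]]    # demote the top class
+        target[b, top2[:, 1]] = p_clean[b, top2[:, 0]]    # promote the runner-up
+        log_q = F.log_softmax(logits_adv, dim=-1)
+        return F.kl_div(log_q, target, reduction="batchmean")
+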
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Offensive Hebrew Corpus and Detection using BERT CCS + + +
+ Offensive language detection has been well studied in many languages, but it
+is lagging behind in low-resource languages, such as Hebrew. In this paper, we
+present a new offensive language corpus in Hebrew. A total of 15,881 tweets
+were retrieved from Twitter. Each was labeled with one or more of five classes
+(abusive, hate, violence, pornographic, or non-offensive) by Arabic-Hebrew
+bilingual speakers. The annotation process was challenging as each annotator is
+expected to be familiar with Israeli culture, politics, and practices to
+understand the context of each tweet. We fine-tuned two Hebrew BERT models,
+HeBERT and AlephBERT, using our proposed dataset and another published dataset.
+We observed that our data boosts HeBERT performance by 2% when combined with
+D_OLaH. Fine-tuning AlephBERT on our data and testing on D_OLaH yields 69%
+accuracy, while fine-tuning on D_OLaH and testing on our data yields 57%
+accuracy, which may be an indication of the generalizability our data offers.
+Our dataset and fine-tuned models are available on GitHub and Huggingface.
+
+
+ comment: 8 pages, 1 figure, The 20th ACS/IEEE International Conference on + Computer Systems and Applications (AICCSA) +
+
+
+
+
+ + ☆ Unveiling the frontiers of deep learning: innovations shaping diverse + domains + + +
+ Deep learning (DL) enables the development of computer models that are
+capable of learning, visualizing, optimizing, refining, and predicting data. In
+recent years, DL has been applied in a range of fields, including audio-visual
+data processing, agriculture, transportation prediction, natural language,
+biomedicine, disaster management, bioinformatics, drug design, genomics, face
+recognition, and ecology. To explore the current state of deep learning, it is
+necessary to investigate the latest developments and applications of deep
+learning in these disciplines. However, the literature is lacking in exploring
+the applications of deep learning in all potential sectors. This paper thus
+extensively investigates the potential applications of deep learning across all
+major fields of study as well as the associated benefits and challenges. As
+evidenced in the literature, DL exhibits accuracy in prediction and analysis,
+making it a powerful computational tool, and it has the ability to articulate
+and optimize itself, making it effective in processing data with no prior
+training. Although it does not depend on task-specific prior training, deep
+learning still necessitates massive amounts of data for effective analysis and
+processing. To handle the challenge of compiling huge amounts of medical,
+scientific, healthcare, and environmental data for use in deep learning, gated
+architectures like LSTMs and GRUs can be utilized. For multimodal learning,
+shared neurons in the neural network for all activities and specialized neurons
+for particular tasks are necessary.
+
+
+ comment: 64 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Addressing Imperfect Symmetry: a Novel Symmetry-Learning Actor-Critic + Extension + + +
+ Symmetry, a fundamental concept to understand our environment, often +oversimplifies reality from a mathematical perspective. Humans are a prime +example, deviating from perfect symmetry in terms of appearance and cognitive +biases (e.g. having a dominant hand). Nevertheless, our brain can easily +overcome these imperfections and efficiently adapt to symmetrical tasks. The +driving motivation behind this work lies in capturing this ability through +reinforcement learning. To this end, we introduce Adaptive Symmetry Learning +(ASL) $\unicode{x2013}$ a model-minimization actor-critic extension that +addresses incomplete or inexact symmetry descriptions by adapting itself during +the learning process. ASL consists of a symmetry fitting component and a +modular loss function that enforces a common symmetric relation across all +states while adapting to the learned policy. The performance of ASL is compared +to existing symmetry-enhanced methods in a case study involving a four-legged +ant model for multidirectional locomotion tasks. The results demonstrate that +ASL is capable of recovering from large perturbations and generalizing +knowledge to hidden symmetric states. It achieves comparable or better +performance than alternative methods in most scenarios, making it a valuable +approach for leveraging model symmetry while compensating for inherent +perturbations. + +
+
+
+
+
+ + ☆ Improved Outlier Robust Seeding for k-means + + +
+ $k$-means is a popular clustering objective, although it is inherently
+non-robust and sensitive to outliers. Its popular seeding or initialization
+called $k$-means++ uses $D^{2}$ sampling and comes with a provable $O(\log k)$
+approximation guarantee \cite{AV2007}. However, in the presence of adversarial
+noise or outliers, $D^{2}$ sampling is more likely to pick centers from distant
+outliers instead of inlier clusters, and therefore its approximation guarantee
+\textit{w.r.t.} the $k$-means solution on inliers does not hold.
+ Assuming that the outliers constitute a constant fraction of the given data,
+we propose a simple variant of the $D^2$ sampling distribution, which makes it
+robust to the outliers. Our algorithm runs in $O(ndk)$ time, outputs $O(k)$
+clusters, discards marginally more points than the optimal number of outliers,
+and comes with a provable $O(1)$ approximation guarantee.
+ Our algorithm can also be modified to output exactly $k$ clusters instead of
+$O(k)$ clusters, while keeping its running time linear in $n$ and $d$. This is
+an improvement over previous results for robust $k$-means based on LP
+relaxation and rounding \cite{Charikar}, \cite{KrishnaswamyLS18} and
+\textit{robust $k$-means++} \cite{DeshpandeKP20}. Our empirical results show
+the advantage of our algorithm over $k$-means++~\cite{AV2007}, uniform random
+seeding, greedy sampling for $k$-means~\cite{tkmeanspp}, and robust
+$k$-means++~\cite{DeshpandeKP20}, on standard real-world and synthetic data
+sets used in previous work. Our proposal is easily amenable to scalable,
+faster, parallel implementations of $k$-means++ \cite{Bahmani,BachemL017} and
+is of independent interest for coreset constructions in the presence of
+outliers \cite{feldman2007ptas,langberg2010universal,feldman2011unified}.
+
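+ The sketch below shows a robustified $D^2$ seeding step of the kind discussed
+above. The particular robustification (truncating extreme squared distances and
+mixing in a small uniform term) is an illustrative assumption, not the exact
+sampling distribution analyzed in the paper.
+
+    import numpy as np
+
+    def robust_d2_seeding(X, k, alpha=0.1, seed=None):
+        """k-means++-style seeding where far-away outliers cannot dominate the
+        D^2 distribution: squared distances are capped at a high quantile and a
+        small uniform term is added. Illustrative variant only."""
+        rng = np.random.default_rng(seed)
+        n = X.shape[0]
+        centers = [X[rng.integers(n)]]
+        for _ in range(k - 1):
+            diffs = X[:, None, :] - np.asarray(centers)[None, :, :]
+            d2 = (diffs ** 2).sum(-1).min(axis=1)          # distance to nearest center
+            cap = np.quantile(d2, 0.9)                     # truncate extreme distances
+            weights = np.minimum(d2, cap) + alpha * d2.mean()
+            centers.append(X[rng.choice(n, p=weights / weights.sum())])
+        return np.asarray(centers)
+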
+
+
+
+
+ + ☆ Certifying LLM Safety against Adversarial Prompting + + +
+ Large language models (LLMs) released for public use incorporate guardrails +to ensure their output is safe, often referred to as "model alignment." An +aligned language model should decline a user's request to produce harmful +content. However, such safety measures are vulnerable to adversarial prompts, +which contain maliciously designed token sequences to circumvent the model's +safety guards and cause it to produce harmful content. In this work, we +introduce erase-and-check, the first framework to defend against adversarial +prompts with verifiable safety guarantees. We erase tokens individually and +inspect the resulting subsequences using a safety filter. Our procedure labels +the input prompt as harmful if any subsequences or the input prompt are +detected as harmful by the filter. This guarantees that any adversarial +modification of a harmful prompt up to a certain size is also labeled harmful. +We defend against three attack modes: i) adversarial suffix, which appends an +adversarial sequence at the end of the prompt; ii) adversarial insertion, where +the adversarial sequence is inserted anywhere in the middle of the prompt; and +iii) adversarial infusion, where adversarial tokens are inserted at arbitrary +positions in the prompt, not necessarily as a contiguous block. Empirical +results demonstrate that our technique obtains strong certified safety +guarantees on harmful prompts while maintaining good performance on safe +prompts. For example, against adversarial suffixes of length 20, it certifiably +detects 93% of the harmful prompts and labels 94% of the safe prompts as safe +using the open source language model Llama 2 as the safety filter. + +
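+ The certified guarantee comes from exhaustively checking erased versions of
+the prompt. A minimal sketch of the suffix mode is shown below, assuming a
+black-box safety filter is_harmful that maps a token sequence to True/False;
+the insertion and infusion modes check larger families of subsequences.
+
+    def erase_and_check_suffix(prompt_tokens, is_harmful, max_erase=20):
+        """Label the prompt harmful if the full prompt, or any version with up
+        to `max_erase` trailing tokens erased, is flagged by the safety filter.
+        Any adversarial suffix of length <= max_erase is then caught."""
+        if is_harmful(prompt_tokens):
+            return True
+        for i in range(1, min(max_erase, len(prompt_tokens) - 1) + 1):
+            if is_harmful(prompt_tokens[:-i]):   # erase the last i tokens
+                return True
+        return False
+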
+
+
+
+
+ + ☆ Diffusion-EDFs: Bi-equivariant Denoising Generative Modeling on SE(3) + for Visual Robotic Manipulation + + +
+ Recent studies have verified that equivariant methods can significantly +improve the data efficiency, generalizability, and robustness in robot +learning. Meanwhile, denoising diffusion-based generative modeling has recently +gained significant attention as a promising approach for robotic manipulation +learning from demonstrations with stochastic behaviors. In this paper, we +present Diffusion-EDFs, a novel approach that incorporates spatial +roto-translation equivariance, i.e., SE(3)-equivariance to diffusion generative +modeling. By integrating SE(3)-equivariance into our model architectures, we +demonstrate that our proposed method exhibits remarkable data efficiency, +requiring only 5 to 10 task demonstrations for effective end-to-end training. +Furthermore, our approach showcases superior generalizability compared to +previous diffusion-based manipulation methods. + +
+
+ comment: 27 pages, 4 figures +
+
+
+
+
+ + ☆ RLSynC: Offline-Online Reinforcement Learning for Synthon Completion + + +
+ Retrosynthesis is the process of determining the set of reactant molecules +that can react to form a desired product. Semi-template-based retrosynthesis +methods, which imitate the reverse logic of synthesis reactions, first predict +the reaction centers in the products, and then complete the resulting synthons +back into reactants. These methods enable necessary interpretability and high +practical utility to inform synthesis planning. We develop a new offline-online +reinforcement learning method RLSynC for synthon completion in +semi-template-based methods. RLSynC assigns one agent to each synthon, all of +which complete the synthons by conducting actions step by step in a +synchronized fashion. RLSynC learns the policy from both offline training +episodes and online interactions which allow RLSynC to explore new reaction +spaces. RLSynC uses a forward synthesis model to evaluate the likelihood of the +predicted reactants in synthesizing a product, and thus guides the action +search. We compare RLSynC with the state-of-the-art retrosynthesis methods. Our +experimental results demonstrate that RLSynC can outperform these methods with +improvement as high as 14.9% on synthon completion, and 14.0% on +retrosynthesis, highlighting its potential in synthesis planning. + +
+
+ comment: 11 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ Marketing Budget Allocation with Offline Constrained Deep Reinforcement + Learning WSDM 23 + + +
+ We study the budget allocation problem in online marketing campaigns that
+utilize previously collected offline data. We first discuss the long-term
+effect of optimizing marketing budget allocation decisions in the offline
+setting. To overcome the challenge, we propose a novel game-theoretic offline
+value-based reinforcement learning method using mixed policies. The proposed
+method reduces the requirement of previous methods to store infinitely many
+policies down to only constantly many policies, which achieves nearly optimal
+policy efficiency, making it practical and favorable for industrial usage. We
+further show that this method is guaranteed to converge to the optimal policy,
+which cannot be achieved by previous value-based reinforcement learning methods
+for marketing budget allocation. Our experiments on a large-scale marketing
+campaign with tens of millions of users and a budget of more than one billion
+verify the theoretical results and show that the proposed method outperforms
+various baseline methods. The proposed method has been successfully deployed to
+serve all the traffic of this marketing campaign.
+
+
+ comment: WSDM 23, Best Paper Candidate +
+
+
+
+
+ + ☆ Contrastive Learning as Kernel Approximation + + +
+ In standard supervised machine learning, it is necessary to provide a label +for every input in the data. While raw data in many application domains is +easily obtainable on the Internet, manual labelling of this data is +prohibitively expensive. To circumvent this issue, contrastive learning methods +produce low-dimensional vector representations (also called features) of +high-dimensional inputs on large unlabelled datasets. This is done by training +with a contrastive loss function, which enforces that similar inputs have high +inner product and dissimilar inputs have low inner product in the feature +space. Rather than annotating each input individually, it suffices to define a +means of sampling pairs of similar and dissimilar inputs. Contrastive features +can then be fed as inputs to supervised learning systems on much smaller +labelled datasets to obtain high accuracy on end tasks of interest. + The goal of this thesis is to provide an overview of the current theoretical +understanding of contrastive learning, specifically as it pertains to the +minimizers of contrastive loss functions and their relationship to prior +methods for learning features from unlabelled data. We highlight popular +contrastive loss functions whose minimizers implicitly approximate a positive +semidefinite (PSD) kernel. The latter is a well-studied object in functional +analysis and learning theory that formalizes a notion of similarity between +elements of a space. PSD kernels provide an implicit definition of features +through the theory of reproducing kernel Hilbert spaces. + +
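+ As a concrete example of the kind of loss discussed in the thesis, the sketch
+below implements a standard InfoNCE-style contrastive objective: similar pairs
+are pushed toward high inner product and all other in-batch pairs act as
+dissimilar ones. The temperature and the use of L2-normalized features are
+common choices assumed here for illustration.
+
+    import torch
+    import torch.nn.functional as F
+
+    def info_nce(z_anchor, z_positive, temperature=0.1):
+        """Contrastive loss over pairwise inner products; the learned features
+        then behave like the feature map of a positive semidefinite kernel
+        k(x, x') ~ <f(x), f(x')>."""
+        z_a = F.normalize(z_anchor, dim=-1)
+        z_p = F.normalize(z_positive, dim=-1)
+        logits = z_a @ z_p.t() / temperature              # (B, B) inner products
+        labels = torch.arange(z_a.size(0), device=z_a.device)
+        return F.cross_entropy(logits, labels)
+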
+
+ comment: Master's (M.Sc.) Thesis +
+
+
+
+
+ + ☆ TFBEST: Dual-Aspect Transformer with Learnable Positional Encoding for + Failure Prediction + + +
+ Hard Disk Drive (HDD) failures in datacenters are costly - from catastrophic
+data loss to a question of goodwill, stakeholders want to avoid them like the
+plague. An important tool in proactively monitoring against HDD failure is
+timely estimation of the Remaining Useful Life (RUL). To this end, the
+Self-Monitoring, Analysis and Reporting Technology employed within HDDs
+(S.M.A.R.T.) provides critical logs for long-term maintenance of the security
+and dependability of these essential data storage devices. Data-driven
+predictive models in the past have used these S.M.A.R.T. logs and CNN/RNN based
+architectures heavily. However, they have suffered significantly in providing a
+confidence interval around the predicted RUL values as well as in processing
+very long sequences of logs. In addition, some of these approaches, such as
+those based on LSTMs, are inherently slow to train and have tedious feature
+engineering overheads. To overcome these challenges, in this work we propose a
+novel transformer architecture - a Temporal-fusion Bi-encoder Self-attention
+Transformer (TFBEST) for predicting failures in hard-drives. It is an
+encoder-decoder based deep learning technique that enhances the context gained
+from understanding health statistics sequences and predicts a sequence of the
+number of days remaining before a disk potentially fails. In this paper, we
+also provide a novel confidence margin statistic that can help manufacturers
+replace a hard-drive within a time frame. Experiments on Seagate HDD data show
+that our method significantly outperforms the state-of-the-art RUL prediction
+methods during testing over the exhaustive 10-year data from Backblaze
+(2013-present). Although validated on HDD failure prediction, the TFBEST
+architecture is well-suited for other prognostics applications and may be
+adapted for allied regression problems.
+
+
+ comment: 9 pages, 6 figures, 2 tables +
+
+
+
+
+ + ☆ Epi-Curriculum: Episodic Curriculum Learning for Low-Resource Domain + Adaptation in Neural Machine Translation + + +
+ Neural Machine Translation (NMT) models have become successful, but their
+performance remains poor when translating on new domains with a limited amount
+of data. In this paper, we present a novel approach, Epi-Curriculum, to address
+low-resource domain adaptation (DA), which contains a new episodic training
+framework along with denoised curriculum learning. Our episodic training
+framework enhances the model's robustness to domain shift by episodically
+exposing the encoder/decoder to an inexperienced decoder/encoder. The denoised
+curriculum learning filters the noisy data and further improves the model's
+adaptability by gradually guiding the learning process from easy to more
+difficult tasks. Experiments on English-German and English-Romanian translation
+show that: (i) Epi-Curriculum improves both the model's robustness and
+adaptability in seen and unseen domains; (ii) our episodic training framework
+enhances the encoder and decoder's robustness to domain shift.
+
+
+
+
+
+ + ☆ Multiclass Alignment of Confidence and Certainty for Network Calibration + + +
+ Deep neural networks (DNNs) have made great strides in pushing the +state-of-the-art in several challenging domains. Recent studies reveal that +they are prone to making overconfident predictions. This greatly reduces the +overall trust in model predictions, especially in safety-critical applications. +Early work in improving model calibration employs post-processing techniques +which rely on limited parameters and require a hold-out set. Some recent +train-time calibration methods, which involve all model parameters, can +outperform the postprocessing methods. To this end, we propose a new train-time +calibration method, which features a simple, plug-and-play auxiliary loss known +as multi-class alignment of predictive mean confidence and predictive certainty +(MACC). It is based on the observation that a model miscalibration is directly +related to its predictive certainty, so a higher gap between the mean +confidence and certainty amounts to a poor calibration both for in-distribution +and out-of-distribution predictions. Armed with this insight, our proposed loss +explicitly encourages a confident (or underconfident) model to also provide a +low (or high) spread in the presoftmax distribution. Extensive experiments on +ten challenging datasets, covering in-domain, out-domain, non-visual +recognition and medical image classification scenarios, show that our method +achieves state-of-the-art calibration performance for both in-domain and +out-domain predictions. Our code and models will be publicly released. + +
+
+ comment: Accepted at GCPR 2023 +
+
+
+
+
+ + ☆ Deep Reinforcement Learning from Hierarchical Weak Preference Feedback + + +
+ Reward design is a fundamental, yet challenging aspect of practical +reinforcement learning (RL). For simple tasks, researchers typically handcraft +the reward function, e.g., using a linear combination of several reward +factors. However, such reward engineering is subject to approximation bias, +incurs large tuning cost, and often cannot provide the granularity required for +complex tasks. To avoid these difficulties, researchers have turned to +reinforcement learning from human feedback (RLHF), which learns a reward +function from human preferences between pairs of trajectory sequences. By +leveraging preference-based reward modeling, RLHF learns complex rewards that +are well aligned with human preferences, allowing RL to tackle increasingly +difficult problems. Unfortunately, the applicability of RLHF is limited due to +the high cost and difficulty of obtaining human preference data. In light of +this cost, we investigate learning reward functions for complex tasks with less +human effort; simply by ranking the importance of the reward factors. More +specifically, we propose a new RL framework -- HERON, which compares +trajectories using a hierarchical decision tree induced by the given ranking. +These comparisons are used to train a preference-based reward model, which is +then used for policy learning. We find that our framework can not only train +high performing agents on a variety of difficult tasks, but also provide +additional benefits such as improved sample efficiency and robustness. Our code +is available at https://github.com/abukharin3/HERON. + +
+
+ comment: 28 Pages, 15 figures +
+
+
+
+
+ + ☆ Community-Based Hierarchical Positive-Unlabeled (PU) Model Fusion for + Chronic Disease Prediction CIKM 2023 + + +
+ Positive-Unlabeled (PU) Learning is a challenge presented by binary
+classification problems where there is an abundance of unlabeled data along
+with a small number of positive data instances, which can be used to address
+the chronic disease screening problem. State-of-the-art PU learning methods
+have resulted in the development of various risk estimators, yet they neglect
+the differences among distinct populations. To address this issue, we present a
+novel Positive-Unlabeled Learning Tree (PUtree) algorithm. PUtree is designed
+to take into account communities, such as different age or income brackets, in
+chronic disease prediction tasks. We propose a novel approach for binary
+decision-making, which hierarchically builds community-based PU models and then
+aggregates their deliverables. Our method can explicate each PU model on the
+tree to optimize non-leaf PU node splitting. Furthermore, a mask-recovery
+data augmentation strategy enables sufficient training of the model in
+individual communities. Additionally, the proposed approach includes an
+adversarial PU risk estimator to capture hierarchical PU relationships, and a
+model fusion network that integrates data from each tree path, resulting in
+robust binary classification results. We demonstrate the superior performance
+of PUtree as well as its variants on two benchmarks and a new
+diabetes-prediction dataset.
+
+
+ comment: Accepted by CIKM 2023 as a long paper +
+
+
+
+
+ + ☆ ViewMix: Augmentation for Robust Representation in Self-Supervised + Learning + + +
+ Joint Embedding Architecture-based self-supervised learning methods have
+attributed the composition of data augmentations as a crucial factor for their
+strong representation learning capabilities. While regional dropout strategies
+have proven to guide models to focus on less indicative parts of the objects
+in supervised methods, they have not been adopted by self-supervised methods
+for generating positive pairs. This is because regional dropout methods are not
+suitable for the input sampling process of the self-supervised methodology.
+Whereas dropping informative pixels from the positive pairs can result in
+inefficient training, replacing patches of a specific object with a different
+one can steer the model away from maximizing the agreement between different
+positive pairs. Moreover, joint embedding representation learning methods have
+not made robustness their primary training outcome. To this end, we propose the
+ViewMix augmentation policy, specially designed for self-supervised learning:
+upon generating different views of the same image, patches are cut and pasted
+from one view to another. By leveraging the different views created by this
+augmentation strategy, multiple joint embedding-based self-supervised
+methodologies obtained better localization capability and consistently
+outperformed their corresponding baseline methods. It is also demonstrated that
+incorporating the ViewMix augmentation policy promotes robustness of the
+representations in the state-of-the-art methods. Furthermore, our
+experimentation and analysis of compute times suggest that ViewMix augmentation
+doesn't introduce any additional overhead compared to its counterparts.
+
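+ A simplified sketch of the cut-and-paste step is shown below for a batch of
+image views of shape (B, C, H, W); the patch size and sampling scheme are
+assumptions in the spirit of CutMix-style augmentations, not the exact ViewMix
+recipe.
+
+    import torch
+
+    def viewmix(view_a, view_b, patch_frac=0.3):
+        """Cut a random rectangular patch from view_b and paste it at the same
+        location into view_a, producing a mixed positive view."""
+        B, C, H, W = view_a.shape
+        ph, pw = int(H * patch_frac), int(W * patch_frac)
+        top = torch.randint(0, H - ph + 1, (1,)).item()
+        left = torch.randint(0, W - pw + 1, (1,)).item()
+        mixed = view_a.clone()
+        mixed[:, :, top:top + ph, left:left + pw] = \
+            view_b[:, :, top:top + ph, left:left + pw]
+        return mixed
+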
+
+
+
+
+ + ☆ Ensemble linear interpolators: The role of ensembling + + +
+ Interpolators are unstable. For example, the minimum $\ell_2$ norm least
+square interpolator exhibits unbounded test errors when dealing with noisy
+data. In this paper, we study how ensembling stabilizes and thus improves the
+generalization performance, measured by the out-of-sample prediction risk, of
+an individual interpolator. We focus on bagged linear interpolators, as bagging
+is a popular randomization-based ensemble method that can be implemented in
+parallel. We introduce the multiplier-bootstrap-based bagged least square
+estimator, which can then be formulated as an average of the sketched least
+square estimators. The proposed multiplier bootstrap encompasses the classical
+bootstrap with replacement as a special case, along with a more intriguing
+variant which we call the Bernoulli bootstrap.
+ Focusing on the proportional regime where the sample size scales
+proportionally with the feature dimensionality, we investigate the
+out-of-sample prediction risks of the sketched and bagged least square
+estimators in both underparameterized and overparameterized regimes. Our
+results reveal the statistical roles of sketching and bagging. In particular,
+sketching modifies the aspect ratio and shifts the interpolation threshold of
+the minimum $\ell_2$ norm estimator. However, the risk of the sketched
+estimator continues to be unbounded around the interpolation threshold due to
+excessive variance. In stark contrast, bagging effectively mitigates this
+variance, leading to a bounded limiting out-of-sample prediction risk. To
+further understand this stability improvement property, we establish that
+bagging acts as a form of implicit regularization, substantiated by the
+equivalence of the bagged estimator with its explicitly regularized
+counterpart. We also discuss several extensions.
+
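+ A small numerical sketch of the bagged estimator is given below, using the
+Bernoulli variant of the multiplier bootstrap (each sample is independently
+kept with probability p_keep) and the minimum-norm least squares solution per
+bag; the number of bags and p_keep are illustrative choices.
+
+    import numpy as np
+
+    def bagged_minnorm_ls(X, y, X_test, n_bags=50, p_keep=0.7, seed=None):
+        """Average the predictions of minimum-norm least squares interpolators
+        fitted on Bernoulli-subsampled versions of the data."""
+        rng = np.random.default_rng(seed)
+        preds = []
+        for _ in range(n_bags):
+            keep = rng.random(X.shape[0]) < p_keep
+            w = np.linalg.pinv(X[keep]) @ y[keep]   # min-norm LS on the bag
+            preds.append(X_test @ w)
+        return np.mean(preds, axis=0)
+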
+
+ comment: 30-page main text including figures and tables, 50-page appendix +
+
+
+
+
+ + ☆ Source Camera Identification and Detection in Digital Videos through + Blind Forensics + + +
+ Source camera identification in digital videos is the problem of associating
+an unknown digital video with its source device, within a closed set of
+possible devices. The existing techniques in source detection of digital videos
+try to find a fingerprint of the actual source in the video in the form of the
+PRNU (Photo Response Non-Uniformity), and match it against the SPN (Sensor
+Pattern Noise) of each possible device. The highest correlation indicates the
+correct source. We investigate the problem of identifying a video source
+through a feature-based approach using machine learning. In this paper, we
+present a blind forensic technique for video source authentication and
+identification, based on feature extraction, feature selection and subsequent
+source classification. The main aim is to determine whether a claimed source
+for a video is actually its original source. If not, we identify its original
+source. Our experimental results demonstrate the efficiency of the proposed
+method compared to the traditional fingerprint-based technique.
+
+
+ comment: Submitted to IEEE for inclusion in Xplore- Digital Library. Paper + presented at the International Conference on Recent Trends in Computational + Engineering & Technologies (ICRTCET 18)with Paper Id: ICRTCET-227 +
+
+
+
+
+ + ☆ Using Neural Networks for Fast SAR Roughness Estimation of High + Resolution Images + + +
+ The analysis of Synthetic Aperture Radar (SAR) imagery is an important step
+in remote sensing applications, and it is a challenging problem due to its
+inherent speckle noise. One typical solution is to model the data using the
+$G_I^0$ distribution and extract its roughness information, which in turn can
+be used in posterior imaging tasks, such as segmentation, classification and
+interpretation. This leads to the need for quick and reliable estimation of the
+roughness parameter from SAR data, especially with high resolution images.
+Unfortunately, traditional parameter estimation procedures are slow and prone
+to estimation failures. In this work, we propose a neural network-based
+estimation framework that first learns how to predict underlying parameters of
+$G_I^0$ samples and then can be used to estimate the roughness of unseen data.
+We show that this approach leads to an estimator that is quicker, yields lower
+estimation error and is less prone to failures than the traditional estimation
+procedures for this problem, even when we use a simple network. More
+importantly, we show that this same methodology can be generalized to handle
+image inputs and, even if trained on purely synthetic data for a few seconds,
+is able to perform real-time pixel-wise roughness estimation for high
+resolution real SAR imagery.
+
+
+
+
+
+ + ☆ REBOOT: Reuse Data for Bootstrapping Efficient Real-World Dexterous + Manipulation + + +
+ Dexterous manipulation tasks involving contact-rich interactions pose a +significant challenge for both model-based control systems and imitation +learning algorithms. The complexity arises from the need for multi-fingered +robotic hands to dynamically establish and break contacts, balance +non-prehensile forces, and control large degrees of freedom. Reinforcement +learning (RL) offers a promising approach due to its general applicability and +capacity to autonomously acquire optimal manipulation strategies. However, its +real-world application is often hindered by the necessity to generate a large +number of samples, reset the environment, and obtain reward signals. In this +work, we introduce an efficient system for learning dexterous manipulation +skills with RL to alleviate these challenges. The main idea of our approach is +the integration of recent advances in sample-efficient RL and replay buffer +bootstrapping. This combination allows us to utilize data from different tasks +or objects as a starting point for training new tasks, significantly improving +learning efficiency. Additionally, our system completes the real-world training +cycle by incorporating learned resets via an imitation-based pickup policy as +well as learned reward functions, eliminating the need for manual resets and +reward engineering. We demonstrate the benefits of reusing past data as replay +buffer initialization for new tasks, for instance, the fast acquisition of +intricate manipulation skills in the real world on a four-fingered robotic +hand. (Videos: https://sites.google.com/view/reboot-dexterous) + +
+
+ comment: Accepted at CORL 2023. The first two authors contributed equally +
+
+
+
+
+ + ☆ Fitness Approximation through Machine Learning + + +
+ We present a novel approach to performing fitness approximation in genetic +algorithms (GAs) using machine-learning (ML) models, focusing on evolutionary +agents in Gymnasium (game) simulators -- where fitness computation is costly. +Maintaining a dataset of sampled individuals along with their actual fitness +scores, we continually update throughout an evolutionary run a +fitness-approximation ML model. We compare different methods for: 1) switching +between actual and approximate fitness, 2) sampling the population, and 3) +weighting the samples. Experimental findings demonstrate significant +improvement in evolutionary runtimes, with fitness scores that are either +identical or slightly lower than that of the fully run GA -- depending on the +ratio of approximate-to-actual-fitness computation. Our approach is generic and +can be easily applied to many different domains. + +
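+ The sketch below illustrates the general recipe of surrogate-assisted fitness
+evaluation described above: a sampled fraction of each generation is scored
+with the costly true fitness and archived, while the rest is scored by an ML
+model trained on the archive. The surrogate choice (a random forest) and the
+sampling scheme are assumptions for illustration; selection and variation
+operators are omitted.
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestRegressor
+
+    def evaluate_generation(population, true_fitness, surrogate, archive_x,
+                            archive_y, sample_frac=0.2, seed=None):
+        """Score one generation, mixing true and approximated fitness."""
+        rng = np.random.default_rng(seed)
+        fitted = len(archive_x) > 0                      # surrogate usable yet?
+        sampled = rng.random(len(population)) < sample_frac
+        scores = np.empty(len(population))
+        for i, ind in enumerate(population):
+            if sampled[i] or not fitted:
+                scores[i] = true_fitness(ind)            # costly evaluation
+                archive_x.append(ind)
+                archive_y.append(scores[i])
+            else:
+                scores[i] = surrogate.predict([ind])[0]  # cheap approximation
+        surrogate.fit(archive_x, archive_y)              # refresh the surrogate
+        return scores
+
+    # usage: surrogate = RandomForestRegressor(); call evaluate_generation once
+    # per generation inside the GA loop, before selection and variation.
+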
+
+ comment: 9 pages, 5 tables, 2 figures. Submitted to IEEE Transactions on + Emerging Topics in Computational Intelligence +
+
+
+
+
+ + ☆ Robotic Table Tennis: A Case Study into a High Speed Learning System + + +
+ We present a deep-dive into a real-world robotic learning system that, in +previous work, was shown to be capable of hundreds of table tennis rallies with +a human and has the ability to precisely return the ball to desired targets. +This system puts together a highly optimized perception subsystem, a high-speed +low-latency robot controller, a simulation paradigm that can prevent damage in +the real world and also train policies for zero-shot transfer, and automated +real world environment resets that enable autonomous training and evaluation on +physical robots. We complement a complete system description, including +numerous design decisions that are typically not widely disseminated, with a +collection of studies that clarify the importance of mitigating various sources +of latency, accounting for training and deployment distribution shifts, +robustness of the perception system, sensitivity to policy hyper-parameters, +and choice of action space. A video demonstrating the components of the system +and details of experimental results can be found at +https://youtu.be/uFcnWjB42I0. + +
+
+ comment: Published and presented at Robotics: Science and Systems (RSS2023) +
+
+
+
+
+ + ♻ ☆ Spacetime-Efficient Low-Depth Quantum State Preparation with + Applications + + +
+ We propose a novel deterministic method for preparing arbitrary quantum +states. When our protocol is compiled into CNOT and arbitrary single-qubit +gates, it prepares an $N$-dimensional state in depth $O(\log(N))$ and spacetime +allocation (a metric that accounts for the fact that oftentimes some ancilla +qubits need not be active for the entire circuit) $O(N)$, which are both +optimal. When compiled into the $\{\mathrm{H,S,T,CNOT}\}$ gate set, we show +that it requires asymptotically fewer quantum resources than previous methods. +Specifically, it prepares an arbitrary state up to error $\epsilon$ in depth +$O(\log(N/\epsilon))$ and spacetime allocation $O(N\log(\log(N)/\epsilon))$, +improving over $O(\log(N)\log(N/\epsilon))$ and $O(N\log(N/\epsilon))$, +respectively. We illustrate how the reduced spacetime allocation of our +protocol enables rapid preparation of many disjoint states with only +constant-factor ancilla overhead -- $O(N)$ ancilla qubits are reused +efficiently to prepare a product state of $w$ $N$-dimensional states in depth +$O(w + \log(N))$ rather than $O(w\log(N))$, achieving effectively constant +depth per state. We highlight several applications where this ability would be +useful, including quantum machine learning, Hamiltonian simulation, and solving +linear systems of equations. We provide quantum circuit descriptions of our +protocol, detailed pseudocode, and gate-level implementation examples using +Braket. + +
+
+
+
+
+ + ♻ ☆ Interpretation of High-Dimensional Linear Regression: Effects of + Nullspace and Regularization Demonstrated on Battery Data + + +
+ High-dimensional linear regression is important in many scientific fields. +This article considers discrete measured data of underlying smooth latent +processes, as is often obtained from chemical or biological systems. +Interpretation in high dimensions is challenging because the nullspace and its +interplay with regularization shapes regression coefficients. The data's +nullspace contains all coefficients that satisfy $\mathbf{Xw}=\mathbf{0}$, thus +allowing very different coefficients to yield identical predictions. We +developed an optimization formulation to compare regression coefficients and +coefficients obtained by physical engineering knowledge to understand which +part of the coefficient differences are close to the nullspace. This nullspace +method is tested on a synthetic example and lithium-ion battery data. The case +studies show that regularization and z-scoring are design choices that, if +chosen corresponding to prior physical knowledge, lead to interpretable +regression results. Otherwise, the combination of the nullspace and +regularization hinders interpretability and can make it impossible to obtain +regression coefficients close to the true coefficients when there is a true +underlying linear model. Furthermore, we demonstrate that regression methods +that do not produce coefficients orthogonal to the nullspace, such as fused +lasso, can improve interpretability. In conclusion, the insights gained from +the nullspace perspective help to make informed design choices for building +regression models on high-dimensional data and reasoning about potential +underlying linear models, which are important for system optimization and +improving scientific understanding. + +
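+ A minimal sketch of the nullspace decomposition at the heart of this analysis
+is shown below: the difference between a regression coefficient vector and a
+physically motivated one is split into its component inside the nullspace of
+the data matrix (which cannot change the predictions Xw) and the component
+orthogonal to it. The tolerance used for the numerical rank is an assumption.
+
+    import numpy as np
+
+    def nullspace_split(X, w_diff, tol=1e-10):
+        """Return (part of w_diff in null(X), remaining orthogonal part)."""
+        _, s, Vt = np.linalg.svd(X, full_matrices=True)
+        rank = int((s > tol * s.max()).sum())
+        V_null = Vt[rank:].T                    # orthonormal basis of null(X)
+        w_null = V_null @ (V_null.T @ w_diff)   # projection onto the nullspace
+        return w_null, w_diff - w_null
+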
+
+ comment: Manuscript: 14 pages, 7 figures; Supplementary Information: 4 pages, + 2 figures; Code available: https://github.com/JoachimSchaeffer/HDRegAnalytics +
+
+
+
+
+ + ♻ ☆ Loss Functions and Metrics in Deep Learning + + +
+ One of the essential components of deep learning is the choice of the loss +function and performance metrics used to train and evaluate models. This paper +reviews the most prevalent loss functions and performance measurements in deep +learning. We examine the benefits and limits of each technique and illustrate +their application to various deep-learning problems. Our review aims to give a +comprehensive picture of the different loss functions and performance +indicators used in the most common deep learning tasks and help practitioners +choose the best method for their specific task. + +
+
+ comment: 53 pages, 5 figures, 7 tables, 86 equations +
+
+
+
+
+ + ♻ ☆ CPPF++: Uncertainty-Aware Sim2Real Object Pose Estimation by Vote + Aggregation + + +
+ Object pose estimation constitutes a critical area within the domain of 3D +vision. While contemporary state-of-the-art methods that leverage real-world +pose annotations have demonstrated commendable performance, the procurement of +such real-world training data incurs substantial costs. This paper focuses on a +specific setting wherein only 3D CAD models are utilized as a priori knowledge, +devoid of any background or clutter information. We introduce a novel method, +CPPF++, designed for sim-to-real pose estimation. This method builds upon the +foundational point-pair voting scheme of CPPF, reconceptualizing it through a +probabilistic lens. To address the challenge of voting collision, we model +voting uncertainty by estimating the probabilistic distribution of each point +pair within the canonical space. This approach is further augmented by +iterative noise filtering, employed to eradicate votes associated with +backgrounds or clutters. Additionally, we enhance the context provided by each +voting unit by introducing $N$-point tuples. In conjunction with this +methodological contribution, we present a new category-level pose estimation +dataset, DiversePose 300. This dataset is specifically crafted to facilitate a +more rigorous evaluation of current state-of-the-art methods, encompassing a +broader and more challenging array of real-world scenarios. Empirical results +substantiate the efficacy of our proposed method, revealing a significant +reduction in the disparity between simulation and real-world performance. + +
+
+
+
+
+ + ♻ ☆ Open problems in causal structure learning: A case study of COVID-19 in + the UK + + +
+ Causal machine learning (ML) algorithms recover graphical structures that
+tell us something about cause-and-effect relationships. The causal
+representation provided by these algorithms enables transparency and
+explainability, which is necessary for decision making in critical real-world
+problems. Yet, causal ML has had limited impact in practice compared to
+associational ML. This paper investigates the challenges of causal ML with
+application to COVID-19 UK pandemic data. We collate data from various public
+sources and investigate what the various structure learning algorithms learn
+from these data. We explore the impact of different data formats on algorithms
+spanning different classes of learning, and assess the results produced by each
+algorithm, and groups of algorithms, in terms of graphical structure, model
+dimensionality, sensitivity analysis, confounding variables, predictive and
+interventional inference. We use these results to highlight open problems in
+causal structure learning and directions for future research. To facilitate
+future work, we make all graphs, models, data sets, and source code publicly
+available online.
+
+
+
+
+
+ + ♻ ☆ Towards Privacy-Aware Causal Structure Learning in Federated Setting + + +
+ Causal structure learning has been extensively studied and widely used in +machine learning and various applications. To achieve an ideal performance, +existing causal structure learning algorithms often need to centralize a large +amount of data from multiple data sources. However, in the privacy-preserving +setting, it is impossible to centralize data from all sources and put them +together as a single dataset. To preserve data privacy, federated learning as a +new learning paradigm has attracted much attention in machine learning in +recent years. In this paper, we study a privacy-aware causal structure learning +problem in the federated setting and propose a novel Federated PC (FedPC) +algorithm with two new strategies for preserving data privacy without +centralizing data. Specifically, we first propose a novel layer-wise +aggregation strategy for a seamless adaptation of the PC algorithm into the +federated learning paradigm for federated skeleton learning, then we design an +effective strategy for learning consistent separation sets for federated edge +orientation. The extensive experiments validate that FedPC is effective for +causal structure learning in a federated learning setting. + +
+
+ comment: This paper has been accepted by the journal IEEE Transactions on Big + Data, and it contains 21 pages, 9 figures and 15 tables +
+
+
+
+
+ + ♻ ☆ Conflict-Aware Active Automata Learning (Extended Version) + + +
+ Active automata learning algorithms cannot easily handle conflict in the
+observation data (different outputs observed for the same inputs). This
+inherent inability to recover after a conflict impairs their effective
+applicability in scenarios where noise is present or the system under learning
+is mutating.
+ We propose the Conflict-Aware Active Automata Learning (C3AL) framework to
+enable handling conflicting information during the learning process. The core
+idea is to consider the so-called observation tree as a first-class citizen in
+the learning process. Though this idea is explored in recent work, we take it
+to its full effect by enabling its use with any existing learner and minimizing
+the number of tests performed on the system under learning, especially in the
+face of conflicts. We evaluate C3AL on a large set of benchmarks, covering over
+30 different realistic targets, and over 18,000 different scenarios. The
+results of the evaluation show that C3AL is a suitable alternative framework
+for closed-box learning that can better handle noise and mutations.
+
+
+ comment: 37 pages, 11 figures, GandALF 2023 +
+
+
+
+
+ + ♻ ☆ Memory Efficient Optimizers with 4-bit States + + +
+ Optimizer states are a major source of memory consumption for training neural
+networks, limiting the maximum trainable model within a given memory budget.
+Compressing the optimizer states from 32-bit floating points to lower bitwidth
+is promising for reducing the training memory footprint, while the current
+lowest achievable bitwidth is 8-bit. In this work, we push the optimizer state
+bitwidth down to 4 bits through a detailed empirical analysis of first and
+second moments. Specifically, we find that the moments have complicated outlier
+patterns that current block-wise quantization methods cannot accurately
+approximate. We use a smaller block size and propose to utilize both row-wise
+and column-wise information for better quantization. We further identify a
+zero-point problem when quantizing the second moment, and solve this problem
+with a linear quantizer that excludes the zero point. Our 4-bit optimizer is
+evaluated on a wide variety of benchmarks including natural language
+understanding, machine translation, image classification, and instruction
+tuning. On all the tasks our optimizers achieve accuracy comparable to their
+full-precision counterparts, while enjoying better memory efficiency.
+
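+ For intuition, the toy sketch below shows plain block-wise absmax quantization
+of a 1-D state tensor to a signed low-bit grid; it is only the baseline recipe
+that the paper refines (smaller blocks, row- and column-wise statistics, and a
+zero-point-free quantizer for the second moment), and the block size is an
+assumption.
+
+    import numpy as np
+
+    def blockwise_absmax_quantize(x, bits=4, block=128):
+        """Quantize a 1-D array block by block; each block is scaled by its own
+        max magnitude and rounded to 2**(bits-1)-1 signed levels.
+        Assumes len(x) is a multiple of `block`."""
+        levels = 2 ** (bits - 1) - 1                       # 7 for 4-bit signed
+        blocks = x.reshape(-1, block)
+        scale = np.abs(blocks).max(axis=1, keepdims=True) + 1e-12
+        q = np.round(blocks / scale * levels).astype(np.int8)
+        dequant = (q.astype(np.float32) / levels) * scale
+        return q, scale, dequant.reshape(-1)
+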
+
+ comment: 35 pages +
+
+
+
+
+ + ♻ ☆ A Topological Deep Learning Framework for Neural Spike Decoding + + +
+ The brain's spatial orientation system uses different neuron ensembles to aid
+in environment-based navigation. Two of the ways brains encode spatial
+information are through head direction cells and grid cells. Brains use head
+direction cells to determine orientation whereas grid cells consist of layers
+of decked neurons that overlay to provide environment-based navigation. These
+neurons fire in ensembles where several neurons fire at once to activate a
+single head direction or grid. We want to capture this firing structure and use
+it to decode head direction and grid cell data. Understanding, representing,
+and decoding these neural structures requires models that encompass higher
+order connectivity, more than the 1-dimensional connectivity that traditional
+graph-based models provide. To that end, in this work, we develop a topological
+deep learning framework for neural spike train decoding. Our framework combines
+unsupervised simplicial complex discovery with the power of deep learning via a
+new architecture we develop herein called a simplicial convolutional recurrent
+neural network. Simplicial complexes, topological spaces that use not only
+vertices and edges but also higher-dimensional objects, naturally generalize
+graphs and capture more than just pairwise relationships. Additionally, this
+approach does not require prior knowledge of the neural activity beyond spike
+counts, which removes the need for similarity measurements. The effectiveness
+and versatility of the simplicial convolutional neural network is demonstrated
+on head direction and trajectory prediction via head direction and grid cell
+datasets.
+
+
+
+
+
+ + ♻ ☆ Unified Convergence Theory of Stochastic and Variance-Reduced Cubic + Newton Methods + + +
+ We study stochastic Cubic Newton methods for solving general possibly +non-convex minimization problems. We propose a new framework, which we call the +helper framework, that provides a unified view of the stochastic and +variance-reduced second-order algorithms equipped with global complexity +guarantees. It can also be applied to learning with auxiliary information. Our +helper framework offers the algorithm designer high flexibility for +constructing and analyzing the stochastic Cubic Newton methods, allowing +arbitrary size batches, and the use of noisy and possibly biased estimates of +the gradients and Hessians, incorporating both the variance reduction and the +lazy Hessian updates. We recover the best-known complexities for the stochastic +and variance-reduced Cubic Newton, under weak assumptions on the noise. A +direct consequence of our theory is the new lazy stochastic second-order +method, which significantly improves the arithmetic complexity for large +dimension problems. We also establish complexity bounds for the classes of +gradient-dominated objectives, that include convex and strongly convex +problems. For Auxiliary Learning, we show that using a helper (auxiliary +function) can outperform training alone if a given similarity measure is small. + +
+
+
+
+
+ + ♻ ☆ DynED: Dynamic Ensemble Diversification in Data Stream Classification CIKM '23 + + +
+ Ensemble methods are commonly used in classification due to their remarkable
+performance. Achieving high accuracy in a data stream environment is a
+challenging task considering disruptive changes in the data distribution, also
+known as concept drift. A greater diversity of ensemble components is known to
+enhance prediction accuracy in such settings. Despite the diversity of
+components within an ensemble, not all contribute as expected to its overall
+performance. This necessitates a method for selecting components that exhibit
+high performance and diversity. We present a novel ensemble construction and
+maintenance approach based on MMR (Maximal Marginal Relevance) that dynamically
+combines the diversity and prediction accuracy of components during the process
+of structuring an ensemble. The experimental results on four real and 11
+synthetic datasets demonstrate that the proposed approach (DynED) provides a
+higher average mean accuracy compared to the five state-of-the-art baselines.
+
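+ The MMR-based selection step can be sketched as below, assuming a
+per-component accuracy estimate and a pairwise similarity matrix (e.g.
+prediction agreement on recent stream instances); the trade-off weight lam is
+an illustrative choice.
+
+    import numpy as np
+
+    def mmr_select(components, accuracy, similarity, k, lam=0.7):
+        """Greedily pick k components balancing individual accuracy against
+        maximum similarity to the components already selected."""
+        selected = [int(np.argmax(accuracy))]
+        remaining = set(range(len(components))) - set(selected)
+        while len(selected) < k and remaining:
+            def mmr_score(i):
+                redundancy = max(similarity[i][j] for j in selected)
+                return lam * accuracy[i] - (1 - lam) * redundancy
+            best = max(remaining, key=mmr_score)
+            selected.append(best)
+            remaining.remove(best)
+        return [components[i] for i in selected]
+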
+
+ comment: Proceedings of the 32nd ACM International Conference on Information + and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United + Kingdom +
+
+
+
+
+ + ♻ ☆ Infinite Class Mixup BMVC 2023 + + +
+ Mixup is a widely adopted strategy for training deep networks, where
+additional samples are augmented by interpolating inputs and labels of training
+pairs. Mixup has been shown to improve classification performance, network
+calibration, and out-of-distribution generalisation. While effective, a
+cornerstone of Mixup, namely that networks learn linear behaviour patterns
+between classes, is only indirectly enforced since the output interpolation is
+performed at the probability level. This paper seeks to address this limitation
+by mixing the classifiers directly instead of mixing the labels for each mixed
+pair. We propose to define the target of each augmented sample as a uniquely
+new classifier, whose parameters are a linear interpolation of the classifier
+vectors of the input pair. The space of all possible classifiers is continuous
+and spans all interpolations between classifier pairs. To make optimisation
+tractable, we propose a dual-contrastive Infinite Class Mixup loss, where we
+contrast the classifier of a mixed pair to both the classifiers and the
+predicted outputs of other mixed pairs in a batch. Infinite Class Mixup is
+generic in nature and applies to many variants of Mixup. Empirically, we show
+that it outperforms standard Mixup and variants such as RegMixup and Remix on
+balanced, long-tailed, and data-constrained benchmarks, highlighting its broad
+applicability.
+
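+ The core construction can be sketched as follows for a linear classification
+head: each mixed sample is scored against the interpolated classifier vectors
+of all mixed pairs in the batch, and the matching pair is the positive. This is
+a single-view simplification (no bias terms, one mixing coefficient per batch)
+rather than the full dual-contrastive loss.
+
+    import torch
+    import torch.nn.functional as F
+
+    def infinite_class_mixup_loss(features, classifier_weights, labels, lam):
+        """features: (B, D); classifier_weights: (num_classes, D); labels: (B,)."""
+        perm = torch.randperm(features.size(0))
+        mixed_feat = lam * features + (1 - lam) * features[perm]
+        w = classifier_weights[labels]                            # (B, D)
+        mixed_w = lam * w + (1 - lam) * classifier_weights[labels[perm]]
+        logits = mixed_feat @ mixed_w.t()                         # (B, B) scores
+        targets = torch.arange(features.size(0), device=features.device)
+        return F.cross_entropy(logits, targets)
+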
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Learning Variational Models with Unrolling and Bilevel Optimization + + +
+ In this paper we consider the problem of learning variational models in the +context of supervised learning via risk minimization. Our goal is to provide a +deeper understanding of the two approaches of learning of variational models +via bilevel optimization and via algorithm unrolling. The former considers the +variational model as a lower level optimization problem below the risk +minimization problem, while the latter replaces the lower level optimization +problem by an algorithm that solves said problem approximately. Both approaches +are used in practice, but unrolling is much simpler from a computational point +of view. To analyze and compare the two approaches, we consider a simple toy +model, and compute all risks and the respective estimators explicitly. We show +that unrolling can be better than the bilevel optimization approach, but also +that the performance of unrolling can depend significantly on further +parameters, sometimes in unexpected ways: While the stepsize of the unrolled +algorithm matters a lot (and learning the stepsize gives a significant +improvement), the number of unrolled iterations plays a minor role. + +
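+ The unrolling approach is easy to state in code: the lower-level solver is
+replaced by a fixed number of gradient steps, and because the loop is
+differentiable, the stepsize (and any model parameters inside the gradient map)
+can be learned by backpropagating the upper-level risk through the iterations.
+The quadratic toy problem below is an assumption made for illustration, not the
+paper's model.
+
+    import torch
+
+    def unrolled_solver(grad_fn, x0, stepsize, n_iters=10):
+        """Approximate the lower-level minimizer by n_iters gradient steps."""
+        x = x0
+        for _ in range(n_iters):
+            x = x - stepsize * grad_fn(x)
+        return x
+
+    # Learn the stepsize of an unrolled ridge-type solver on a toy problem.
+    stepsize = torch.nn.Parameter(torch.tensor(0.02))
+    A, b = torch.randn(20, 5), torch.randn(20)
+    x_hat = unrolled_solver(lambda x: A.t() @ (A @ x - b) + 0.1 * x,
+                            torch.zeros(5), stepsize)
+    upper_risk = ((x_hat - torch.ones(5)) ** 2).mean()   # toy upper-level risk
+    upper_risk.backward()                                # gradient w.r.t. stepsize
+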
+
+
+
+
+ + ♻ ☆ Deep Metric Learning with Chance Constraints WACV + + +
+ Deep metric learning (DML) aims to minimize empirical expected loss of the +pairwise intra-/inter- class proximity violations in the embedding space. We +relate DML to feasibility problem of finite chance constraints. We show that +minimizer of proxy-based DML satisfies certain chance constraints, and that the +worst case generalization performance of the proxy-based methods can be +characterized by the radius of the smallest ball around a class proxy to cover +the entire domain of the corresponding class samples, suggesting multiple +proxies per class helps performance. To provide a scalable algorithm as well as +exploiting more proxies, we consider the chance constraints implied by the +minimizers of proxy-based DML instances and reformulate DML as finding a +feasible point in intersection of such constraints, resulting in a problem to +be approximately solved by iterative projections. Simply put, we repeatedly +train a regularized proxy-based loss and re-initialize the proxies with the +embeddings of the deliberately selected new samples. We applied our method with +4 well-accepted DML losses and show the effectiveness with extensive +evaluations on 4 popular DML benchmarks. Code is available at: +https://github.com/yetigurbuz/ccp-dml + +
+
+ comment: Accepted as a conference paper at IEEE/CVF Winter Conference on + Applications of Computer Vision (WACV) 2024 +
+
+
+
+
+ + ♻ ☆ Improving Scientific Machine Learning via Attention and Multiple + Shooting + + +
+ Scientific Machine Learning (SciML) is a burgeoning field that +synergistically combines domain-aware and interpretable models with agnostic +machine learning techniques. In this work, we introduce GOKU-UI, an evolution +of the SciML generative model GOKU-nets. GOKU-UI not only broadens the original +model's spectrum to incorporate other classes of differential equations, such +as Stochastic Differential Equations (SDEs), but also integrates attention +mechanisms and a novel multiple shooting training strategy in the latent space. +These enhancements have led to a significant increase in its performance in +both reconstruction and forecast tasks, as demonstrated by our evaluation of +simulated and empirical data. Specifically, GOKU-UI outperformed all baseline +models on synthetic datasets even with a training set 16-fold smaller, +underscoring its remarkable data efficiency. Furthermore, when applied to +empirical human brain data, while incorporating stochastic Stuart-Landau +oscillators into its dynamical core, it not only surpassed all baseline methods +in the reconstruction task, but also demonstrated better prediction of future +brain activity up to 15 seconds ahead. By training GOKU-UI on resting state +fMRI data, we encoded whole-brain dynamics into a latent representation, +learning an effective low-dimensional dynamical system model that could offer +insights into brain functionality and open avenues for practical applications +such as the classification of mental states or psychiatric conditions. +Ultimately, our research provides further impetus for the field of Scientific +Machine Learning, showcasing the potential for advancements when established +scientific insights are interwoven with modern machine learning. + +
+
+
+
+
+ + ♻ ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for adding multi-modal +information to an existing pipeline for continuous sign language recognition +and translation. In our procedure, we incorporate optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we use is very lightweight and, working in an +end-to-end manner, does not require a separate feature extractor for the new +modality. We have applied the changes to both sign language +recognition and translation, improving the results in each case. We +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. +
+
+ comment: This version has some errors. Our schedule is packed, so we don't + have enough time to correct it. We will share another work when we have time + to fix this +
+
+
+
+
+ + ♻ ☆ Towards provably efficient quantum algorithms for large-scale + machine-learning models + + +
+ Large machine learning models are revolutionary technologies of artificial +intelligence whose bottlenecks include huge computational expenses, power, and +time used both in the pre-training and fine-tuning process. In this work, we +show that fault-tolerant quantum computing could possibly provide provably +efficient resolutions for generic (stochastic) gradient descent algorithms, +scaling as $\mathcal{O}(T^2 \times \text{polylog}(n))$, where $n$ is the size +of the models and $T$ is the number of iterations in the training, as long as +the models are both sufficiently dissipative and sparse, with small learning +rates. Based on earlier efficient quantum algorithms for dissipative +differential equations, we find and prove that similar algorithms work for +(stochastic) gradient descent, the primary algorithm for machine learning. In +practice, we benchmark instances of large machine learning models from 7 +million to 103 million parameters. We find that, in the context of sparse +training, a quantum enhancement is possible at the early stage of learning +after model pruning, motivating a sparse parameter download and re-upload +scheme. Our work shows solidly that fault-tolerant quantum algorithms could +potentially contribute to most state-of-the-art, large-scale machine-learning +problems. + +
+
+ comment: 6+39 pages, 3+10 figures, substantial detail added +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Exploratory Learning-Aided Community Detection + in Networks with Unknown Topology CIKM 2022 + + +
+ In social networks, the discovery of community structures has received +considerable attention as a fundamental problem in various network analysis +tasks. However, due to privacy concerns or access restrictions, the network +structure is often unknown, thereby rendering established community detection +approaches ineffective without costly network topology acquisition. To tackle +this challenge, we present META-CODE, a unified framework for detecting +overlapping communities in networks with unknown topology via exploratory +learning aided by easy-to-collect node metadata. Specifically, META-CODE +consists of three iterative steps in addition to the initial network inference +step: 1) node-level community-affiliation embeddings based on graph neural +networks (GNNs) trained by our new reconstruction loss, 2) network exploration +via community-affiliation-based node queries, and 3) network inference using an +edge connectivity-based Siamese neural network model from the explored network. +Through extensive experiments on five real-world datasets including two large +networks, we demonstrated: (a) the superiority of META-CODE over benchmark +community detection methods, achieving remarkable gains up to 151.27% compared +to the best existing competitor, (b) the impact of each module in META-CODE, +(c) the effectiveness of node queries in META-CODE based on empirical +evaluations and theoretical findings, (d) the convergence of the inferred +network, and (e) the computational efficiency of META-CODE. + +
+
+ comment: 16 pages, 9 figures, 6 tables; its conference version was presented + at the ACM International Conference on Information and Knowledge Management + (CIKM 2022) +
+
+
+
+
+ + ♻ ☆ Learning solution of nonlinear constitutive material models using + physics-informed neural networks: COMM-PINN + + +
+ We applied physics-informed neural networks to solve the constitutive +relations for nonlinear, path-dependent material behavior. As a result, the +trained network not only satisfies all thermodynamic constraints but also +instantly provides information about the current material state (i.e., free +energy, stress, and the evolution of internal variables) under any given +loading scenario without requiring initial data. One advantage of this work is +that it bypasses the repetitive Newton iterations needed to solve nonlinear +equations in complex material models. Additionally, strategies are provided to +reduce the required order of derivatives for obtaining the tangent operator. The +trained model can be directly used in any finite element package (or other +numerical methods) as a user-defined material model. However, challenges remain +in the proper definition of collocation points and in integrating several +inequality constraints that become active or inactive simultaneously. We +tested this methodology on rate-independent processes such as the classical von +Mises plasticity model with a nonlinear hardening law, as well as local damage +models for interface cracking behavior with a nonlinear softening law. In order +to demonstrate the applicability of the methodology in handling complex path +dependency in a three-dimensional (3D) scenario, we tested the approach using +the equations governing a damage model for a three-dimensional interface model. +Such models are frequently employed for intergranular fracture at grain +boundaries. We have observed a perfect agreement between the results obtained +through the proposed methodology and those obtained using the classical +approach. Furthermore, the proposed approach requires significantly less effort +in terms of implementation and computing time compared to the traditional +methods. +
+
+
+
+
+ + ♻ ☆ Error Scaling Laws for Kernel Classification under Source and Capacity + Conditions + + +
+ We consider the problem of kernel classification. While worst-case bounds on +the decay rate of the prediction error with the number of samples are known for +some classifiers, they often fail to accurately describe the learning curves of +real data sets. In this work, we consider the important class of data sets +satisfying the standard source and capacity conditions, comprising a number of +real data sets as we show numerically. Under the Gaussian design, we derive the +decay rates for the misclassification (prediction) error as a function of the +source and capacity coefficients. We do so for two standard kernel +classification settings, namely margin-maximizing Support Vector Machines (SVM) +and ridge classification, and contrast the two methods. We find that our rates +tightly describe the learning curves for this class of data sets, and are also +observed on real data. Our results can also be seen as an explicit prediction +of the exponents of a scaling law for kernel classification that is accurate on +some real datasets. + +
+
+
+
+
+ + ♻ ☆ Unifying Synergies between Self-supervised Learning and Dynamic + Computation + + +
+ Computationally expensive training strategies make self-supervised learning +(SSL) impractical for resource-constrained industrial settings. Techniques like +knowledge distillation (KD), dynamic computation (DC), and pruning are often +used to obtain a lightweight model, which usually involves multiple epochs of +fine-tuning (or distillation steps) of a large pre-trained model, making it more +computationally challenging. In this work, we present a novel perspective on the +interplay between the SSL and DC paradigms. In particular, we show that it is +feasible to simultaneously learn a dense and a gated sub-network from scratch in +an SSL setting without any additional fine-tuning or pruning steps. The +co-evolution during pre-training of both the dense and the gated encoder offers a good +accuracy-efficiency trade-off and therefore yields a generic and multi-purpose +architecture for application-specific industrial settings. Extensive +experiments on several image classification benchmarks, including CIFAR-10/100, +STL-10 and ImageNet-100, demonstrate that the proposed training strategy +provides a dense and a corresponding gated sub-network that achieve on-par +performance compared with the vanilla self-supervised setting, but at a +significant reduction in computation in terms of FLOPs, under a range of target +budgets ($t_d$). +
+
+
+
+
+ + ♻ ☆ Estimating Gibbs free energies via isobaric-isothermal flows + + +
+ We present a machine-learning model based on normalizing flows that is +trained to sample from the isobaric-isothermal ensemble. In our approach, we +approximate the joint distribution of a fully-flexible triclinic simulation box +and particle coordinates to achieve a desired internal pressure. This novel +extension of flow-based sampling to the isobaric-isothermal ensemble yields +direct estimates of Gibbs free energies. We test our NPT-flow on monatomic +water in the cubic and hexagonal ice phases and find excellent agreement of +Gibbs free energies and other observables compared with established baselines. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Efficient Query-Based Attack against ML-Based Android Malware Detection + under Zero Knowledge Setting + + +
+ The widespread adoption of the Android operating system has made malicious +Android applications an appealing target for attackers. Machine learning-based +(ML-based) Android malware detection (AMD) methods are crucial in addressing +this problem; however, their vulnerability to adversarial examples raises +concerns. Current attacks against ML-based AMD methods demonstrate remarkable +performance but rely on strong assumptions that may not be realistic in +real-world scenarios, e.g., knowledge requirements about the feature space, +model parameters, and training dataset. To address this limitation, we +introduce AdvDroidZero, an efficient query-based attack framework against +ML-based AMD methods that operates under the zero knowledge setting. Our +extensive evaluation shows that AdvDroidZero is effective against various +mainstream ML-based AMD methods, in particular state-of-the-art methods +and real-world antivirus solutions. +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November, 2023 +
+
+
+
+
+ + ♻ ☆ On Optimal Regularization Parameters via Bilevel Learning + + +
+ Variational regularization is commonly used to solve linear inverse problems, +and involves augmenting a data fidelity by a regularizer. The regularizer is +used to promote a priori information and is weighted by a regularization +parameter. Selection of an appropriate regularization parameter is critical, +with various choices leading to very different reconstructions. Classical +strategies used to determine a suitable parameter value include the discrepancy +principle and the L-curve criterion, and in recent years a supervised machine +learning approach called bilevel learning has been employed. Bilevel learning +is a powerful framework to determine optimal parameters and involves solving a +nested optimization problem. While previous strategies enjoy various +theoretical results, the well-posedness of bilevel learning in this setting is +still an open question. In particular, a necessary property is positivity of +the determined regularization parameter. In this work, we provide a new +condition that better characterizes positivity of optimal regularization +parameters than the existing theory. Numerical results verify and explore this +new condition for both small and high-dimensional problems. + +
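A minimal sketch of the bilevel idea for a single Tikhonov regularization parameter is given below: the lower-level problem is solved in closed form, and the upper-level gradient with respect to the parameter is obtained by automatic differentiation. The linear operator, training data, and log-parameterization (which keeps the parameter positive) are illustrative assumptions, not the paper's setup.

```python
# Sketch of bilevel learning of one regularization parameter for Tikhonov-regularized least squares.
import torch

torch.manual_seed(0)
m, d, n_train = 40, 20, 16
A = torch.randn(m, d)                                  # toy forward operator
x_star = torch.randn(n_train, d)                       # ground-truth training signals
y = x_star @ A.T + 0.1 * torch.randn(n_train, m)       # noisy measurements

log_alpha = torch.zeros(1, requires_grad=True)         # log-parameterization keeps alpha > 0
opt = torch.optim.Adam([log_alpha], lr=5e-2)

def lower_level(y, alpha):
    # x(alpha) = argmin_x ||A x - y||^2 + alpha * ||x||^2  (closed form)
    H = A.T @ A + alpha * torch.eye(d)
    return torch.linalg.solve(H, (y @ A).T).T

for step in range(300):
    alpha = log_alpha.exp()
    x_hat = lower_level(y, alpha)
    upper_loss = ((x_hat - x_star) ** 2).mean()        # supervised reconstruction risk
    opt.zero_grad(); upper_loss.backward(); opt.step()

print(f"learned alpha = {log_alpha.exp().item():.4f}")
```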
+
+ comment: 32 pages, 11 figures. Restructured and streamlined proof. Provided + more numerical results +
+
+
+
+
+ + ♻ ☆ Listen to Minority: Encrypted Traffic Classification for Class Imbalance + with Contrastive Pre-Training + + +
+ Mobile Internet has profoundly reshaped modern lifestyles in various aspects. +Encrypted Traffic Classification (ETC) naturally plays a crucial role in +managing mobile Internet, especially with the explosive growth of mobile apps +using encrypted communication. Despite some existing learning-based ETC methods +showing promising results, three-fold limitations still remain in real-world +network environments: 1) label bias caused by traffic class imbalance, 2) +traffic homogeneity caused by component sharing, and 3) training with reliance +on sufficient labeled traffic. None of the existing ETC methods can address all +these limitations. In this paper, we propose a novel Pre-trAining +Semi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the +original training dataset and perform contrastive pre-training without using +individual app labels directly to avoid label bias issues caused by class +imbalance, while obtaining a robust feature representation to differentiate +overlapping homogeneous traffic by pulling positive traffic pairs closer and +pushing negative pairs away. Meanwhile, PASS designs a semi-supervised +optimization strategy based on pseudo-label iteration and dynamic loss +weighting algorithms in order to effectively utilize massive unlabeled traffic +data and alleviate the manual annotation workload for the training dataset. PASS outperforms +state-of-the-art ETC methods and generic sampling approaches on four public +datasets with significant class imbalance and traffic homogeneity, remarkably +improving the F1 score by 1.31% on Cross-Platform215 and by 9.12% on ISCX-17. +Furthermore, we validate the generality of the contrastive pre-training and +pseudo-label iteration components of PASS, which can adaptively benefit ETC +methods with diverse feature extractors. +
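The contrastive pre-training step can be illustrated with a generic NT-Xent objective that pulls two views of the same flow together and pushes other flows apart; this is a standard formulation, not the PASS code, and the random embeddings below merely stand in for an encoder's output.

```python
# Sketch of a generic NT-Xent contrastive objective for pre-training.
import torch
import torch.nn.functional as F

def nt_xent(z1, z2, temperature=0.5):
    """z1, z2: (N, d) embeddings of two augmented views of the same N samples."""
    N = z1.size(0)
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)            # (2N, d)
    sim = z @ z.T / temperature                                   # scaled cosine similarities
    mask = torch.eye(2 * N, dtype=torch.bool, device=z.device)
    sim = sim.masked_fill(mask, float("-inf"))                    # exclude self-pairs
    targets = torch.cat([torch.arange(N, 2 * N), torch.arange(0, N)])
    return F.cross_entropy(sim, targets)                          # positive = the other view

# Toy usage with random "embeddings" standing in for an encoder's output.
z1, z2 = torch.randn(8, 32), torch.randn(8, 32)
print(nt_xent(z1, z2).item())
```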
+
+ comment: Accepted by 2023 20th Annual IEEE International Conference on + Sensing, Communication, and Networking, 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Fake detection in imbalance dataset by Semi-supervised learning with GAN + + +
+ As social media grows, harassment becomes more prevalent, which has made +fake-account detection a fascinating field for researchers. The graph +nature of the data, with its large number of nodes, poses several obstacles, +including a considerable number of unrelated features in highly dispersed +matrices and imbalanced classes in the dataset. To deal with these issues, +auto-encoders and a combination of semi-supervised learning and the GAN +algorithm, called SGAN, are used. This paper deploys a small +number of labels and applies SGAN as a classifier. The results of this test +show that the accuracy reached 91\% in detecting fake accounts using only +100 labeled samples. +
+
+ comment: have a more complete script in this subject +
+
+
+
+
+ + ♻ ☆ An Informative Path Planning Framework for Active Learning in UAV-based + Semantic Mapping + + +
+ Unmanned aerial vehicles (UAVs) are frequently used for aerial mapping and +general monitoring tasks. Recent progress in deep learning enabled automated +semantic segmentation of imagery to facilitate the interpretation of +large-scale complex environments. Commonly used supervised deep learning for +segmentation relies on large amounts of pixel-wise labelled data, which is +tedious and costly to annotate. The domain-specific visual appearance of aerial +environments often prevents the usage of models pre-trained on publicly +available datasets. To address this, we propose a novel general planning +framework for UAVs to autonomously acquire informative training images for +model re-training. We leverage multiple acquisition functions and fuse them +into probabilistic terrain maps. Our framework combines the mapped acquisition +function information into the UAV's planning objectives. In this way, the UAV +adaptively acquires informative aerial images to be manually labelled for model +re-training. Experimental results on real-world data and in a photorealistic +simulation show that our framework maximises model performance and drastically +reduces labelling efforts. Our map-based planners outperform state-of-the-art +local planning. + +
+
+ comment: 18 pages, 24 figures +
+
+
+
+
+ + ♻ ☆ Deep learning based ECG segmentation for delineation of diverse + arrhythmias + + +
+ Accurate delineation of key waveforms in an ECG is a critical initial step in +extracting relevant features to support the diagnosis and treatment of heart +conditions. Although deep learning based methods using a segmentation model to +locate the P, QRS, and T waves have shown promising results, their ability to +handle signals exhibiting arrhythmia remains unclear. This study builds on +existing research by introducing a U-Net-like segmentation model for ECG +delineation, with a particular focus on diverse arrhythmias. For this purpose, +we curate an internal dataset containing waveform boundary annotations for +various arrhythmia types to train and validate our model. Our key contributions +include identifying segmentation model failures in different arrhythmia types, +developing a robust model using a diverse training set, achieving comparable +performance on benchmark datasets, and introducing a classification guided +strategy to reduce false P wave predictions for specific arrhythmias. This +study advances deep learning based ECG delineation in the context of +arrhythmias and highlights its challenges. + +
+
+
+
+
+ + ♻ ☆ An Empirical Analysis for Zero-Shot Multi-Label Classification on + COVID-19 CT Scans and Uncurated Reports ICCV + + +
+ The pandemic resulted in vast repositories of unstructured data, including +radiology reports, due to increased medical examinations. Previous research on +automated diagnosis of COVID-19 primarily focuses on X-ray images, despite +their lower precision compared to computed tomography (CT) scans. In this work, +we leverage unstructured data from a hospital and harness the fine-grained +details offered by CT scans to perform zero-shot multi-label classification +based on contrastive visual language learning. In collaboration with human +experts, we investigate the effectiveness of multiple zero-shot models that aid +radiologists in detecting pulmonary embolisms and identifying intricate lung +details like ground glass opacities and consolidations. Our empirical analysis +provides an overview of the possible solutions to target such fine-grained +tasks, so far overlooked in the medical multimodal pretraining literature. Our +investigation promises future advancements in the medical image analysis +community by addressing some challenges associated with unstructured data and +fine-grained multi-label classification. + +
+
+ comment: Proceedings of the IEEE/CVF International Conference on Computer + Vision (ICCV) Workshops 2023 +
+
+
+
+
+ + ♻ ☆ Benchmarks for Detecting Measurement Tampering + + +
+ When training powerful AI systems to perform complex tasks, it may be +challenging to provide training signals which are robust to optimization. One +concern is \textit{measurement tampering}, where the AI system manipulates +multiple measurements to create the illusion of good results instead of +achieving the desired outcome. In this work, we build four new text-based +datasets to evaluate measurement tampering detection techniques on large +language models. Concretely, given sets of text inputs and measurements aimed +at determining if some outcome occurred, as well as a base model able to +accurately predict measurements, the goal is to determine if examples where all +measurements indicate the outcome occurred actually had the outcome occur, or +if this was caused by measurement tampering. We demonstrate techniques that +outperform simple baselines on most datasets, but don't achieve maximum +performance. We believe there is significant room for improvement for both +techniques and datasets, and we are excited for future work tackling +measurement tampering. + +
+
+ comment: Edit: extended and improved appendices, fixed references +
+
+
+
+
+ + ♻ ☆ An XAI framework for robust and transparent data-driven wind turbine + power curve models + + +
+ Wind turbine power curve models translate ambient conditions into turbine +power output. They are essential for energy yield prediction and turbine +performance monitoring. In recent years, increasingly complex machine learning +methods have become state-of-the-art for this task. Nevertheless, they +frequently encounter criticism due to their apparent lack of transparency, +which raises concerns regarding their performance in non-stationary +environments, such as those faced by wind turbines. We, therefore, introduce an +explainable artificial intelligence (XAI) framework to investigate and validate +strategies learned by data-driven power curve models from operational wind +turbine data. With the help of simple, physics-informed baseline models it +enables an automated evaluation of machine learning models beyond standard +error metrics. Alongside this novel tool, we present its efficacy for a more +informed model selection. We show, for instance, that learned strategies can be +meaningful indicators for a model's generalization ability in addition to test +set errors, especially when only little data is available. Moreover, the +approach facilitates an understanding of how decisions along the machine +learning pipeline, such as data selection, pre-processing, or training +parameters, affect learned strategies. In a practical example, we demonstrate +the framework's utilisation to obtain more physically meaningful models, a +prerequisite not only for robustness but also for insights into turbine +operation by domain experts. The latter, we demonstrate in the context of wind +turbine performance monitoring. Alongside this paper, we publish a Python +implementation of the presented framework and hope this can guide researchers +and practitioners alike toward training, selecting and utilizing more +transparent and robust data-driven wind turbine power curve models. + +
+
+ comment: 42 pages, 16 figures, revised version +
+
+
+
+
+ + ♻ ☆ Dynamic Graph Convolutional Network with Attention Fusion for Traffic + Flow Prediction ECAI 2023 + + +
+ Accurate and real-time traffic state prediction is of great practical +importance for urban traffic control and web mapping services. With the support +of massive data, deep learning methods have shown their powerful capability in +capturing the complex spatial-temporal patterns of traffic networks. However, +existing approaches use pre-defined graphs and a simple set of spatial-temporal +components, making it difficult to model multi-scale spatial-temporal +dependencies. In this paper, we propose a novel dynamic graph convolution +network with attention fusion to tackle this gap. The method first enhances the +interaction of temporal feature dimensions, and then it combines a dynamic +graph learner with a GRU to jointly model synchronous spatial-temporal +correlations. We also incorporate spatial-temporal attention modules to +effectively capture long-range, multifaceted domain spatial-temporal patterns. +We conduct extensive experiments on four real-world traffic datasets to +demonstrate that our method achieves state-of-the-art performance compared to +18 baseline methods. +
+
+ comment: 8 pages, 5 figure, accepted by ECAI 2023 +
+
+
+
+
+ + ♻ ☆ TSMixer: An All-MLP Architecture for Time Series Forecasting + + +
+ Real-world time-series datasets are often multivariate with complex dynamics. +To capture this complexity, high capacity architectures like recurrent- or +attention-based sequential deep learning models have become popular. However, +recent work demonstrates that simple univariate linear models can outperform +such deep learning models on several commonly used academic benchmarks. +Extending them, in this paper, we investigate the capabilities of linear models +for time-series forecasting and present Time-Series Mixer (TSMixer), a novel +architecture designed by stacking multi-layer perceptrons (MLPs). TSMixer is +based on mixing operations along both the time and feature dimensions to +extract information efficiently. On popular academic benchmarks, the +simple-to-implement TSMixer is comparable to specialized state-of-the-art +models that leverage the inductive biases of specific benchmarks. On the +challenging and large scale M5 benchmark, a real-world retail dataset, TSMixer +demonstrates superior performance compared to the state-of-the-art +alternatives. Our results underline the importance of efficiently utilizing +cross-variate and auxiliary information for improving the performance of time +series forecasting. We present various analyses to shed light into the +capabilities of TSMixer. The design paradigms utilized in TSMixer are expected +to open new horizons for deep learning-based time series forecasting. The +implementation is available at +https://github.com/google-research/google-research/tree/master/tsmixer + +
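A minimal sketch of the mixing idea follows (not the released TSMixer implementation): one block applies an MLP across the time dimension and another across the feature dimension, each with a residual connection; layer sizes and normalization choices are arbitrary assumptions.

```python
# Sketch of a time/feature mixing block in the spirit of TSMixer.
import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    def __init__(self, seq_len: int, n_features: int, hidden: int = 64):
        super().__init__()
        self.time_mlp = nn.Sequential(nn.Linear(seq_len, hidden), nn.ReLU(), nn.Linear(hidden, seq_len))
        self.feat_mlp = nn.Sequential(nn.Linear(n_features, hidden), nn.ReLU(), nn.Linear(hidden, n_features))
        self.norm1 = nn.LayerNorm(n_features)
        self.norm2 = nn.LayerNorm(n_features)

    def forward(self, x):                                # x: (batch, seq_len, n_features)
        y = self.norm1(x).transpose(1, 2)                # mix along time: (batch, n_features, seq_len)
        x = x + self.time_mlp(y).transpose(1, 2)         # residual time-mixing
        x = x + self.feat_mlp(self.norm2(x))             # residual feature-mixing
        return x

x = torch.randn(4, 96, 7)                                # 96 past steps, 7 variates
block = MixerBlock(seq_len=96, n_features=7)
print(block(x).shape)                                    # torch.Size([4, 96, 7])
```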
+
+
+
+
+ + ♻ ☆ AnoOnly: Semi-Supervised Anomaly Detection with the Only Loss on + Anomalies + + +
+ Semi-supervised anomaly detection (SSAD) methods have demonstrated their +effectiveness in enhancing unsupervised anomaly detection (UAD) by leveraging +few-shot but instructive abnormal instances. However, the dominance of +homogeneous normal data over anomalies biases the SSAD models against +effectively perceiving anomalies. To address this issue and achieve balanced +supervision between heavily imbalanced normal and abnormal data, we develop a +novel framework called AnoOnly (Anomaly Only). Unlike existing SSAD methods +that resort to strict loss supervision, AnoOnly suspends it and introduces a +form of weak supervision for normal data. This weak supervision is instantiated +through the utilization of batch normalization, which implicitly performs +cluster learning on normal data. When integrated into existing SSAD methods, +the proposed AnoOnly demonstrates remarkable performance enhancements across +various models and datasets, achieving new state-of-the-art performance. +Additionally, our AnoOnly is natively robust to label noise when suffering from +data contamination. Our code is publicly available at +https://github.com/cool-xuan/AnoOnly. + +
+
+
+
+
+ + ♻ ☆ Grounding Large Language Models in Interactive Environments with Online + Reinforcement Learning + + +
+ Recent works successfully leveraged Large Language Models' (LLM) abilities to +capture abstract knowledge about the world's physics to solve decision-making +problems. Yet, the alignment between LLMs' knowledge and the environment can be +wrong and limit functional competence due to a lack of grounding. In this paper, +we study an approach (named GLAM) to achieve this alignment through functional +grounding: we consider an agent using an LLM as a policy that is progressively +updated as the agent interacts with the environment, leveraging online +Reinforcement Learning to improve its performance at solving goals. Using an +interactive textual environment designed to study higher-level forms of +functional grounding, and a set of spatial and navigation tasks, we study +several scientific questions: 1) Can LLMs boost sample efficiency for online +learning of various RL tasks? 2) How can this approach boost different forms of +generalization? 3) What is the impact of online learning? We study these +questions by functionally grounding several variants (size, architecture) of +FLAN-T5. +
+
+
+
+
+ + ♻ ☆ StratMed: Relevance Stratification for Low-resource Medication + Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating +demands, AI-based clinical tasks have become paramount. Medication +recommendation, as a sub-domain, aims to amalgamate longitudinal patient +history with medical knowledge, assisting physicians in prescribing safer and +more accurate medication combinations. Existing methods overlook the inherent +long-tail distribution in medical data, lacking balanced representation between +head and tail data, which leads to sub-optimal model performance. To address +this challenge, we introduce StratMed, a model that incorporates an innovative +relevance stratification mechanism. It harmonizes discrepancies in data +long-tail distribution and strikes a balance between the safety and accuracy of +medication combinations. Specifically, we first construct a pre-training method +using deep learning networks to obtain entity representation. After that, we +design a pyramid-like data stratification method to obtain more generalized +entity relationships by reinforcing the features of unpopular entities. Based +on this relationship, we designed two graph structures to express medication +precision and safety at the same level to obtain visit representations. +Finally, the patient's historical clinical information is fitted to generate +medication combinations for the current health condition. Experiments on the +MIMIC-III dataset demonstrate that our method has outperformed current +state-of-the-art methods in four evaluation metrics (including safety and +accuracy). + +
+
+
+
+
+ + ♻ ☆ An Efficient 1 Iteration Learning Algorithm for Gaussian Mixture Model + And Gaussian Mixture Embedding For Neural Network + + +
+ We propose a Gaussian Mixture Model (GMM) learning algorithm based on our +previous work on the GMM expansion idea. The new algorithm brings more robustness +and simplicity than the classic Expectation-Maximization (EM) algorithm. It also +improves the accuracy and takes only 1 iteration for learning. We theoretically +prove that this new algorithm is guaranteed to converge regardless of the +parameter initialisation. Comparing our GMM expansion method with classic +probability layers in neural networks shows a demonstrably better capability to +overcome data uncertainty and inverse problems. Finally, we test a GMM-based +generator, which shows the potential to build further applications that are able to +utilize random sampling from the learned distribution for stochastic variation as well as +variation control. +
+
+
+
+
+ + ♻ ☆ Kernel Random Projection Depth for Outlier Detection SC 2023 + + +
+ This paper proposes an extension of Random Projection Depth (RPD) to cope +with multiple modalities and non-convexity on data clouds. In the framework of +the proposed method, the RPD is computed in a reproducing kernel Hilbert space. +With the help of kernel principal component analysis, we expect that the +proposed method can cope with the above multiple modalities and non-convexity. +The experimental results demonstrate that the proposed method outperforms RPD +and is comparable to other existing detection models on benchmark datasets +regarding Area Under the Curves (AUCs) of Receiver Operating Characteristic +(ROC). + +
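For reference, the Euclidean-space Random Projection Depth that the paper lifts into a reproducing kernel Hilbert space can be sketched as follows; the number of projection directions and the toy data are illustrative assumptions.

```python
# Sketch of plain Random Projection Depth (the Euclidean-space baseline):
# outlyingness is the worst robustly-standardized deviation over random directions.
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 2))                      # reference data cloud
queries = np.array([[0.0, 0.0], [4.0, 4.0]])           # an inlier and an outlier

def rpd(x, X, n_dirs=500, rng=rng):
    U = rng.standard_normal((n_dirs, X.shape[1]))
    U /= np.linalg.norm(U, axis=1, keepdims=True)      # random unit directions
    proj_X, proj_x = X @ U.T, x @ U.T                  # 1-D projections
    med = np.median(proj_X, axis=0)
    mad = np.median(np.abs(proj_X - med), axis=0) + 1e-12
    outlyingness = np.max(np.abs(proj_x - med) / mad, axis=-1)
    return 1.0 / (1.0 + outlyingness)                  # depth: small values flag outliers

print(rpd(queries, X))                                 # the inlier gets the larger (deeper) value
```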
+
+ comment: Accepted to APSIPA ASC 2023 +
+
+
+
+
+ + ♻ ☆ Transferable Time-Series Forecasting under Causal Conditional Shift + + +
+ This paper focuses on the problem of semi-supervised domain adaptation for +time-series forecasting, which is underexplored in the literature despite being +often encountered in practice. Existing methods on time-series domain +adaptation mainly follow the paradigm designed for static data, which +cannot handle domain-specific complex conditional dependencies arising from data +offset, time lags, and varying data distributions. In order to address these +challenges, we analyze variational conditional dependencies in time-series data +and find that the causal structures are usually stable among domains, and +further propose the causal conditional shift assumption. Guided by this +assumption, we consider the causal generation process for time-series data and +propose an end-to-end model for the semi-supervised domain adaptation problem +on time-series forecasting. Our method can not only discover the Granger-Causal +structures among cross-domain data but also address the cross-domain +time-series forecasting problem with accurate and interpretable predicted +results. We further theoretically analyze the superiority of the proposed +method, where the generalization error on the target domain is bounded by the +empirical risks and by the discrepancy between the causal structures from +different domains. Experimental results on both synthetic and real data +demonstrate the effectiveness of our method for semi-supervised domain +adaptation on time-series forecasting. +
+
+ comment: TPAMI2023 Accepted +
+
+
+
+
+ + ♻ ☆ CONFIDERAI: a novel CONFormal Interpretable-by-Design score function for + Explainable and Reliable Artificial Intelligence + + +
+ Everyday life is increasingly influenced by artificial intelligence, and +there is no question that machine learning algorithms must be designed to be +reliable and trustworthy for everyone. Specifically, computer scientists +consider an artificial intelligence system safe and trustworthy if it fulfills +five pillars: explainability, robustness, transparency, fairness, and privacy. +In addition to these five, we propose a sixth fundamental aspect: conformity, +that is, the probabilistic assurance that the system will behave as the machine +learner expects. In this paper, we propose a methodology to link conformal +prediction with explainable machine learning by defining CONFIDERAI, a new +score function for rule-based models that leverages both the predictive +ability of rules and the geometrical position of points within rule boundaries. We also +address the problem of defining regions in the feature space where conformal +guarantees are satisfied by exploiting techniques to control the number of +non-conformal samples in conformal regions based on support vector data +description (SVDD). The overall methodology is tested with promising results on +benchmark and real datasets, such as DNS tunneling detection or cardiovascular +disease prediction. +
+
+ comment: 12 pages, 7 figures, 1 algorithm, international journal +
+
+
+
+
+ + ♻ ☆ Understanding convolution on graphs via energies + + +
+ Graph Neural Networks (GNNs) typically operate by message-passing, where the +state of a node is updated based on the information received from its +neighbours. Most message-passing models act as graph convolutions, where +features are mixed by a shared, linear transformation before being propagated +over the edges. On node-classification tasks, graph convolutions have been +shown to suffer from two limitations: poor performance on heterophilic graphs, +and over-smoothing. It is a common belief that both phenomena occur because such +models behave as low-pass filters, meaning that the Dirichlet energy of the +features decreases along the layers, incurring a smoothing effect that +ultimately makes features no longer distinguishable. In this work, we +rigorously prove that simple graph-convolutional models can actually enhance +high frequencies and even lead to an asymptotic behaviour we refer to as +over-sharpening, opposite to over-smoothing. We do so by showing that linear +graph convolutions with symmetric weights minimize a multi-particle energy that +generalizes the Dirichlet energy; in this setting, the weight matrices induce +edge-wise attraction (repulsion) through their positive (negative) eigenvalues, +thereby controlling whether the features are being smoothed or sharpened. We +also extend the analysis to non-linear GNNs, and demonstrate that some existing +time-continuous GNNs are instead always dominated by the low frequencies. +Finally, we validate our theoretical findings through ablations and real-world +experiments. +
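The quantity at the center of this analysis can be sketched directly: the snippet below tracks the degree-normalized Dirichlet energy of node features across layers of a linear graph convolution with a symmetric channel-mixing matrix. The graph, features, and weights are random placeholders; depending on the eigenvalues of the weight matrix, the energy may shrink (smoothing) or grow (sharpening).

```python
# Sketch: track the normalized Dirichlet energy across linear graph-convolution layers.
import numpy as np

rng = np.random.default_rng(0)
n, d = 30, 8
A = (rng.random((n, n)) < 0.1).astype(float)
A = np.triu(A, 1); A = A + A.T                         # symmetric adjacency, no self-loops
deg = np.maximum(A.sum(1), 1.0)
A_hat = A / np.sqrt(np.outer(deg, deg))                # symmetrically normalized adjacency

def dirichlet_energy(X, A, deg):
    Xn = X / np.sqrt(deg)[:, None]                     # degree-normalized features
    diff = Xn[:, None, :] - Xn[None, :, :]
    return 0.5 * np.sum(A[..., None] * diff ** 2)

W = rng.standard_normal((d, d)); W = 0.5 * (W + W.T)   # symmetric channel-mixing weights
X = rng.standard_normal((n, d))
for layer in range(5):
    print(f"layer {layer}: energy = {dirichlet_energy(X, A, deg):.3f}")
    X = A_hat @ X @ W                                  # linear graph convolution
```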
+
+ comment: Accepted at TMLR; First two authors equal contribution; 35 pages +
+
+
+
+
+ + ♻ ☆ Graph Convolutional Neural Networks with Diverse Negative Samples via + Decomposed Determinant Point Processes + + +
+ Graph convolutional networks (GCNs) have achieved great success in graph +representation learning by extracting high-level features from nodes and their +topology. Since GCNs generally follow a message-passing mechanism, each node +aggregates information from its first-order neighbours to update its +representation. As a result, the representations of nodes with edges between +them should be positively correlated and thus can be considered positive +samples. However, there are more non-neighbour nodes in the whole graph, which +provide diverse and useful information for the representation update. Two +non-adjacent nodes usually have different representations, which can be seen as +negative samples. Besides the node representations, the structural information +of the graph is also crucial for learning. In this paper, we use +quality-diversity decomposition in determinant point processes (DPP) to obtain +diverse negative samples. When defining a distribution on diverse subsets of +all non-neighbouring nodes, we incorporate both graph structure information and +node representations. Since the DPP sampling process requires matrix eigenvalue +decomposition, we propose a new shortest-path-based method to improve +computational efficiency. Finally, we incorporate the obtained negative samples +into the graph convolution operation. The ideas are evaluated empirically in +experiments on node classification tasks. These experiments show that the newly +proposed methods not only improve the overall performance of standard +representation learning but also significantly alleviate over-smoothing +problems. +
+
+ comment: Accepted by IEEE TNNLS on 30-Aug-2023. arXiv admin note: text overlap + with arXiv:2210.00728 +
+
+
+
+
+ + ♻ ☆ DoRA: Domain-Based Self-Supervised Learning Framework for Low-Resource + Real Estate Appraisal CIKM 2023 + + +
+ The marketplace system connecting demands and supplies has been explored to +develop unbiased decision-making in valuing properties. Real estate appraisal +serves as one of the high-cost property valuation tasks for financial +institutions since it requires domain experts to appraise the estimation based +on the corresponding knowledge and the judgment of the market. Existing +automated valuation models reducing the subjectivity of domain experts require +a large number of transactions for effective evaluation, which is predominantly +limited to not only the labeling efforts of transactions but also the +generalizability of new developing and rural areas. To learn representations +from unlabeled real estate sets, existing self-supervised learning (SSL) for +tabular data neglects various important features, and fails to incorporate +domain knowledge. In this paper, we propose DoRA, a Domain-based +self-supervised learning framework for low-resource Real estate Appraisal. DoRA +is pre-trained with an intra-sample geographic prediction as the pretext task +based on the metadata of the real estate for equipping the real estate +representations with prior domain knowledge. Furthermore, inter-sample +contrastive learning is employed to generalize the representations to be robust +for limited transactions of downstream tasks. Our benchmark results on three +property types of real-world transactions show that DoRA significantly +outperforms the SSL baselines for tabular data, the graph-based methods, and +the supervised approaches in the few-shot scenarios by at least 7.6% for MAPE, +11.59% for MAE, and 3.34% for HR10%. We expect DoRA to be useful to other +financial practitioners with similar marketplace applications who need general +models for properties that are newly built and have limited records. The source +code is available at https://github.com/wwweiwei/DoRA. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ LUT-NN: Empower Efficient Neural Network Inference with Centroid + Learning and Table Lookup + + +
+ On-device Deep Neural Network (DNN) inference consumes significant computing +resources and development effort. To alleviate that, we propose LUT-NN, the +first system to empower inference by table lookup, to reduce inference cost. +LUT-NN learns the typical features for each operator, named centroids, and +precomputes the results for these centroids to store in lookup tables. During +inference, the results of the centroids closest to the inputs can be read +directly from the table as the approximated outputs, without computation. +LUT-NN integrates two major novel techniques: (1) differentiable centroid +learning through backpropagation, which adapts three levels of approximation to +minimize the accuracy impact of centroids; (2) table lookup inference +execution, which comprehensively considers different levels of parallelism, +memory access reduction, and dedicated hardware units for optimal performance. +LUT-NN is evaluated on multiple real tasks, covering image and speech +recognition and natural language processing. Compared to related work, LUT-NN +improves accuracy by 66% to 92%, achieving a similar level to the original +models. LUT-NN reduces the cost in all dimensions, including FLOPs ($\leq$ +16x), model size ($\leq$ 7x), latency ($\leq$ 6.8x), memory ($\leq$ 6.5x), and +power ($\leq$ 41.7%). +
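The table-lookup idea can be illustrated for a single linear layer with a crude sketch (not the LUT-NN system, which learns centroids differentiably): sub-vectors of the input are assigned to centroids learned by k-means, and the matrix multiplication is replaced by summing precomputed partial products. Sizes and the k-means routine are illustrative assumptions.

```python
# Sketch: approximate x @ W by nearest-centroid table lookups over input sub-vectors.
import numpy as np

rng = np.random.default_rng(0)
d_in, d_out, n_sub, n_centroids = 64, 32, 8, 16
sub = d_in // n_sub
W = 0.1 * rng.standard_normal((d_in, d_out))
X_train = rng.standard_normal((5000, d_in))            # data used to learn centroids

def kmeans(Z, k, iters=25):
    """Plain Lloyd's algorithm; returns (k, dim) centroids."""
    C = Z[rng.choice(len(Z), k, replace=False)].copy()
    for _ in range(iters):
        assign = ((Z[:, None, :] - C[None]) ** 2).sum(-1).argmin(1)
        for j in range(k):
            if np.any(assign == j):
                C[j] = Z[assign == j].mean(0)
    return C

centroids, tables = [], []
for s in range(n_sub):
    C = kmeans(X_train[:, s * sub:(s + 1) * sub], n_centroids)
    centroids.append(C)
    tables.append(C @ W[s * sub:(s + 1) * sub, :])      # precomputed partial outputs

def lut_forward(x):
    out = np.zeros(d_out)
    for s in range(n_sub):
        xs = x[s * sub:(s + 1) * sub]
        k = ((centroids[s] - xs) ** 2).sum(1).argmin()  # nearest-centroid assignment
        out += tables[s][k]                             # table lookup instead of matmul
    return out

x = rng.standard_normal(d_in)
print(np.linalg.norm(lut_forward(x) - x @ W))           # approximation error vs. exact matmul
```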
+
+
+
+
+ + ♻ ☆ Recent Advances and Applications of Machine Learning in Experimental + Solid Mechanics: A Review + + +
+ For many decades, experimental solid mechanics has played a crucial role in +characterizing and understanding the mechanical properties of natural and novel +materials. Recent advances in machine learning (ML) provide new opportunities +for the field, including experimental design, data analysis, uncertainty +quantification, and inverse problems. As the number of papers published in +recent years in this emerging field is exploding, it is timely to conduct a +comprehensive and up-to-date review of recent ML applications in experimental +solid mechanics. Here, we first provide an overview of common ML algorithms and +terminologies that are pertinent to this review, with emphasis placed on +physics-informed and physics-based ML methods. Then, we provide thorough +coverage of recent ML applications in traditional and emerging areas of +experimental mechanics, including fracture mechanics, biomechanics, nano- and +micro-mechanics, architected materials, and 2D materials. Finally, we highlight +some current challenges of applying ML to multi-modality and multi-fidelity +experimental datasets and propose several future research directions. This +review aims to provide valuable insights into the use of ML methods as well as +a variety of examples for researchers in solid mechanics to integrate into +their experiments. +
+
+ comment: 93 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Differentiable Bayesian Structure Learning with Acyclicity Assurance ICDM 2023 + + +
+ Score-based approaches in the structure learning task are thriving because of +their scalability. Continuous relaxation has been the key reason for this +advancement. Despite achieving promising outcomes, most of these methods are +still struggling to ensure that the graphs generated from the latent space are +acyclic by minimizing a defined score. There has also been another trend of +permutation-based approaches, which concern the search for the topological +ordering of the variables in the directed acyclic graph in order to limit the +search space of the graph. In this study, we propose an alternative approach +for strictly constraining the acyclicity of the graphs by integrating +knowledge from the topological orderings. Our approach can reduce inference +complexity while ensuring that the generated graphs are acyclic. +Our empirical experiments with simulated and real-world data show that our +approach can outperform related Bayesian score-based approaches. +
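The guarantee exploited here can be shown in a few lines: if edges are only allowed from earlier to later nodes in a (sampled or learned) topological ordering, the resulting graph is acyclic by construction, so no acyclicity penalty is needed. The permutation, weights, and acyclicity check below are illustrative, not the paper's model.

```python
# Sketch: constructing a DAG by masking edge weights with a topological ordering.
import numpy as np
from scipy.linalg import expm

rng = np.random.default_rng(0)
d = 6
perm = rng.permutation(d)                              # a sampled topological ordering of the nodes
W = rng.standard_normal((d, d))                        # unconstrained edge weights

pos = np.empty(d, dtype=int)
pos[perm] = np.arange(d)                               # pos[i] = position of node i in the ordering
mask = (pos[:, None] < pos[None, :]).astype(float)     # allow edge i -> j only if i precedes j
W_dag = W * mask                                       # weighted adjacency of a DAG, by construction

# NOTEARS-style check: for a DAG, trace(exp(|W|)) equals the number of nodes.
print(np.isclose(np.trace(expm(np.abs(W_dag))), d))    # True
```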
+
+ comment: Accepted as a regular paper (9.37%) at the 23rd IEEE International + Conference on Data Mining (ICDM 2023) +
+
+
+
+
+ + ♻ ☆ MvFS: Multi-view Feature Selection for Recommender System CIKM 2023 + + +
+ Feature selection, which is a technique to select key features in recommender +systems, has received increasing research attention. Recently, Adaptive Feature +Selection (AdaFS) has shown remarkable performance by adaptively selecting +features for each data instance, considering that the importance of a given +feature field can vary significantly across data. However, this method still +has limitations in that its selection process could be easily biased to major +features that frequently occur. To address these problems, we propose +Multi-view Feature Selection (MvFS), which selects informative features for +each instance more effectively. Most importantly, MvFS employs a multi-view +network consisting of multiple sub-networks, each of which learns to measure +the feature importance of a part of data with different feature patterns. By +doing so, MvFS mitigates the bias problem towards dominant patterns and +promotes a more balanced feature selection process. Moreover, MvFS adopts an +effective importance score modeling strategy which is applied independently to +each field without incurring dependency among features. Experimental results on +real-world datasets demonstrate the effectiveness of MvFS compared to +state-of-the-art baselines. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Estimating 3D Dental Structures using Simulated Panoramic Radiographs + and Neural Ray Tracing + + +
+ Panoramic radiography (Panoramic X-ray, PX) is a widely used imaging modality +for dental examination. However, PX only provides a flattened 2D image, lacking +a 3D view of the oral structure. In this paper, we propose a framework to +estimate 3D oral structures from real-world PX. Our framework tackles full 3D +reconstruction for varying subjects (patients) where each reconstruction is +based only on a single panoramic image. We create an intermediate +representation called simulated PX (SimPX) from 3D Cone-beam computed +tomography (CBCT) data based on the Beer-Lambert law of X-ray rendering and +rotational principles of PX imaging. SimPX not only aims at truthfully +simulating PX, but also facilitates the process of reverting back to 3D data. We +propose a novel neural model based on ray tracing which exploits both global +and local input features to convert SimPX to 3D output. At inference, a real PX +image is translated to a SimPX-style image with semantic regularization, and +the translated image is processed by the generation module to produce high-quality +outputs. Experiments show that our method outperforms the prior state of the art in +reconstruction tasks both quantitatively and qualitatively. Unlike prior +methods, our method does not require any prior information, such as the shape of +dental arches, nor a matched PX-CBCT dataset for training, which is difficult +to obtain in clinical practice. +
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Clustered Federated Learning based on Nonconvex Pairwise Fusion + + +
+ This study investigates clustered federated learning (FL), one of the +formulations of FL with non-i.i.d. data, where the devices are partitioned into +clusters and each cluster optimally fits its data with a localized model. We +propose a clustered FL framework that incorporates a nonconvex penalty to +pairwise differences of parameters. This framework can automatically identify +cluster structures without a priori knowledge of the number of clusters and the +set of devices in each cluster. To implement the proposed framework, we +introduce a novel clustered FL method called Fusion Penalized Federated +Clustering (FPFC). Building upon the standard alternating direction method of +multipliers (ADMM), FPFC is implemented in parallel, updates only a subset of +devices at each communication round, and allows for variable workload per +device. These strategies significantly reduce the communication cost while +ensuring privacy, making it practical for FL. We also propose a new warmup +strategy for hyperparameter tuning in FL settings and explore the asynchronous +variant of FPFC (asyncFPFC). Theoretical analysis provides convergence +guarantees for FPFC with general nonconvex losses and establishes the +statistical convergence rate under a linear model with squared loss. Extensive +experiments demonstrate the advantages of FPFC over existing methods, including +robustness and generalization capability. + +
+
+ comment: 46 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ An Efficient Approach to Unsupervised Out-of-Distribution Detection with + Variational Autoencoders + + +
+ This paper is concerned with deep generative models (DGMs) for unsupervised +out-of-distribution (OOD) detection. In particular, we focus on vanilla +Variational Autoencoders (VAE) that use a standard normal prior distribution +for the latent variables. These models have a smaller model size, enabling +faster training and inference, making them well-suited for resource-limited +applications compared to more complex DGMs. We propose a novel OOD score called +Error Reduction (ER) specifically designed for vanilla VAEs. ER incorporates the +idea of reconstructing image inputs from their lossy counterparts and takes +into account the Kolmogorov complexity of the images. Experimental results on +diverse datasets demonstrate the superiority of our approach over baseline +methods. Our code is available at: https://github.com/ZJLAB-AMMI/VAE4OOD. +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Separable Hamiltonian Neural Networks + + +
+ The modelling of dynamical systems from discrete observations is a challenge +faced by modern scientific and engineering data systems. Hamiltonian systems +are one such fundamental and ubiquitous class of dynamical systems. Hamiltonian +neural networks are state-of-the-art models that regress, without supervision, the +Hamiltonian of a dynamical system from discrete observations of its vector +field under the learning bias of Hamilton's equations. Yet Hamiltonian dynamics +are often complicated, especially in higher dimensions where the state space of +the Hamiltonian system is large relative to the number of samples. A recently +discovered remedy to alleviate the complexity between state variables in the +state space is to leverage the additive separability of the Hamiltonian system +and embed that additive separability into the Hamiltonian neural network. +Following the nomenclature of physics-informed machine learning, we propose +three separable Hamiltonian neural networks. These models embed additive +separability within Hamiltonian neural networks. The first model uses additive +separability to quadratically scale the amount of data for training Hamiltonian +neural networks. The second model embeds additive separability within the loss +function of the Hamiltonian neural network. The third model embeds additive +separability through the architecture of the Hamiltonian neural network using +conjoined multilayer perceptrons. We empirically compare the three models +against state-of-the-art Hamiltonian neural networks, and demonstrate that the +separable Hamiltonian neural networks, which alleviate complexity between the +state variables, are more effective at regressing the Hamiltonian and its +vector field. +
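A minimal sketch of an additively separable Hamiltonian network is given below, with H(q, p) = T(p) + V(q) parameterized by two MLPs and the vector field recovered from Hamilton's equations via automatic differentiation. This follows the general idea only; the paper's three variants differ in how separability is embedded, and the harmonic-oscillator training target here is an illustrative assumption.

```python
# Sketch of an additively separable Hamiltonian neural network trained on a toy vector field.
import torch
import torch.nn as nn

class SeparableHNN(nn.Module):
    def __init__(self, dim: int, hidden: int = 64):
        super().__init__()
        self.T = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(), nn.Linear(hidden, 1))  # kinetic term T(p)
        self.V = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(), nn.Linear(hidden, 1))  # potential term V(q)

    def forward(self, q, p):
        return self.T(p) + self.V(q)                       # H(q, p) = T(p) + V(q)

    def vector_field(self, q, p):
        q = q.requires_grad_(True); p = p.requires_grad_(True)
        H = self.forward(q, p).sum()
        dHdq, dHdp = torch.autograd.grad(H, (q, p), create_graph=True)
        return dHdp, -dHdq                                 # dq/dt = dH/dp, dp/dt = -dH/dq

# Toy training target: a unit-mass harmonic oscillator, H = p^2/2 + q^2/2.
model = SeparableHNN(dim=1)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
for step in range(1000):
    q, p = torch.randn(128, 1), torch.randn(128, 1)
    dq_true, dp_true = p, -q                               # true vector field of the oscillator
    dq, dp = model.vector_field(q, p)
    loss = ((dq - dq_true) ** 2).mean() + ((dp - dp_true) ** 2).mean()
    opt.zero_grad(); loss.backward(); opt.step()
print(loss.item())
```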
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ SAN: Inducing Metrizability of GAN with Discriminative Normalized Linear + Layer + + +
+ Generative adversarial networks (GANs) learn a target probability +distribution by optimizing a generator and a discriminator with minimax +objectives. This paper addresses the question of whether such optimization +actually provides the generator with gradients that make its distribution close +to the target distribution. We derive metrizable conditions, sufficient +conditions for the discriminator to serve as the distance between the +distributions by connecting the GAN formulation with the concept of sliced +optimal transport. Furthermore, by leveraging these theoretical results, we +propose a novel GAN training scheme, called slicing adversarial network (SAN). +With only simple modifications, a broad class of existing GANs can be converted +to SANs. Experiments on synthetic and image datasets support our theoretical +results and the SAN's effectiveness as compared to usual GANs. Furthermore, we +also apply SAN to StyleGAN-XL, which leads to state-of-the-art FID score +amongst GANs for class conditional generation on ImageNet 256$\times$256. + +
+
+ comment: 24 pages with 13 figures +
+
+
+
+
+ + ♻ ☆ Optimal and Differentially Private Data Acquisition: Central and Local + Mechanisms + + +
+ We consider a platform's problem of collecting data from privacy sensitive +users to estimate an underlying parameter of interest. We formulate this +question as a Bayesian-optimal mechanism design problem, in which an individual +can share her (verifiable) data in exchange for a monetary reward or services, +but at the same time has a (private) heterogeneous privacy cost which we +quantify using differential privacy. We consider two popular differential +privacy settings for providing privacy guarantees for the users: central and +local. In both settings, we establish minimax lower bounds for the estimation +error and derive (near) optimal estimators for given heterogeneous privacy loss +levels for users. Building on this characterization, we pose the mechanism +design problem as the optimal selection of an estimator and payments that will +elicit truthful reporting of users' privacy sensitivities. Under a regularity +condition on the distribution of privacy sensitivities we develop efficient +algorithmic mechanisms to solve this problem in both privacy settings. Our +mechanism in the central setting can be implemented in time $\mathcal{O}(n \log +n)$ where $n$ is the number of users and our mechanism in the local setting +admits a Polynomial Time Approximation Scheme (PTAS). + +
+
+ comment: To appear in the Operations Research journal. The abstract appeared + in the Proceedings of the 23rd ACM Conference on Economics and Computation + (EC 2022) +
+
+
+
+
+ + ♻ ☆ Generative Action Description Prompts for Skeleton-based Action + Recognition ICCV23 + + +
+ Skeleton-based action recognition has recently received considerable +attention. Current approaches to skeleton-based action recognition are +typically formulated as one-hot classification tasks and do not fully exploit +the semantic relations between actions. For example, "make victory sign" and +"thumb up" are two actions of hand gestures, whose major difference lies in the +movement of hands. This information is not captured by the categorical one-hot +encoding of action classes but can be unveiled from the action description. +Therefore, utilizing action description in training could potentially benefit +representation learning. In this work, we propose a Generative +Action-description Prompts (GAP) approach for skeleton-based action +recognition. More specifically, we employ a pre-trained large-scale language +model as the knowledge engine to automatically generate text descriptions for +body-part movements of actions, and propose a multi-modal training scheme by +utilizing the text encoder to generate feature vectors for different body parts +and supervise the skeleton encoder for action representation learning. +Experiments show that our proposed GAP method achieves noticeable improvements +over various baseline models without extra computation cost at inference. GAP +achieves new state-of-the-art results on popular skeleton-based action recognition +benchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is +available at https://github.com/MartinXM/GAP. +
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ♻ ☆ Symbolically integrating tensor networks over various random tensors -- + the second version of Python RTNI + + +
+ We are upgrading the Python version of RTNI, which symbolically integrates +tensor networks over the Haar-distributed unitary matrices. Now, PyRTNI2 can +treat the Haar-distributed orthogonal matrices and the real and complex normal +Gaussian tensors as well. Moreover, it can export tensor networks in the format +of TensorNetwork so that one can make further calculations with concrete +tensors, even for low dimensions, where the Weingarten functions differ from +the ones for high dimensions. The tutorial notebooks can be found on GitHub: +https://github.com/MotohisaFukuda/PyRTNI2. In this paper, we explain the maths +behind the program and show what kind of tensor network calculations can be +made with it. For the former, we interpret the element-wise moment calculus of +the above random matrices and tensors in terms of tensor network diagrams, and +argue that the view is natural, relating delta functions in the calculus to +edges in tensor network diagrams. +
+
+ comment: PyRTNI2 is at https://github.com/MotohisaFukuda/PyRTNI2 +
+
+
+
+
+ + ♻ ☆ Explicit Second-Order Min-Max Optimization Methods with Optimal + Convergence Guarantee + + +
+ We propose and analyze exact and inexact regularized Newton-type methods for +finding a global saddle point of \emph{convex-concave} unconstrained min-max +optimization problems. Compared to first-order methods, our understanding of +second-order methods for min-max optimization is relatively limited, as +obtaining global rates of convergence with second-order information is much +more involved. In this paper, we examine how second-order information can be +used to speed up extra-gradient methods, even under inexactness. Specifically, +we show that the proposed algorithms generate iterates that remain within a +bounded set and the averaged iterates converge to an $\epsilon$-saddle point +within $O(\epsilon^{-2/3})$ iterations in terms of a restricted gap function. +Our algorithms match the theoretically established lower bound in this context +and our analysis provides a simple and intuitive convergence analysis for +second-order methods without any boundedness requirements. Finally, we present +a series of numerical experiments on synthetic and real data that demonstrate +the efficiency of the proposed algorithms. + +
+
+ comment: Improve the paper significantly; 35 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Image Labels Are All You Need for Coarse Seagrass Segmentation + + +
+ Seagrass meadows serve as critical carbon sinks, but estimating the amount of +carbon they store requires knowledge of the seagrass species present. +Underwater and surface vehicles equipped with machine learning algorithms can +help to accurately estimate the composition and extent of seagrass meadows at +scale. However, previous approaches for seagrass detection and classification +have required supervision from patch-level labels. In this paper, we reframe +seagrass classification as a weakly supervised coarse segmentation problem +where image-level labels are used during training (25 times fewer labels +compared to patch-level labeling) and patch-level outputs are obtained at +inference time. To this end, we introduce SeaFeats, an architecture that uses +unsupervised contrastive pre-training and feature similarity, and SeaCLIP, a +model that showcases the effectiveness of large language models as a +supervisory signal in domain-specific applications. We demonstrate that an +ensemble of SeaFeats and SeaCLIP leads to highly robust performance. Our method +outperforms previous approaches that require patch-level labels on the +multi-species 'DeepSeagrass' dataset by 6.8% (absolute) for the class-weighted +F1 score, and by 12.1% (absolute) for the seagrass presence/absence F1 score on +the 'Global Wetlands' dataset. We also present two case studies for real-world +deployment: outlier detection on the Global Wetlands dataset, and application +of our method on imagery collected by the FloatyBoat autonomous surface +vehicle. + +
+
+ comment: 10 pages, 4 figures, additional 3 pages of supplementary material +
+
+
+
+
+ + ♻ ☆ Explaining the Behavior of Black-Box Prediction Algorithms with Causal + Learning + + +
+ Causal approaches to post-hoc explainability for black-box prediction models +(e.g., deep neural networks trained on image pixel data) have become +increasingly popular. However, existing approaches have two important +shortcomings: (i) the "explanatory units" are micro-level inputs into the +relevant prediction model, e.g., image pixels, rather than interpretable +macro-level features that are more useful for understanding how to possibly +change the algorithm's behavior, and (ii) existing approaches assume there +exists no unmeasured confounding between features and target model predictions, +which fails to hold when the explanatory units are macro-level variables. Our +focus is on the important setting where the analyst has no access to the inner +workings of the target prediction algorithm, rather only the ability to query +the output of the model in response to a particular input. To provide causal +explanations in such a setting, we propose to learn causal graphical +representations that allow for arbitrary unmeasured confounding among features. +We demonstrate the resulting graph can differentiate between interpretable +features that causally influence model predictions versus those that are merely +associated with model predictions due to confounding. Our approach is motivated +by a counterfactual theory of causal explanation wherein good explanations +point to factors that are "difference-makers" in an interventionist sense. + +
+
+
+
+
+ + ♻ ☆ A novel physics-informed machine learning strategy to accelerate + unsteady heat and mass transfer simulations + + +
+ Despite the rapid advancements in the performance of central processing units +(CPUs), the simulation of unsteady heat and mass transfer is computationally +very costly, particularly in large domains. While a big wave of machine +learning (ML) has propagated in accelerating computational fluid dynamics (CFD) +studies, recent research has revealed that it is unrealistic to completely +suppress the error increase as the gap between the training and prediction +times increases in single training approach. In this study, we propose a +residual-based physics-informed transfer learning (RePIT) strategy to +accelerate unsteady heat and mass transfer simulations using ML-CFD cross +computation. Our hypothesis is that long-term CFD simulations become feasible +if continuous ML-CFD cross computation is periodically carried out to not only +reduce increased residuals but also update network parameters with the latest +CFD time series data (transfer learning approach). The cross point of ML-CFD is +determined using a method similar to residual monitoring methods of first +principle solvers (physics-informed manner). The feasibility of the proposed +strategy was evaluated based on natural convection simulation and compared to +the single training approach. In the single training approach, a residual scale +change occurred around 100 timesteps leading to predicted time series +exhibiting non-physical pattern as well as a large difference from the ground +truth. Conversely, it was confirmed that the RePIT strategy maintained the +continuity residual within the set range and showed good agreement with the +ground truth for all variables and locations. The simulation was accelerated by +1.9 times, including the parameter-updating time. In conclusion, this universal +strategy has the potential to significantly reduce the computational cost of +CFD simulations while maintaining high accuracy. + +
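+ A schematic of the ML-CFD cross-computation loop described above; `cfd_step`, `ml_predict`, `residual`, and `finetune` are hypothetical placeholders, and the switching rule is a simplification of the RePIT strategy.

```python
def repit_simulation(state, n_steps, warmup, tol, cfd_step, ml_predict, residual, finetune):
    """Alternate between a CFD solver and an ML surrogate, falling back to CFD
    (and fine-tuning the surrogate on the fresh CFD data) whenever the monitored
    residual exceeds `tol`. All callables are hypothetical placeholders."""
    history = [state]
    model = None
    for t in range(n_steps):
        if t < warmup or model is None:
            state = cfd_step(state)                  # first-principles solve
        else:
            candidate = ml_predict(model, state)     # cheap surrogate prediction
            if residual(candidate) <= tol:
                state = candidate
            else:
                state = cfd_step(state)              # fall back to CFD ...
                model = finetune(model, history[-warmup:])  # ... and transfer-learn on recent CFD data
        history.append(state)
        if t == warmup - 1:
            model = finetune(None, history)          # initial training on the CFD warmup window
    return history
```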
+
+ comment: 30 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ High-dimensional and Permutation Invariant Anomaly Detection + + +
+ Methods for anomaly detection of new physics processes are often limited to +low-dimensional spaces due to the difficulty of learning high-dimensional +probability densities. Particularly at the constituent level, incorporating +desirable properties such as permutation invariance and variable-length inputs +becomes difficult within popular density estimation methods. In this work, we +introduce a permutation-invariant density estimator for particle physics data +based on diffusion models, specifically designed to handle variable-length +inputs. We demonstrate the efficacy of our methodology by utilizing the learned +density as a permutation-invariant anomaly detection score, effectively +identifying jets with low likelihood under the background-only hypothesis. To +validate our density estimation method, we investigate the ratio of learned +densities and compare to those obtained by a supervised classification +algorithm. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Sample-Efficient Personalization: Modeling User Parameters as Low Rank + Plus Sparse Components + + +
+ Personalization of machine learning (ML) predictions for individual +users/domains/enterprises is critical for practical recommendation systems. +Standard personalization approaches involve learning a user/domain specific +embedding that is fed into a fixed global model which can be limiting. On the +other hand, personalizing/fine-tuning model itself for each user/domain -- +a.k.a meta-learning -- has high storage/infrastructure cost. Moreover, rigorous +theoretical studies of scalable personalization approaches have been very +limited. To address the above issues, we propose a novel meta-learning style +approach that models network weights as a sum of low-rank and sparse +components. This captures common information from multiple individuals/users +together in the low-rank part while sparse part captures user-specific +idiosyncrasies. We then study the framework in the linear setting, where the +problem reduces to that of estimating the sum of a rank-$r$ and a $k$-column +sparse matrix using a small number of linear measurements. We propose a +computationally efficient alternating minimization method with iterative hard +thresholding -- AMHT-LRS -- to learn the low-rank and sparse part. +Theoretically, for the realizable Gaussian data setting, we show that AMHT-LRS +solves the problem efficiently with nearly optimal sample complexity. Finally, +a significant challenge in personalization is ensuring privacy of each user's +sensitive data. We alleviate this problem by proposing a differentially private +variant of our method that also is equipped with strong generalization +guarantees. + +
+
+ comment: 104 pages, 7 figures, 2 Tables +
+
+
+
+
+ + ♻ ☆ DeepAD: A Robust Deep Learning Model of Alzheimer's Disease Progression + for Real-World Clinical Applications + + +
+ The ability to predict the future trajectory of a patient is a key step +toward the development of therapeutics for complex diseases such as Alzheimer's +disease (AD). However, most machine learning approaches developed for +prediction of disease progression are either single-task or single-modality +models, which can not be directly adopted to our setting involving multi-task +learning with high dimensional images. Moreover, most of those approaches are +trained on a single dataset (i.e. cohort), which can not be generalized to +other cohorts. We propose a novel multimodal multi-task deep learning model to +predict AD progression by analyzing longitudinal clinical and neuroimaging data +from multiple cohorts. Our proposed model integrates high dimensional MRI +features from a 3D convolutional neural network with other data modalities, +including clinical and demographic information, to predict the future +trajectory of patients. Our model employs an adversarial loss to alleviate the +study-specific imaging bias, in particular the inter-study domain shifts. In +addition, a Sharpness-Aware Minimization (SAM) optimization technique is +applied to further improve model generalization. The proposed model is trained +and tested on various datasets in order to evaluate and validate the results. +Our results showed that 1) our model yields significant improvement over the +baseline models, and 2) models using extracted neuroimaging features from 3D +convolutional neural network outperform the same models when applied to +MRI-derived volumetric features. + +
+
+
+
+
+ + ♻ ☆ Continual Pre-Training of Large Language Models: How to (re)warm your + model? + + +
+ Large language models (LLMs) are routinely pre-trained on billions of tokens, +only to restart the process over again once new data becomes available. A much +cheaper and more efficient solution would be to enable the continual +pre-training of these models, i.e. updating pre-trained models with new data +instead of re-training them from scratch. However, the distribution shift +induced by novel data typically results in degraded performance on past data. +Taking a step towards efficient continual pre-training, in this work, we +examine the effect of different warm-up strategies. Our hypothesis is that the +learning rate must be re-increased to improve compute efficiency when training +on a new dataset. We study the warmup phase of models pre-trained on the Pile +(upstream data, 300B tokens) as we continue to pre-train on SlimPajama +(downstream data, 297B tokens), following a linear warmup and cosine decay +schedule. We conduct all experiments on the Pythia 410M language model +architecture and evaluate performance through validation perplexity. We +experiment with different pre-training checkpoints, various maximum learning +rates, and various warmup lengths. Our results show that while rewarming models +first increases the loss on upstream and downstream data, in the longer run it +improves the downstream performance, outperforming models trained from +scratch -- even for a large downstream dataset. +
+
+
+
+
+ + ♻ ☆ Comparing Sequential Forecasters + + +
+ Consider two forecasters, each making a single prediction for a sequence of +events over time. We ask a relatively basic question: how might we compare +these forecasters, either online or post-hoc, while avoiding unverifiable +assumptions on how the forecasts and outcomes were generated? In this paper, we +present a rigorous answer to this question by designing novel sequential +inference procedures for estimating the time-varying difference in forecast +scores. To do this, we employ confidence sequences (CS), which are sequences of +confidence intervals that can be continuously monitored and are valid at +arbitrary data-dependent stopping times ("anytime-valid"). The widths of our +CSs are adaptive to the underlying variance of the score differences. +Underlying their construction is a game-theoretic statistical framework, in +which we further identify e-processes and p-processes for sequentially testing +a weak null hypothesis -- whether one forecaster outperforms another on average +(rather than always). Our methods do not make distributional assumptions on the +forecasts or outcomes; our main theorems apply to any bounded scores, and we +later provide alternative methods for unbounded scores. We empirically validate +our approaches by comparing real-world baseball and weather forecasters. + +
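+ For intuition, a much simpler anytime-valid interval than the paper's variance-adaptive confidence sequences: a Hoeffding bound combined with a union bound over time, assuming independent score differences bounded in [-1, 1].

```python
import numpy as np

def hoeffding_cs(score_diffs, alpha=0.05):
    """Anytime-valid confidence sequence for the running mean of score
    differences assumed independent and bounded in [-1, 1]. A Hoeffding +
    union-bound construction; the paper's CSs adapt to the variance and are tighter."""
    d = np.asarray(score_diffs, dtype=float)
    t = np.arange(1, len(d) + 1)
    mean = np.cumsum(d) / t
    alpha_t = alpha * 6.0 / (np.pi ** 2 * t ** 2)   # spend the error budget over all times
    width = np.sqrt(2.0 * np.log(2.0 / alpha_t) / t)
    return mean - width, mean + width

lo, hi = hoeffding_cs(np.random.default_rng(0).uniform(-0.2, 0.3, size=500))
print(lo[-1], hi[-1])   # brackets the true mean difference at every time simultaneously
```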
+
+ comment: Accepted to Operations Research. Code and data sources available at + https://github.com/yjchoe/ComparingForecasters +
+
+
+
+
+ + ♻ ☆ ClimSim: An open large-scale dataset for training high-resolution + physics emulators in hybrid multi-scale climate simulators + + +
+ Modern climate projections lack adequate spatial and temporal resolution due +to computational constraints. A consequence is inaccurate and imprecise +predictions of critical processes such as storms. Hybrid methods that combine +physics with machine learning (ML) have introduced a new generation of higher +fidelity climate simulators that can sidestep Moore's Law by outsourcing +compute-hungry, short, high-resolution simulations to ML emulators. However, +this hybrid ML-physics simulation approach requires domain-specific treatment +and has been inaccessible to ML experts because of lack of training data and +relevant, easy-to-use workflows. We present ClimSim, the largest-ever dataset +designed for hybrid ML-physics research. It comprises multi-scale climate +simulations, developed by a consortium of climate scientists and ML +researchers. It consists of 5.7 billion pairs of multivariate input and output +vectors that isolate the influence of locally-nested, high-resolution, +high-fidelity physics on a host climate simulator's macro-scale physical state. + The dataset is global in coverage, spans multiple years at high sampling +frequency, and is designed such that resulting emulators are compatible with +downstream coupling into operational climate simulators. We implement a range +of deterministic and stochastic regression baselines to highlight the ML +challenges and their scoring. The data +(https://huggingface.co/datasets/LEAP/ClimSim_high-res, +https://huggingface.co/datasets/LEAP/ClimSim_low-res, and +https://huggingface.co/datasets/LEAP/ClimSim_low-res_aqua-planet) and code +(https://leap-stc.github.io/ClimSim) are released openly to support the +development of hybrid ML-physics and high-fidelity climate simulations for the +benefit of science and society. + +
+
+
+
+
+ + ♻ ☆ Knowledge Graphs in Practice: Characterizing their Users, Challenges, + and Visualization Opportunities + + +
+ This study presents insights from interviews with nineteen Knowledge Graph +(KG) practitioners who work in both enterprise and academic settings on a wide +variety of use cases. Through this study, we identify critical challenges +experienced by KG practitioners when creating, exploring, and analyzing KGs +that could be alleviated through visualization design. Our findings reveal +three major personas among KG practitioners - KG Builders, Analysts, and +Consumers - each of whom have their own distinct expertise and needs. We +discover that KG Builders would benefit from schema enforcers, while KG +Analysts need customizable query builders that provide interim query results. +For KG Consumers, we identify a lack of efficacy for node-link diagrams, and +the need for tailored domain-specific visualizations to promote KG adoption and +comprehension. Lastly, we find that implementing KGs effectively in practice +requires both technical and social solutions that are not addressed with +current tools, technologies, and collaborative workflows. From the analysis of +our interviews, we distill several visualization research directions to improve +KG usability, including knowledge cards that balance digestibility and +discoverability, timeline views to track temporal changes, interfaces that +support organic discovery, and semantic explanations for AI and machine +learning predictions. + +
+
+
+
+
+ + ♻ ☆ How to select an objective function using information theory + + +
+ In machine learning or scientific computing, model performance is measured +with an objective function. But why choose one objective over another? +Information theory gives one answer: To maximize the information in the model, +select the most likely objective function or whichever represents the error in +the fewest bits. To evaluate different objectives, transform them into +likelihood functions. As likelihoods, their relative magnitudes represent how +much we should prefer one objective versus another, and the log of their +magnitude represents the expected uncertainty of the model. + +
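+ A worked instance of the recipe above: squared error corresponds to a Gaussian error model and absolute error to a Laplace one; fitting each scale by maximum likelihood and comparing total log-likelihoods indicates which objective encodes the residuals in fewer bits. The residual distribution below is illustrative only.

```python
import numpy as np

def gaussian_loglik(residuals):
    # MSE objective <-> Gaussian error model, sigma fitted by maximum likelihood
    n = len(residuals)
    sigma2 = np.mean(residuals ** 2)
    return -0.5 * n * (np.log(2 * np.pi * sigma2) + 1.0)

def laplace_loglik(residuals):
    # MAE objective <-> Laplace error model, scale fitted by maximum likelihood
    n = len(residuals)
    b = np.mean(np.abs(residuals))
    return -n * (np.log(2 * b) + 1.0)

rng = np.random.default_rng(0)
residuals = rng.standard_t(df=3, size=1000)      # heavy-tailed errors (illustrative)
print("Gaussian (MSE) log-likelihood:", gaussian_loglik(residuals))
print("Laplace  (MAE) log-likelihood:", laplace_loglik(residuals))
# The objective with the larger log-likelihood represents the error in fewer bits.
```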
+
+ comment: 8 pages, 1 figure, 1 table +
+
+
+
+
+ + ♻ ☆ BoXHED2.0: Scalable boosting of dynamic survival analysis + + +
+ Modern applications of survival analysis increasingly involve time-dependent +covariates. The Python package BoXHED2.0 is a tree-boosted hazard estimator +that is fully nonparametric, and is applicable to survival settings far more +general than right-censoring, including recurring events and competing risks. +BoXHED2.0 is also scalable to the point of being on the same order of speed as +parametric boosted survival models, in part because its core is written in C++ +and it also supports the use of GPUs and multicore CPUs. BoXHED2.0 is available +from PyPI and also from www.github.com/BoXHED. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ♻ ☆ Models of human preference for learning reward functions + + +
+ The utility of reinforcement learning is limited by the alignment of reward +functions with the interests of human stakeholders. One promising method for +alignment is to learn the reward function from human-generated preferences +between pairs of trajectory segments, a type of reinforcement learning from +human feedback (RLHF). These human preferences are typically assumed to be +informed solely by partial return, the sum of rewards along each segment. We +find this assumption to be flawed and propose modeling human preferences +instead as informed by each segment's regret, a measure of a segment's +deviation from optimal decision-making. Given infinitely many preferences +generated according to regret, we prove that we can identify a reward function +equivalent to the reward function that generated those preferences, and we +prove that the previous partial return model lacks this identifiability +property in multiple contexts. We empirically show that our proposed regret +preference model outperforms the partial return preference model with finite +training data in otherwise the same setting. Additionally, we find that our +proposed regret preference model better predicts real human preferences and +also learns reward functions from these preferences that lead to policies that +are better human-aligned. Overall, this work establishes that the choice of +preference model is impactful, and our proposed regret preference model +provides an improvement upon a core assumption of recent research. We have open +sourced our experimental code, the human preferences dataset we gathered, and +our training and preference elicitation interfaces for gathering such a +dataset. +
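+ A toy numpy sketch contrasting the two preference models above under a Boltzmann/Bradley-Terry choice rule; the segment rewards, optimal values, and the simplified regret formula are illustrative assumptions, not the paper's exact definitions.

```python
import numpy as np

def preference_prob(score_a, score_b, beta=1.0):
    """Probability of preferring segment A over B under a Boltzmann choice rule."""
    return 1.0 / (1.0 + np.exp(-beta * (score_a - score_b)))

# Toy segments: per-step rewards and the optimal values of the states they visit.
rewards_a, rewards_b = np.array([1.0, 0.0, 0.0]), np.array([0.0, 0.5, 0.4])
vstar_a,   vstar_b   = np.array([3.0, 2.0, 2.0]), np.array([1.0, 1.0, 0.6])

partial_return = lambda r: r.sum()
# Regret of a segment: how far its return falls short of acting optimally from its
# start state (a simplified stand-in for the paper's definition).
regret = lambda r, v: v[0] - r.sum()

p_return = preference_prob(partial_return(rewards_a), partial_return(rewards_b))
p_regret = preference_prob(-regret(rewards_a, vstar_a), -regret(rewards_b, vstar_b))
print(f"P(A preferred) under partial return: {p_return:.2f}, under regret: {p_regret:.2f}")
# The two models can disagree on which segment a human would prefer.
```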
+
+ comment: 16 pages (40 pages with references and appendix), 23 figures +
+
+
+
+
+ + ♻ ☆ Primal-Dual Contextual Bayesian Optimization for Control System Online + Optimization with Time-Average Constraints + + +
+ This paper studies the problem of online performance optimization of +constrained closed-loop control systems, where both the objective and the +constraints are unknown black-box functions affected by exogenous time-varying +contextual disturbances. A primal-dual contextual Bayesian optimization +algorithm is proposed that achieves sublinear cumulative regret with respect to +the dynamic optimal solution under certain regularity conditions. Furthermore, +the algorithm achieves zero time-average constraint violation, ensuring that +the average value of the constraint function satisfies the desired constraint. +The method is applied to both sampled instances from Gaussian processes and a +continuous stirred tank reactor parameter tuning problem; simulation results +show that the method simultaneously provides close-to-optimal performance and +maintains constraint feasibility on average. This contrasts current +state-of-the-art methods, which either suffer from large cumulative regret or +severe constraint violations for the case studies presented. + +
+
+
+
+
+ + ♻ ☆ Kernelized Concept Erasure EMNLP22 + + +
+ The representation space of neural models for textual data emerges in an +unsupervised manner during training. Understanding how those representations +encode human-interpretable concepts is a fundamental problem. One prominent +approach for the identification of concepts in neural representations is +searching for a linear subspace whose erasure prevents the prediction of the +concept from the representations. However, while many linear erasure algorithms +are tractable and interpretable, neural networks do not necessarily represent +concepts in a linear manner. To identify non-linearly encoded concepts, we +propose a kernelization of a linear minimax game for concept erasure. We +demonstrate that it is possible to prevent specific non-linear adversaries from +predicting the concept. However, the protection does not transfer to different +nonlinear adversaries. Therefore, exhaustively erasing a non-linearly encoded +concept remains an open problem. + +
+
+ comment: Accepted as a long paper in EMNLP22 +
+
+
+
+
+ + ♻ ☆ Towards Personalized Federated Learning via Heterogeneous Model + Reassembly + + +
+ This paper focuses on addressing the practical yet challenging problem of +model heterogeneity in federated learning, where clients possess models with +different network structures. To track this problem, we propose a novel +framework called pFedHR, which leverages heterogeneous model reassembly to +achieve personalized federated learning. In particular, we approach the problem +of heterogeneous model personalization as a model-matching optimization task on +the server side. Moreover, pFedHR automatically and dynamically generates +informative and diverse personalized candidates with minimal human +intervention. Furthermore, our proposed heterogeneous model reassembly +technique mitigates the adverse impact introduced by using public data with +different distributions from the client data to a certain extent. Experimental +results demonstrate that pFedHR outperforms baselines on three datasets under +both IID and Non-IID settings. Additionally, pFedHR effectively reduces the +adverse impact of using different public data and dynamically generates diverse +personalized models in an automated manner. + +
+
+
+
+
+ + ♻ ☆ The Space of Adversarial Strategies USENIX Security + + +
+ Adversarial examples, inputs designed to induce worst-case behavior in +machine learning models, have been extensively studied over the past decade. +Yet, our understanding of this phenomenon stems from a rather fragmented pool +of knowledge; at present, there are a handful of attacks, each with disparate +assumptions in threat models and incomparable definitions of optimality. In +this paper, we propose a systematic approach to characterize worst-case (i.e., +optimal) adversaries. We first introduce an extensible decomposition of attacks +in adversarial machine learning by atomizing attack components into surfaces +and travelers. With our decomposition, we enumerate over components to create +576 attacks (568 of which were previously unexplored). Next, we propose the +Pareto Ensemble Attack (PEA): a theoretical attack that upper-bounds attack +performance. With our new attacks, we measure performance relative to the PEA +on: both robust and non-robust models, seven datasets, and three extended +lp-based threat models incorporating compute costs, formalizing the Space of +Adversarial Strategies. From our evaluation we find attack performance to +be highly contextual: the domain, model robustness, and threat model can have a +profound influence on attack efficacy. Our investigation suggests that future +studies measuring the security of machine learning should: (1) be +contextualized to the domain & threat models, and (2) go beyond the handful of +known attacks used today. +
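+ A minimal sketch of the surface/traveler decomposition named above: a "surface" supplies the signal an attack follows and a "traveler" decides how to move on it, so swapping either component yields a different attack. The concrete choices below (a cross-entropy surface and a PGD-style traveler) are illustrative assumptions, not the paper's full taxonomy.

```python
import torch
import torch.nn.functional as F

def ce_surface(model, x, y):
    """Surface: the signal an attack climbs -- here, the gradient of cross-entropy w.r.t. the input."""
    x = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    grad, = torch.autograd.grad(loss, x)
    return grad

def pgd_traveler(x, x0, grad, step=0.01, eps=0.03):
    """Traveler: how to move on the surface -- a signed-gradient step projected into an l-inf ball."""
    x = x + step * grad.sign()
    return (x0 + (x - x0).clamp(-eps, eps)).clamp(0.0, 1.0)

def attack(model, x, y, surface, traveler, steps=10):
    """Compose any surface with any traveler to obtain one attack in the space."""
    x_adv = x.clone()
    for _ in range(steps):
        x_adv = traveler(x_adv, x, surface(model, x_adv, y))
    return x_adv.detach()

# usage with any classifier `model`, inputs `x` in [0, 1], and labels `y`:
# x_adv = attack(model, x, y, ce_surface, pgd_traveler)
```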
+
+ comment: Accepted to the 32nd USENIX Security Symposium +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ FArMARe: a Furniture-Aware Multi-task methodology for Recommending + Apartments based on the user interests ICCV2023 + + +
+ Nowadays, many people frequently have to search for new accommodation +options. Searching for a suitable apartment is a time-consuming process, +especially because visiting them is often mandatory to assess the truthfulness +of the advertisements found on the Web. While this process could be alleviated +by visiting the apartments in the metaverse, the Web-based recommendation +platforms are not suitable for the task. To address this shortcoming, in this +paper, we define a new problem called text-to-apartment recommendation, which +requires ranking the apartments based on their relevance to a textual query +expressing the user's interests. To tackle this problem, we introduce FArMARe, +a multi-task approach that supports cross-modal contrastive training with a +furniture-aware objective. Since public datasets related to indoor scenes do +not contain detailed descriptions of the furniture, we collect and annotate a +dataset comprising more than 6000 apartments. A thorough experimentation with +three different methods and two raw feature extraction procedures reveals the +effectiveness of FArMARe in dealing with the problem at hand. + +
+
+ comment: accepted for presentation at the ICCV2023 CV4Metaverse workshop +
+
+
+
+
+ + ☆ Disarming Steganography Attacks Inside Neural Network Models + + +
+ Similar to the revolution of open source code sharing, Artificial +Intelligence (AI) model sharing is gaining increased popularity. However, the +fast adaptation in the industry, lack of awareness, and ability to exploit the +models make them significant attack vectors. By embedding malware in neurons, +the malware can be delivered covertly, with minor or no impact on the neural +network's performance. The covert attack uses the Least Significant Bits +(LSB) weight attack, since the LSB has a minimal effect on the model accuracy and, +as a result, the user will not notice it. Since there are endless ways to hide +the attacks, we focus on a zero-trust prevention strategy based on AI model +attack disarm and reconstruction. We propose three types of model +steganography weight disarm defense mechanisms. The first two are based on +random bit substitution noise, and the other on model weight quantization. We +demonstrate a 100\% prevention rate, while the Qint8- and K-LRBP-based methods introduce only a minimal +decrease in model accuracy, which is an +essential factor for improving AI security. +
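+ A small numpy sketch of the random-bit-substitution idea above: overwriting the lowest mantissa bits of float32 weights destroys any LSB-embedded payload while barely perturbing the values; the choice of 8 low bits is an assumption for illustration.

```python
import numpy as np

def disarm_lsb(weights, n_bits=8, rng=None):
    """Overwrite the `n_bits` least-significant mantissa bits of float32 weights
    with random bits, destroying LSB-embedded payloads with negligible impact
    on the values themselves."""
    rng = np.random.default_rng() if rng is None else rng
    w = np.ascontiguousarray(weights, dtype=np.float32)
    bits = w.view(np.uint32).copy()
    mask = np.uint32((1 << n_bits) - 1)
    noise = rng.integers(0, 1 << n_bits, size=bits.shape, dtype=np.uint32)
    bits = (bits & ~mask) | noise
    return bits.view(np.float32)

w = np.random.default_rng(0).standard_normal(5).astype(np.float32)
w_clean = disarm_lsb(w)
print(np.max(np.abs(w - w_clean)))   # perturbation is tiny relative to the weights
```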
+
+
+
+
+ + ☆ Parameter Efficient Audio Captioning With Faithful Guidance Using + Audio-text Shared Latent Representation + + +
+ There has been significant research on developing pretrained transformer +architectures for multimodal-to-text generation tasks. Albeit performance +improvements, such models are frequently overparameterized, hence suffer from +hallucination and large memory footprint making them challenging to deploy on +edge devices. In this paper, we address both these issues for the application +of automated audio captioning. First, we propose a data augmentation technique +for generating hallucinated audio captions and show that similarity based on an +audio-text shared latent space is suitable for detecting hallucination. Then, +we propose a parameter efficient inference time faithful decoding algorithm +that enables smaller audio captioning models with performance equivalent to +larger models trained with more data. During the beam decoding step, the +smaller model utilizes an audio-text shared latent representation to +semantically align the generated text with corresponding input audio. Faithful +guidance is introduced into the beam probability by incorporating the cosine +similarity between latent representation projections of greedy rolled out +intermediate beams and audio clip. We show the efficacy of our algorithm on +benchmark datasets and evaluate the proposed scheme against baselines using +conventional audio captioning and semantic similarity metrics while +illustrating tradeoffs between performance and complexity. + +
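+ A numpy sketch of the faithful-rescoring idea above: blend each beam's log-probability with the cosine similarity between the audio embedding and the rolled-out caption's embedding in the shared latent space. The encoders and the blend weight `alpha` are placeholders, not the paper's components.

```python
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))

def rescore_beams(beam_logprobs, beam_texts, audio_emb, text_encoder, alpha=0.5):
    """Add a faithfulness bonus to each beam: cosine similarity between the audio
    embedding and the rolled-out beam text embedding in the shared space.
    `text_encoder` is a placeholder for the shared-latent-space text encoder."""
    scores = []
    for logp, text in zip(beam_logprobs, beam_texts):
        faithfulness = cosine(audio_emb, text_encoder(text))
        scores.append(logp + alpha * faithfulness)
    order = np.argsort(scores)[::-1]
    return [beam_texts[i] for i in order], [scores[i] for i in order]

# usage with hypothetical encoders:
# best, _ = rescore_beams(logprobs, texts, audio_encoder(clip), text_encoder, alpha=0.5)
```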
+
+ comment: 5 pages, 5 tables, 1 figure +
+
+
+
+
+ + ☆ Detecting False Alarms and Misses in Audio Captions + + +
+ Metrics to evaluate audio captions simply provide a score without much +explanation regarding what may be wrong in case the score is low. Manual human +intervention is needed to find any shortcomings of the caption. In this work, +we introduce a metric which automatically identifies the shortcomings of an +audio caption by detecting the misses and false alarms in a candidate caption +with respect to a reference caption, and reports the recall, precision and +F-score. Such a metric is very useful in profiling the deficiencies of an audio +captioning model, which is a milestone towards improving the quality of audio +captions. + +
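+ At its simplest, such a metric reduces to set precision/recall over sound-event concepts extracted from the candidate and reference captions, as in this sketch; how concepts are extracted from the captions is left abstract here.

```python
def caption_error_report(candidate_concepts, reference_concepts):
    """Report false alarms (in candidate, not in reference), misses (in reference,
    not in candidate) and the resulting precision, recall, and F-score."""
    cand, ref = set(candidate_concepts), set(reference_concepts)
    hits = cand & ref
    false_alarms = cand - ref
    misses = ref - cand
    precision = len(hits) / len(cand) if cand else 0.0
    recall = len(hits) / len(ref) if ref else 0.0
    f_score = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {"false_alarms": false_alarms, "misses": misses,
            "precision": precision, "recall": recall, "f_score": f_score}

print(caption_error_report({"dog barking", "car passing"},
                           {"dog barking", "birds chirping"}))
```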
+
+
+
+
+ + ♻ ☆ Generative Steganography Diffusion + + +
+ Generative steganography (GS) is an emerging technique that generates stego +images directly from secret data. Various GS methods based on GANs or Flow have +been developed recently. However, existing GAN-based GS methods cannot +completely recover the hidden secret data due to the lack of network +invertibility, while Flow-based methods produce poor image quality due to the +stringent reversibility restriction in each module. To address this issue, we +propose a novel GS scheme called "Generative Steganography Diffusion" (GSD) by +devising an invertible diffusion model named "StegoDiffusion". It not only +generates realistic stego images but also allows for 100\% recovery of the +hidden secret data. The proposed StegoDiffusion model leverages a non-Markov +chain with a fast sampling technique to achieve efficient stego image +generation. By constructing an ordinary differential equation (ODE) based on +the transition probability of the generation process in StegoDiffusion, secret +data and stego images can be converted to each other through the approximate +solver of ODE -- Euler iteration formula, enabling the use of irreversible but +more expressive network structures to achieve model invertibility. Our proposed +GSD has the advantages of both reversibility and high performance, +significantly outperforming existing GS methods in all metrics. + +
+
+ comment: Shall not be reproduced without permission, rights reserved! +
+
+
+
+
+ + ♻ ☆ Extraction of Visual Information to Predict Crowdfunding Success + + +
+ Researchers have increasingly turned to crowdfunding platforms to gain +insights into entrepreneurial activity and dynamics. While previous studies +have explored various factors influencing crowdfunding success, such as +technology, communication, and marketing strategies, the role of visual +elements that can be automatically extracted from images has received less +attention. This is surprising, considering that crowdfunding platforms +emphasize the importance of attention-grabbing and high-resolution images, and +previous research has shown that image characteristics can significantly impact +product evaluations. Indeed, we conducted a comprehensive review of empirical articles (n = +202) that utilized Kickstarter data, focusing on the incorporation of visual +information in their analyses. Our findings reveal that only 29.70% controlled +for the number of images, and less than 12% considered any image details. In +this manuscript, we review the literature on image processing and its relevance +to the business domain, highlighting two types of visual variables: visual +counts (number of pictures and number of videos) and image details. Building +upon previous work that discussed the role of color, composition and +figure-ground relationships, we introduce visual scene elements that have not +yet been explored in crowdfunding, including the number of faces, the number of +concepts depicted, and the ease of identifying those concepts. To demonstrate +the predictive value of visual counts and image details, we analyze Kickstarter +data. Our results highlight that visual count features are two of the top three +predictors of success. Our results also show that simple image detail features +such as color matter a lot, and our proposed measures of visual scene elements +can also be useful. We supplement our article with R and Python codes that help +authors extract image details (https://osf.io/ujnzp/). +
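+ One of the proposed visual scene elements, the number of faces, can be roughly approximated with OpenCV's bundled Haar cascade, as in this illustrative sketch (not the extraction code the authors share at the OSF link); the image path is hypothetical.

```python
import cv2

def count_faces(image_path):
    """Rough count of faces in a campaign image using OpenCV's bundled Haar cascade."""
    cascade = cv2.CascadeClassifier(
        cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)
    return len(faces)

# print(count_faces("campaign_main_image.jpg"))   # hypothetical file path
```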
+
+ comment: 32 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Ducho: A Unified Framework for the Extraction of Multimodal Features in + Recommendation + + +
+ In multimodal-aware recommendation, the extraction of meaningful multimodal +features is at the basis of high-quality recommendations. Generally, each +recommendation framework implements its multimodal extraction procedures with +specific strategies and tools. This is limiting for two reasons: (i) different +extraction strategies do not ease the interdependence among multimodal +recommendation frameworks; thus, they cannot be efficiently and fairly +compared; (ii) given the large plethora of pre-trained deep learning models +made available by different open source tools, model designers do not have +access to shared interfaces to extract features. Motivated by the outlined +aspects, we propose Ducho, a unified framework for the extraction of +multimodal features in recommendation. By integrating three widely-adopted deep +learning libraries as backends, namely, TensorFlow, PyTorch, and Transformers, +we provide a shared interface to extract and process features where each +backend's specific methods are abstracted to the end user. Noteworthy, the +extraction pipeline is easily configurable with a YAML-based file where the +user can specify, for each modality, the list of models (and their specific +backends/parameters) to perform the extraction. Finally, to make Ducho +accessible to the community, we build a public Docker image equipped with a +ready-to-use CUDA environment and propose three demos to test its +functionalities for different scenarios and tasks. The GitHub repository and +the documentation are accessible at this link: +https://github.com/sisinflab/Ducho. +
+
+
+
+
+ + ♻ ☆ Context-Aware Talking-Head Video Editing + + +
+ Talking-head video editing aims to efficiently insert, delete, and substitute +the word of a pre-recorded video through a text transcript editor. The key +challenge for this task is obtaining an editing model that generates new +talking-head video clips which simultaneously have accurate lip synchronization +and motion smoothness. Previous approaches, including 3DMM-based (3D Morphable +Model) methods and NeRF-based (Neural Radiance Field) methods, are sub-optimal +in that they either require minutes of source videos and days of training time +or lack the disentangled control of verbal (e.g., lip motion) and non-verbal +(e.g., head pose and expression) representations for video clip insertion. In +this work, we fully utilize the video context to design a novel framework for +talking-head video editing, which achieves efficiency, disentangled motion +control, and sequential smoothness. Specifically, we decompose this framework +to motion prediction and motion-conditioned rendering: (1) We first design an +animation prediction module that efficiently obtains smooth and lip-sync motion +sequences conditioned on the driven speech. This module adopts a +non-autoregressive network to obtain context prior and improve the prediction +efficiency, and it learns a speech-animation mapping prior with better +generalization to novel speech from a multi-identity video dataset. (2) We then +introduce a neural rendering module to synthesize the photo-realistic and +full-head video frames given the predicted motion sequence. This module adopts +a pre-trained head topology and uses only few frames for efficient fine-tuning +to obtain a person-specific rendering model. Extensive experiments demonstrate +that our method efficiently achieves smoother editing results with higher image +quality and lip accuracy using less data than previous methods. + +
+
+ comment: needs some improvements +
+
+
+
+
+ + ♻ ☆ Generative Action Description Prompts for Skeleton-based Action + Recognition ICCV23 + + +
+ Skeleton-based action recognition has recently received considerable +attention. Current approaches to skeleton-based action recognition are +typically formulated as one-hot classification tasks and do not fully exploit +the semantic relations between actions. For example, "make victory sign" and +"thumb up" are two actions of hand gestures, whose major difference lies in the +movement of hands. This information is agnostic from the categorical one-hot +encoding of action classes but could be unveiled from the action description. +Therefore, utilizing action description in training could potentially benefit +representation learning. In this work, we propose a Generative +Action-description Prompts (GAP) approach for skeleton-based action +recognition. More specifically, we employ a pre-trained large-scale language +model as the knowledge engine to automatically generate text descriptions for +body parts movements of actions, and propose a multi-modal training scheme by +utilizing the text encoder to generate feature vectors for different body parts +and supervise the skeleton encoder for action representation learning. +Experiments show that our proposed GAP method achieves noticeable improvements +over various baseline models without extra computation cost at inference. GAP +achieves new state-of-the-arts on popular skeleton-based action recognition +benchmarks, including NTU RGB+D, NTU RGB+D 120 and NW-UCLA. The source code is +available at https://github.com/MartinXM/GAP. + +
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 40 + +
+
+
+ + ☆ Cognitive Architectures for Language Agents + + +
+ Recent efforts have incorporated large language models (LLMs) with external +resources (e.g., the Internet) or internal control flows (e.g., prompt +chaining) for tasks requiring grounding or reasoning. However, these efforts +have largely been piecemeal, lacking a systematic framework for constructing a +fully-fledged language agent. To address this challenge, we draw on the rich +history of agent design in symbolic artificial intelligence to develop a +blueprint for a new wave of cognitive language agents. We first show that LLMs +have many of the same properties as production systems, and recent efforts to +improve their grounding or reasoning mirror the development of cognitive +architectures built around production systems. We then propose Cognitive +Architectures for Language Agents (CoALA), a conceptual framework to +systematize diverse methods for LLM-based reasoning, grounding, learning, and +decision making as instantiations of language agents in the framework. Finally, +we use the CoALA framework to highlight gaps and propose actionable directions +toward more capable language agents in the future. + +
+
+ comment: 16 pages of main content, 10 pages of references, 5 figures. Equal + contribution among the first two authors, order decided by coin flip. A + CoALA-based repo of recent work on language agents: + https://github.com/ysymyth/awesome-language-agents +
+
+
+
+
+ + ☆ Substitution-based Semantic Change Detection using Contextual Embeddings + + +
+ Measuring semantic change has thus far remained a task where methods using +contextual embeddings have struggled to improve upon simpler techniques relying +only on static word vectors. Moreover, many of the previously proposed +approaches suffer from downsides related to scalability and ease of +interpretation. We present a simplified approach to measuring semantic change +using contextual embeddings, relying only on the most probable substitutes for +masked terms. Not only is this approach directly interpretable, it is also far +more efficient in terms of storage, achieves superior average performance +across the most frequently cited datasets for this task, and allows for more +nuanced investigation of change than is possible with static word vectors. + +
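+ A minimal sketch of the substitute-based idea using the Hugging Face fill-mask pipeline: gather the top-k substitutes for a target word in two time periods and compare the resulting substitute distributions; the Jensen-Shannon distance is a simple stand-in for whatever comparison the paper uses, and the example corpora are toy data.

```python
from collections import Counter

import numpy as np
from scipy.spatial.distance import jensenshannon
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-uncased")

def substitute_counts(sentences, target, top_k=10):
    """Replace `target` with [MASK] in each sentence and tally the most probable substitutes."""
    counts = Counter()
    for s in sentences:
        for pred in fill_mask(s.replace(target, "[MASK]", 1), top_k=top_k):
            counts[pred["token_str"]] += 1
    return counts

def semantic_change(sentences_t1, sentences_t2, target):
    c1 = substitute_counts(sentences_t1, target)
    c2 = substitute_counts(sentences_t2, target)
    vocab = sorted(set(c1) | set(c2))
    p = np.array([c1[w] for w in vocab], dtype=float)
    q = np.array([c2[w] for w in vocab], dtype=float)
    return jensenshannon(p / p.sum(), q / q.sum())   # larger distance ~ more change

# toy usage (each sentence must contain the target word):
# print(semantic_change(["The cell was damp and cold."], ["My cell battery died."], "cell"))
```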
+
+
+
+
+ + ☆ nanoT5: A PyTorch Framework for Pre-training and Fine-tuning T5-style + Models with Limited Resources + + +
+ State-of-the-art language models like T5 have revolutionized the NLP +landscape, but their computational demands hinder a large portion of the +research community. To address this challenge, we present nanoT5, a +specially-optimized PyTorch framework for efficient pre-training and +fine-tuning of T5 models. Drawing on insights from optimizer differences and +prioritizing efficiency, nanoT5 allows a T5-Base model to be pre-trained on a +single GPU in just 16 hours, without any loss in performance. With the +introduction of this open-source framework, we hope to widen the accessibility +to language modelling research and cater to the community's demand for more +user-friendly T5 (Encoder-Decoder) implementations. Our contributions, +including configurations, codebase, software/hardware insights, and pre-trained +models, are available to the public, aiming to strike a balance between +research accessibility and resource constraints in NLP. + +
+
+
+
+
+ + ☆ Weigh Your Own Words: Improving Hate Speech Counter Narrative Generation + via Attention Regularization + + +
+ Recent computational approaches for combating online hate speech involve the +automatic generation of counter narratives by adapting Pretrained +Transformer-based Language Models (PLMs) with human-curated data. This process, +however, can produce in-domain overfitting, resulting in models generating +acceptable narratives only for hatred similar to training data, with little +portability to other targets or to real-world toxic language. This paper +introduces novel attention regularization methodologies to improve the +generalization capabilities of PLMs for counter narratives generation. +Overfitting to training-specific terms is then discouraged, resulting in more +diverse and richer narratives. We experiment with two attention-based +regularization techniques on a benchmark English dataset. Regularized models +produce better counter narratives than state-of-the-art approaches in most +cases, both in terms of automatic metrics and human evaluation, especially when +hateful targets are not present in the training data. This work paves the way +for better and more flexible counter-speech generation models, a task for which +datasets are highly challenging to produce. + +
+
+ comment: To appear at CS4OA workshop (INLG-SIGDial) +
+
+
+
+
+ + ☆ PromptTTS 2: Describing and Generating Voices with Text Prompt + + +
+ Speech conveys more information than just text, as the same word can be +uttered in various voices to convey diverse information. Compared to +traditional text-to-speech (TTS) methods relying on speech prompts (reference +speech) for voice variability, using text prompts (descriptions) is more +user-friendly since speech prompts can be hard to find or may not exist at all. +TTS approaches based on the text prompt face two challenges: 1) the one-to-many +problem, where not all details about voice variability can be described in the +text prompt, and 2) the limited availability of text prompt datasets, where +vendors and large cost of data labeling are required to write text prompt for +speech. In this work, we introduce PromptTTS 2 to address these challenges with +a variation network to provide variability information of voice not captured by +text prompts, and a prompt generation pipeline to utilize the large language +models (LLM) to compose high quality text prompts. Specifically, the variation +network predicts the representation extracted from the reference speech (which +contains full information about voice) based on the text prompt representation. +For the prompt generation pipeline, it generates text prompts for speech with a +speech understanding model to recognize voice attributes (e.g., gender, speed) +from speech and a large language model to formulate text prompt based on the +recognition results. Experiments on a large-scale (44K hours) speech dataset +demonstrate that compared to the previous works, PromptTTS 2 generates voices +more consistent with text prompts and supports the sampling of diverse voice +variability, thereby offering users more choices on voice generation. +Additionally, the prompt generation pipeline produces high-quality prompts, +eliminating the large labeling cost. The demo page of PromptTTS 2 is available +online\footnote{https://speechresearch.github.io/prompttts2}. + +
+
+ comment: Demo page: https://speechresearch.github.io/prompttts2 +
+
+
+
+
+ + ☆ Dialog Action-Aware Transformer for Dialog Policy Learning + + +
+ Recent works usually address Dialog policy learning (DPL) by training a +reinforcement learning (RL) agent to determine the best dialog action. However, +existing works on deep RL require a large volume of agent-user interactions to +achieve acceptable performance. In this paper, we propose to make full use of +the plain text knowledge from the pre-trained language model to accelerate the +RL agent's learning speed. Specifically, we design a dialog action-aware +transformer encoder (DaTrans), which integrates a new fine-tuning procedure +named the masked last action task to encourage DaTrans to be dialog-aware and +to distil action-specific features. Then, DaTrans is further optimized in an RL +setting with ongoing interactions and evolves through exploration in the dialog +action space toward maximizing long-term accumulated rewards. The effectiveness +and efficiency of the proposed model are demonstrated with both simulator +evaluation and human evaluation. +
+
+ comment: To be appeared in SIGdial 2023 +
+
+
+
+
+ + ☆ Augmenting Black-box LLMs with Medical Textbooks for Clinical Question + Answering + + +
+ Large-scale language models (LLMs), such as ChatGPT, are capable of +generating human-like responses for various downstream tasks, such as +task-oriented dialogues and question answering. However, applying LLMs to +medical domains remains challenging due to their inability to leverage +domain-specific knowledge. In this study, we present the Large-scale Language +Models Augmented with Medical Textbooks (LLM-AMT), which integrates +authoritative medical textbooks as the cornerstone of its design, enhancing its +proficiency in the specialized domain through plug-and-play modules, comprised +of a Hybrid Textbook Retriever, supplemented by the Query Augmenter and the LLM +Reader. Experimental evaluation on three open-domain medical question-answering +tasks reveals a substantial enhancement in both the professionalism and +accuracy of the LLM responses when utilizing LLM-AMT, exhibiting an improvement +ranging from 11.4% to 13.2%. Despite being 100 times smaller, we found that +medical textbooks as the retrieval corpus serves as a more valuable external +knowledge source than Wikipedia in the medical domain. Our experiments show +that textbook augmentation results in a performance improvement ranging from +9.7% to 12.2% over Wikipedia augmentation. + +
+
+
+
+
+ + ☆ Leveraging BERT Language Models for Multi-Lingual ESG Issue + Identification + + +
+ Environmental, Social, and Governance (ESG) has been used as a metric to +measure the negative impacts and enhance positive outcomes of companies in +areas such as the environment, society, and governance. Recently, investors +have increasingly recognized the significance of ESG criteria in their +investment choices, leading businesses to integrate ESG principles into their +operations and strategies. The Multi-Lingual ESG Issue Identification (ML-ESG) +shared task encompasses the classification of news documents into 35 distinct +ESG issue labels. In this study, we explored multiple strategies harnessing +BERT language models to achieve accurate classification of news documents +across these labels. Our analysis revealed that the RoBERTa classifier emerged +as one of the most successful approaches, securing the second-place position +for the English test dataset, and sharing the fifth-place position for the +French test dataset. Furthermore, our SVM-based binary model tailored for the +Chinese language exhibited exceptional performance, earning the second-place +rank on the test dataset. + +
+
+
+
+
+ + ☆ Incorporating Dictionaries into a Neural Network Architecture to Extract + COVID-19 Medical Concepts From Social Media + + +
+ We investigate the potential benefit of incorporating dictionary information +into a neural network architecture for natural language processing. In +particular, we make use of this architecture to extract several concepts +related to COVID-19 from an on-line medical forum. We use a sample from the +forum to manually curate one dictionary for each concept. In addition, we use +MetaMap, which is a tool for extracting biomedical concepts, to identify a +small number of semantic concepts. For a supervised concept extraction task on +the forum data, our best model achieved a macro $F_1$ score of 90\%. A major +difficulty in medical concept extraction is obtaining labelled data from which +to build supervised models. We investigate the utility of our models to +transfer to data derived from a different source in two ways. First for +producing labels via weak learning and second to perform concept extraction. +The dataset we use in this case comprises COVID-19 related tweets and we +achieve an $F_1$ score 81\% for symptom concept extraction trained on weakly +labelled data. The utility of our dictionaries is compared with a COVID-19 +symptom dictionary that was constructed directly from Twitter. Further +experiments that incorporate BERT and a COVID-19 version of BERTweet +demonstrate that the dictionaries provide a commensurate result. Our results +show that incorporating small domain dictionaries to deep learning models can +improve concept extraction tasks. Moreover, models built using dictionaries +generalize well and are transferable to different datasets on a similar task. + +
+
+
+
+
+ + ☆ Advancing Text-to-GLOSS Neural Translation Using a Novel Hyper-parameter + Optimization Technique + + +
+ In this paper, we investigate the use of transformers for Neural Machine +Translation of text-to-GLOSS for Deaf and Hard-of-Hearing communication. Due to +the scarcity of available data and limited resources for text-to-GLOSS +translation, we treat the problem as a low-resource language task. We use our +novel hyper-parameter exploration technique to explore a variety of +architectural parameters and build an optimal transformer-based architecture +specifically tailored for text-to-GLOSS translation. The study aims to improve +the accuracy and fluency of Neural Machine Translation generated GLOSS. This is +achieved by examining various architectural parameters including layer count, +attention heads, embedding dimension, dropout, and label smoothing to identify +the optimal architecture for improving text-to-GLOSS translation performance. +The experiments conducted on the PHOENIX14T dataset reveal that the optimal +transformer architecture outperforms previous work on the same dataset. The +best model reaches a ROUGE (Recall-Oriented Understudy for Gisting Evaluation) +score of 55.18% and a BLEU-1 (BiLingual Evaluation Understudy 1) score of +63.6%, outperforming state-of-the-art results on the BLEU1 and ROUGE score by +8.42 and 0.63 respectively. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Bring the Noise: Introducing Noise Robustness to Pretrained Automatic + Speech Recognition ICANN 2023 + + +
+ In recent research, in the domain of speech processing, large End-to-End +(E2E) systems for Automatic Speech Recognition (ASR) have reported +state-of-the-art performance on various benchmarks. These systems intrinsically +learn how to handle and remove noise conditions from speech. Previous research +has shown, that it is possible to extract the denoising capabilities of these +models into a preprocessor network, which can be used as a frontend for +downstream ASR models. However, the proposed methods were limited to specific +fully convolutional architectures. In this work, we propose a novel method to +extract the denoising capabilities, that can be applied to any encoder-decoder +architecture. We propose the Cleancoder preprocessor architecture that extracts +hidden activations from the Conformer ASR model and feeds them to a decoder to +predict denoised spectrograms. We train our pre-processor on the Noisy Speech +Database (NSD) to reconstruct denoised spectrograms from noisy inputs. Then, we +evaluate our model as a frontend to a pretrained Conformer ASR model as well as +a frontend to train smaller Conformer ASR models from scratch. We show that the +Cleancoder is able to filter noise from speech and that it improves the total +Word Error Rate (WER) of the downstream model in noisy conditions for both +applications. + +
+
+ comment: Submitted and accepted for ICANN 2023 (32nd International Conference + on Artificial Neural Networks) +
+
+
+
+
+ + ☆ Making Large Language Models Better Reasoners with Alignment + + +
+ Reasoning is a cognitive process of using evidence to reach a sound +conclusion. The reasoning capability is essential for large language models +(LLMs) to serve as the brain of the artificial general intelligence agent. +Recent studies reveal that fine-tuning LLMs on data with the chain of thought +(COT) reasoning process can significantly enhance their reasoning capabilities. +However, we find that the fine-tuned LLMs suffer from an \textit{Assessment +Misalignment} problem, i.e., they frequently assign higher scores to subpar +COTs, leading to potential limitations in their reasoning abilities. To address +this problem, we introduce an \textit{Alignment Fine-Tuning (AFT)} paradigm, +which involves three steps: 1) fine-tuning LLMs with COT training data; 2) +generating multiple COT responses for each question, and categorizing them into +positive and negative ones based on whether they achieve the correct answer; 3) +calibrating the scores of positive and negative responses given by LLMs with a +novel constraint alignment loss. Specifically, the constraint alignment loss +has two objectives: a) Alignment, which guarantees that positive scores surpass +negative scores to encourage answers with high-quality COTs; b) Constraint, +which keeps the negative scores confined to a reasonable range to prevent the +model degradation. Beyond just the binary positive and negative feedback, the +constraint alignment loss can be seamlessly adapted to the ranking situations +when ranking feedback is accessible. Furthermore, we also delve deeply into +recent ranking-based alignment methods, such as DPO, RRHF, and PRO, and +discover that the constraint, which has been overlooked by these approaches, is +also crucial for their performance. Extensive experiments on four reasoning +benchmarks with both binary and ranking feedback demonstrate the effectiveness +of AFT. + +
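+ An illustrative PyTorch rendering of the two objectives described above -- an alignment term pushing positive-COT scores above negative ones and a constraint term keeping negative scores from collapsing. The margin/floor hyper-parameters and the exact functional form are assumptions for illustration, not the paper's loss.

```python
import torch

def constraint_alignment_loss(pos_scores, neg_scores, margin=1.0, floor=-5.0):
    """pos_scores / neg_scores: model-assigned (log-)scores of chains of thought
    that reach the correct / wrong answer for the same question."""
    # Alignment: every positive score should exceed every negative score by `margin`.
    gap = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)   # [P, N] pairwise gaps
    alignment = torch.relu(margin - gap).mean()
    # Constraint: keep negative scores above a floor so the model does not degrade.
    constraint = torch.relu(floor - neg_scores).mean()
    return alignment + constraint

pos = torch.tensor([-0.5, -0.8], requires_grad=True)
neg = torch.tensor([-0.6, -4.0], requires_grad=True)
loss = constraint_alignment_loss(pos, neg)
loss.backward()
print(float(loss))
```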
+
+ comment: Large Language Models; Reasoning; Alignment +
+
+
+
+
+ + ☆ Evaluating Methods for Ground-Truth-Free Foreign Accent Conversion SC + + +
+ Foreign accent conversion (FAC) is a special application of voice conversion (VC) that aims
+to convert the accented speech of a non-native speaker into native-sounding speech with the
+same speaker identity. FAC is difficult because the native-accented speech of the desired
+non-native speaker, which would serve as the training target, is impossible to collect. In
+this work, we evaluate three recently proposed methods for ground-truth-free FAC, all of
+which aim to harness the power of sequence-to-sequence (seq2seq) and non-parallel VC models
+to properly convert the accent and control the speaker identity. Our experimental results
+show that no single method was significantly better than the others on all evaluation axes,
+in contrast to conclusions drawn in previous studies. We also explain the effectiveness of
+these methods in terms of the training input and output of the seq2seq model, examine the
+design choices of the non-parallel VC model, and show that intelligibility measures such as
+word error rate do not correlate well with subjective accentedness. Finally, our
+implementation is open-sourced to promote reproducible research and help future researchers
+improve upon the compared systems.
+
+
+ comment: Accepted to the 2023 Asia Pacific Signal and Information Processing + Association Annual Summit and Conference (APSIPA ASC). Demo page: + https://unilight.github.io/Publication-Demos/publications/fac-evaluate. Code: + https://github.com/unilight/seq2seq-vc +
+
+
+
+
+ + ☆ Wordle: A Microcosm of Life. Luck, Skill, Cheating, Loyalty, and + Influence! + + +
+ Wordle is a popular, online word game offered by the New York Times (nytimes.com).
+Currently, there are some 2 million players of the English version worldwide. Players have 6
+attempts to guess the daily word (target word), and after each attempt, the player receives
+color-coded information about the correctness and position of each letter in the guess. After
+either a successful completion of the puzzle or the final unsuccessful attempt, software can
+assess the player's luck and skill using Information Theory and can display data for the
+first, second, ..., sixth guesses of a random sample of all players. Recently, I discovered
+that the latter data is presented in a format that can easily be copied and pasted into a
+spreadsheet. I compiled data on Wordle players' first guesses from May 2023 to August 2023
+and inferred some interesting information about Wordle players. A) Every day, about 0.2-0.5%
+of players solve the puzzle in one attempt. Because the odds of guessing the correct one of
+2,315 possible target words at random are 0.043%, this implies that 4,000-10,000 players
+cheat by obtaining the target word outside of playing the game! B) At least 1/3 of the
+players have a favorite starting word, or cycle through several. And even though players
+should be aware that target words are never repeated, most players appear to remain loyal to
+their starting word even after its appearance as a target word. C) On August 15, 2023, about
+30,000 players abruptly changed their starting word, presumably based on a crossword puzzle
+clue! Wordle players can be influenced! This study goes beyond social media postings,
+surveys, and Google Trends to provide solid, quantitative evidence about cheating in Wordle.
+
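+
+ The cheating estimate follows from simple arithmetic; a short reconstruction using only the
+figures quoted above (about 2 million daily players and 2,315 candidate target words):
+
+   players = 2_000_000                    # approximate daily players of the English version
+   p_lucky = 1 / 2315                     # chance of a random correct first guess (~0.043%)
+   share_low, share_high = 0.002, 0.005   # observed 0.2%-0.5% one-guess solvers
+
+   print(players * share_low, players * share_high)  # 4000.0 10000.0 one-guess solvers
+   print(players * p_lucky)                          # ~864 expected to solve by luck alone
+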
+
+
+
+
+ + ☆ Leveraging Label Information for Multimodal Emotion Recognition + + +
+ Multimodal emotion recognition (MER) aims to detect the emotional status of a +given expression by combining the speech and text information. Intuitively, +label information should be capable of helping the model locate the salient +tokens/frames relevant to the specific emotion, which finally facilitates the +MER task. Inspired by this, we propose a novel approach for MER by leveraging +label information. Specifically, we first obtain the representative label +embeddings for both text and speech modalities, then learn the label-enhanced +text/speech representations for each utterance via label-token and label-frame +interactions. Finally, we devise a novel label-guided attentive fusion module +to fuse the label-aware text and speech representations for emotion +classification. Extensive experiments were conducted on the public IEMOCAP +dataset, and experimental results demonstrate that our proposed approach +outperforms existing baselines and achieves new state-of-the-art performance. + +
+
+ comment: Accepted by Interspeech 2023 +
+
+
+
+
+ + ☆ Improving Query-Focused Meeting Summarization with Query-Relevant + Knowledge AACL 2023 + + +
+ Query-Focused Meeting Summarization (QFMS) aims to generate a summary of a +given meeting transcript conditioned upon a query. The main challenges for QFMS +are the long input text length and sparse query-relevant information in the +meeting transcript. In this paper, we propose a knowledge-enhanced two-stage +framework called Knowledge-Aware Summarizer (KAS) to tackle the challenges. In +the first stage, we introduce knowledge-aware scores to improve the +query-relevant segment extraction. In the second stage, we incorporate +query-relevant knowledge in the summary generation. Experimental results on the +QMSum dataset show that our approach achieves state-of-the-art performance. +Further analysis proves the competency of our methods in generating relevant +and faithful summaries. + +
+
+ comment: AACL 2023 Findings +
+
+
+
+
+ + ☆ Bridging Emotion Role Labeling and Appraisal-based Emotion Analysis + + +
+ The term emotion analysis in text subsumes various natural language processing tasks that
+share the goal of enabling computers to understand emotions. Most popular is emotion
+classification, in which one or multiple emotions are assigned to a predefined textual unit.
+While such a setting is appropriate for identifying the reader's or author's emotion, emotion
+role labeling adds the perspective of mentioned entities and extracts text spans that
+correspond to the emotion cause. The underlying emotion theories agree on one important
+point: an emotion is caused by some internal or external event and comprises several
+subcomponents, including the subjective feeling and a cognitive evaluation. We therefore
+argue that emotions and events are related in two ways. (1) Emotions are events; this
+perspective is the foundation of emotion role labeling in NLP. (2) Emotions are caused by
+events; this perspective is made explicit by research on how to incorporate psychological
+appraisal theories into NLP models to interpret events. These two research directions, role
+labeling and (event-focused) emotion classification, have by and large been tackled
+separately. We contributed to both directions with the projects SEAT (Structured
+Multi-Domain Emotion Analysis from Text) and CEAT (Computational Event Evaluation based on
+Appraisal Theories for Emotion Analysis), both funded by the German Research Foundation. In
+this paper, we consolidate the findings and point out open research questions.
+
+
+ comment: under review for https://bigpictureworkshop.com/ +
+
+
+
+
+ + ☆ An Automatic Evaluation Framework for Multi-turn Medical Consultations + Capabilities of Large Language Models + + +
+ Large language models (LLMs) have achieved significant success in interacting with humans.
+However, recent studies have revealed that these models often suffer from hallucinations,
+leading to overly confident but incorrect judgments. This limits their application in the
+medical domain, where tasks require the utmost accuracy. This paper introduces an automated
+evaluation framework that assesses the practical capabilities of LLMs as virtual doctors
+during multi-turn consultations. Consultation tasks are designed to require LLMs to be aware
+of what they do not know, to inquire about missing medical information from patients, and to
+ultimately make diagnoses. To evaluate the performance of LLMs for these tasks, a benchmark
+is proposed by reformulating medical multiple-choice questions from the United States Medical
+Licensing Examinations (USMLE), and comprehensive evaluation metrics are developed and
+evaluated on three constructed test sets. A medical consultation training set is further
+constructed to improve the consultation ability of LLMs. The results of the experiments show
+that fine-tuning with the training set can alleviate hallucinations and improve LLMs'
+performance on the proposed benchmark. Extensive experiments and ablation studies are
+conducted to validate the effectiveness and robustness of the proposed framework.
+
+
+ comment: 10 pages, 9 figures
+
+
+
+
+
+ + ☆ Enhance Multi-domain Sentiment Analysis of Review Texts through + Prompting Strategies + + +
+ Large Language Models (LLMs) have made significant strides in both scientific research and
+practical applications. Existing studies have demonstrated the state-of-the-art (SOTA)
+performance of LLMs in various natural language processing tasks. However, the question of
+how to further enhance LLMs' performance on specific tasks using prompting strategies remains
+a pivotal concern. This paper explores the enhancement of LLMs' performance in sentiment
+analysis through the application of prompting strategies. We formulate the process of
+prompting for sentiment analysis tasks and introduce two novel strategies tailored for
+sentiment analysis: RolePlaying (RP) prompting and Chain-of-thought (CoT) prompting.
+Specifically, we also propose the RP-CoT prompting strategy, which combines RP prompting and
+CoT prompting. We conduct comparative experiments on datasets from three distinct domains to
+evaluate the effectiveness of the proposed sentiment analysis strategies. The results
+demonstrate that the proposed prompting strategies lead to a clear improvement in sentiment
+analysis accuracy. Further, the CoT prompting strategy has a notable impact on implicit
+sentiment analysis, with the RP-CoT prompting strategy delivering the best performance among
+all strategies.
+
+
+
+
+
+ + ☆ Bilevel Scheduled Sampling for Dialogue Generation NLPCC 2023 + + +
+ Exposure bias poses a common challenge in numerous natural language processing tasks,
+particularly in dialogue generation. In response to this issue, researchers have devised
+various techniques, among which scheduled sampling has proven to be an effective method for
+mitigating exposure bias. However, existing state-of-the-art scheduled sampling methods
+consider only the quality of the currently sampled words for threshold-truncation sampling,
+which overlooks sentence-level information; moreover, the threshold-truncation mechanism
+itself warrants further discussion. In this paper, we propose a bilevel scheduled sampling
+model that takes sentence-level information into account and combines it with word-level
+quality. To enhance sampling diversity and improve the model's adaptability, we propose a
+smooth function that maps the combined sentence-level and word-level information to an
+appropriate range, and we employ probabilistic sampling based on the mapped values instead of
+threshold truncation. Experiments conducted on the DailyDialog and PersonaChat datasets
+demonstrate the effectiveness of our proposed methods, which significantly alleviate the
+exposure bias problem and outperform state-of-the-art scheduled sampling methods.
+
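+
+ A rough sketch of the probabilistic-sampling idea described above, i.e. replacing threshold
+truncation with a smooth map over combined word- and sentence-level quality; the equal
+weighting and the sigmoid map are assumptions for illustration, not the authors' exact
+design:
+
+   import torch
+
+   def sampling_probability(word_quality, sent_quality, alpha=0.5, temperature=1.0):
+       # Combine word-level and sentence-level quality, then squash with a smooth map.
+       combined = alpha * word_quality + (1.0 - alpha) * sent_quality
+       return torch.sigmoid(combined / temperature)
+
+   def choose_next_inputs(gold_tokens, model_tokens, word_quality, sent_quality):
+       # Feed the model's own prediction with probability p, the gold token otherwise,
+       # instead of applying a hard threshold cut-off.
+       p = sampling_probability(word_quality, sent_quality)
+       use_model = torch.bernoulli(p).bool()
+       return torch.where(use_model, model_tokens, gold_tokens)
+
+   gold = torch.tensor([11, 12, 13])
+   pred = torch.tensor([11, 99, 13])
+   print(choose_next_inputs(gold, pred,
+                            torch.tensor([2.0, -1.0, 0.5]),
+                            torch.tensor([1.0, 1.0, 1.0])))
+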
+
+ comment: 13 pages, 4 figures, Natural Language Processing and Chinese + Computing(NLPCC 2023) accepted +
+
+
+
+
+ + ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific hardware before
+they can be deployed on devices. This can be done by tuning the model's hyperparameters or
+exploring variations in its architecture. Re-training and re-validating models after making
+these changes can be a resource-intensive task. This paper presents TODM (Train Once Deploy
+Many), a new approach to efficiently training many sizes of hardware-friendly on-device ASR
+models with GPU-hours comparable to those of a single training job. TODM leverages insights
+from prior work on Supernets, where Recurrent Neural Network Transducer (RNN-T) models share
+weights within a Supernet. It reduces the layer sizes and widths of the Supernet to obtain
+subnetworks, making them smaller models suitable for all hardware types. We introduce a novel
+combination of three techniques to improve the outcomes of the TODM Supernet: adaptive
+dropouts, an in-place Alpha-divergence knowledge distillation, and the use of the ScaledAdam
+optimizer. We validate our approach by comparing Supernet-trained versus individually tuned
+Multi-Head State Space Model (MH-SSM) RNN-T using LibriSpeech. Results demonstrate that our
+TODM Supernet either matches or surpasses the performance of manually tuned models, with up
+to 3% relative improvement in word error rate (WER), while keeping the cost of training many
+models at a small constant.
+
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ CodeApex: A Bilingual Programming Evaluation Benchmark for Large + Language Models + + +
+ With the emergence of Large Language Models (LLMs), there has been a +significant improvement in the programming capabilities of models, attracting +growing attention from researchers. We propose CodeApex, a bilingual benchmark +dataset focusing on the programming comprehension and code generation abilities +of LLMs. CodeApex comprises three types of multiple-choice questions: +conceptual understanding, commonsense reasoning, and multi-hop reasoning, +designed to evaluate LLMs on programming comprehension tasks. Additionally, +CodeApex utilizes algorithmic questions and corresponding test cases to assess +the code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs, +including both general-purpose and specialized models. GPT exhibits the best +programming capabilities, achieving approximate accuracies of 50% and 56% on +the two tasks, respectively. There is still significant room for improvement in +programming tasks. We hope that CodeApex can serve as a reference for +evaluating the coding capabilities of LLMs, further promoting their development +and growth. Datasets are released at +\url{https://github.com/APEXLAB/CodeApex.git}. CodeApex submission website is +\url{https://apex.sjtu.edu.cn/codeapex/}. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ QuantEase: Optimization-based Quantization for Language Models -- An + Efficient and Intuitive Algorithm + + +
+ With the rising popularity of Large Language Models (LLMs), there has been an +increasing interest in compression techniques that enable their efficient +deployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs. +Drawing from recent advances, our work introduces QuantEase, a layer-wise +quantization framework where individual layers undergo separate quantization. +The problem is framed as a discrete-structured non-convex optimization, +prompting the development of algorithms rooted in Coordinate Descent (CD) +techniques. These CD-based methods provide high-quality solutions to the +complex non-convex layer-wise quantization problems. Notably, our CD-based +approach features straightforward updates, relying solely on matrix and vector +operations, circumventing the need for matrix inversion or decomposition. We +also explore an outlier-aware variant of our approach, allowing for retaining +significant weights (outliers) with complete precision. Our proposal attains +state-of-the-art performance in terms of perplexity and zero-shot accuracy in +empirical evaluations across various LLMs and datasets, with relative +improvements up to 15% over methods such as GPTQ. Particularly noteworthy is +our outlier-aware algorithm's capability to achieve near or sub-3-bit +quantization of LLMs with an acceptable drop in accuracy, obviating the need +for non-uniform quantization or grouping techniques, improving upon methods +such as SpQR by up to two times in terms of perplexity. + +
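+
+ As a toy illustration of layer-wise quantization by coordinate descent, the snippet below
+quantizes a single weight column by minimizing ||Xw - Xq||^2 over a uniform grid, updating
+one coordinate at a time; it is a simplified sketch of the general idea, not the QuantEase
+algorithm, and the grid size and data are assumptions:
+
+   import numpy as np
+
+   def quantize_column(X, w, grid, n_sweeps=5):
+       # Start from round-to-nearest, then sweep coordinates with cyclic coordinate descent.
+       q = grid[np.argmin(np.abs(grid[None, :] - w[:, None]), axis=1)]
+       col_norms = (X ** 2).sum(axis=0)
+       for _ in range(n_sweeps):
+           for j in range(len(w)):
+               r = X @ (w - q) + X[:, j] * q[j]             # residual with coordinate j removed
+               best = X[:, j] @ r / col_norms[j]            # unconstrained 1-D optimum
+               q[j] = grid[np.argmin(np.abs(grid - best))]  # project back onto the grid
+       return q
+
+   rng = np.random.default_rng(0)
+   X = rng.normal(size=(128, 16))       # calibration activations
+   w = rng.normal(size=16)              # original weights of one output column
+   grid = np.linspace(-1, 1, 16)        # a 4-bit uniform grid (assumed)
+   q = quantize_column(X, w, grid)
+   print(np.linalg.norm(X @ w - X @ q))
+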
+
+
+
+
+ + ☆ On the Planning, Search, and Memorization Capabilities of Large Language + Models + + +
+ The rapid advancement of large language models, such as the Generative +Pre-trained Transformer (GPT) series, has had significant implications across +various disciplines. In this study, we investigate the potential of the +state-of-the-art large language model (GPT-4) for planning tasks. We explore +its effectiveness in multiple planning subfields, highlighting both its +strengths and limitations. Through a comprehensive examination, we identify +areas where large language models excel in solving planning problems and reveal +the constraints that limit their applicability. Our empirical analysis focuses +on GPT-4's performance in planning domain extraction, graph search path +planning, and adversarial planning. We then propose a way of fine-tuning a +domain-specific large language model to improve its Chain of Thought (CoT) +capabilities for the above-mentioned tasks. The results provide valuable +insights into the potential applications of large language models in the +planning domain and pave the way for future research to overcome their +limitations and expand their capabilities. + +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction + Tuning + + +
+ We present CM3Leon (pronounced "Chameleon"), a retrieval-augmented, +token-based, decoder-only multi-modal language model capable of generating and +infilling both text and images. CM3Leon uses the CM3 multi-modal architecture +but additionally shows the extreme benefits of scaling up and tuning on more +diverse instruction-style data. It is the first multi-modal model trained with +a recipe adapted from text-only language models, including a large-scale +retrieval-augmented pre-training stage and a second multi-task supervised +fine-tuning (SFT) stage. It is also a general-purpose model that can do both +text-to-image and image-to-text generation, allowing us to introduce +self-contained contrastive decoding methods that produce high-quality outputs. +Extensive experiments demonstrate that this recipe is highly effective for +multi-modal models. CM3Leon achieves state-of-the-art performance in +text-to-image generation with 5x less training compute than comparable methods +(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate +unprecedented levels of controllability in tasks ranging from language-guided +image editing to image-controlled generation and segmentation. + +
+
+
+
+
+ + ☆ Automating Behavioral Testing in Machine Translation + + +
+ Behavioral testing in NLP allows fine-grained evaluation of systems by +examining their linguistic capabilities through the analysis of input-output +behavior. Unfortunately, existing work on behavioral testing in Machine +Translation (MT) is currently restricted to largely handcrafted tests covering +a limited range of capabilities and languages. To address this limitation, we +propose to use Large Language Models (LLMs) to generate a diverse set of source +sentences tailored to test the behavior of MT models in a range of situations. +We can then verify whether the MT model exhibits the expected behavior through +matching candidate sets that are also generated using LLMs. Our approach aims +to make behavioral testing of MT systems practical while requiring only minimal +human effort. In our experiments, we apply our proposed evaluation framework to +assess multiple available MT systems, revealing that while in general +pass-rates follow the trends observable from traditional accuracy-based +metrics, our method was able to uncover several important differences and +potential bugs that go unnoticed when relying only on accuracy. + +
+
+
+
+
+ + ♻ ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ♻ ☆ InterviewBot: Real-Time End-to-End Dialogue System to Interview Students + for College Admission + + +
+ We present InterviewBot, which dynamically integrates conversation history and customized
+topics into a coherent embedding space to conduct 10-minute hybrid-domain (open and closed)
+conversations with foreign students applying to U.S. colleges, assessing their academic and
+cultural readiness. To build a neural-based end-to-end dialogue model, 7,361 audio recordings
+of human-to-human interviews are automatically transcribed, of which 440 are manually
+corrected for fine-tuning and evaluation. To overcome the input/output size limit of a
+transformer-based encoder-decoder model, two new methods are proposed, context attention and
+topic storing, allowing the model to make relevant and consistent interactions. Our final
+model is tested both statistically, by comparing its responses to the interview data, and
+dynamically, by inviting professional interviewers and various students to interact with it
+in real time, finding it highly satisfactory in fluency and context awareness.
+
+
+
+
+
+ + ♻ ☆ LM-Infinite: Simple On-the-Fly Length Generalization for Large Language + Models + + +
+ In recent years, there have been remarkable advancements in the performance of
+Transformer-based Large Language Models (LLMs) across various domains. As these LLMs are
+deployed for increasingly complex tasks, they often face the need to conduct longer reasoning
+processes or understand larger contexts. In these situations, the length generalization
+failure of LLMs on long sequences becomes more prominent. Most pre-training schemes truncate
+training sequences to a fixed length. Beyond these lengths, LLMs often struggle to generate
+fluent and coherent text, let alone carry out downstream tasks, even with relative positional
+encodings designed to cope with this problem. Common solutions such as fine-tuning on longer
+corpora often involve daunting hardware and time costs and require careful training process
+design. To more efficiently leverage the generation capacity of existing LLMs, we
+theoretically and empirically investigate the main out-of-distribution (OOD) factors
+contributing to this problem. Inspired by this diagnosis, we propose a simple yet effective
+solution for on-the-fly length generalization, LM-Infinite. It involves only a
+$\Lambda$-shaped attention mask (to avoid excessive attended tokens) and a distance limit (to
+avoid unseen distances), while requiring no parameter updates or learning. We find it
+applicable to a variety of LLMs using relative-position encoding methods. LM-Infinite is
+computationally efficient with $O(n)$ time and space, and demonstrates consistent text
+generation fluency and quality for sequences as long as 32k tokens on the ArXiv and
+OpenWebText2 datasets, with a 2.72x decoding speedup. On downstream tasks such as passkey
+retrieval, it continues to work on inputs much longer than training lengths, where vanilla
+models fail immediately.
+
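+
+ A small sketch of what a $\Lambda$-shaped attention mask with a distance limit could look
+like; the window sizes and function names here are illustrative assumptions, not the paper's
+exact configuration:
+
+   import numpy as np
+
+   def lambda_shaped_mask(seq_len, n_global=64, n_local=1024):
+       # Each query attends causally to the first n_global tokens plus a sliding
+       # window of the most recent n_local tokens.
+       q = np.arange(seq_len)[:, None]
+       k = np.arange(seq_len)[None, :]
+       causal = k <= q
+       keep = (k < n_global) | (q - k < n_local)
+       return causal & keep
+
+   mask = lambda_shaped_mask(4096)
+   print(mask.sum(axis=1)[[0, 2047, 4095]])  # attended tokens per query position
+   # A distance limit would additionally clamp relative distances at n_local when
+   # computing relative positional encodings, so no unseen distance is ever used.
+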
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ AMR4NLI: Interpretable and robust NLI measures from semantic graphs + + +
+ The task of natural language inference (NLI) asks whether a given premise +(expressed in NL) entails a given NL hypothesis. NLI benchmarks contain human +ratings of entailment, but the meaning relationships driving these ratings are +not formalized. Can the underlying sentence pair relationships be made more +explicit in an interpretable yet robust fashion? We compare semantic structures +to represent premise and hypothesis, including sets of contextualized +embeddings and semantic graphs (Abstract Meaning Representations), and measure +whether the hypothesis is a semantic substructure of the premise, utilizing +interpretable metrics. Our evaluation on three English benchmarks finds value +in both contextualized embeddings and semantic graphs; moreover, they provide +complementary signals, and can be leveraged together in a hybrid model. + +
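+
+ The substructure idea can be illustrated with plain triple sets, a deliberately simplified
+stand-in for the graph-based metrics used in the paper:
+
+   def containment_score(premise_triples, hypothesis_triples):
+       # Fraction of the hypothesis structure that is also present in the premise.
+       premise, hypothesis = set(premise_triples), set(hypothesis_triples)
+       return len(hypothesis & premise) / max(len(hypothesis), 1)
+
+   premise = {("dog", "ARG0-of", "run"), ("run", "location", "park")}
+   hypothesis = {("dog", "ARG0-of", "run")}
+   print(containment_score(premise, hypothesis))  # 1.0: hypothesis fully supported
+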
+
+ comment: International Conference on Computational Semantics (IWCS 2023); v2 + fixes an imprecise sentence below Eq. 5 +
+
+
+
+
+ + ♻ ☆ Identifying depression-related topics in smartphone-collected + free-response speech recordings using an automatic speech recognition system + and a deep learning topic model + + +
+ Language use has been shown to correlate with depression, but large-scale validation is
+needed. Traditional methods such as clinical studies are expensive. Natural language
+processing has therefore been applied to social media to predict depression, but limitations
+remain: a lack of validated labels, biased user samples, and missing context. Our study
+identified 29 topics in 3,919 smartphone-collected speech recordings from 265 participants
+using the Whisper tool and the BERTopic model. Six topics with a median PHQ-8 greater than or
+equal to 10 were regarded as risk topics for depression: No Expectations, Sleep, Mental
+Therapy, Haircut, Studying, and Coursework. To elucidate the topic emergence and associations
+with depression, we compared behavioral (from wearables) and linguistic characteristics
+across the identified topics. The correlation between topic shifts and changes in depression
+severity over time was also investigated, indicating the importance of longitudinally
+monitoring language use. We also tested the BERTopic model on a similar smaller dataset (356
+speech recordings from 57 participants), obtaining some consistent results. In summary, our
+findings demonstrate that specific speech topics may indicate depression severity. The
+presented data-driven workflow provides a practical approach to collecting and analyzing
+large-scale speech data from real-world settings for digital health research.
+
+
+
+
+
+ + ♻ ☆ Link Prediction for Wikipedia Articles as a Natural Language Inference + Task + + +
+ The link prediction task is vital to automatically understanding the structure of large
+knowledge bases. In this paper, we present our system for this task at the Data Science and
+Advanced Analytics 2023 Competition "Efficient and Effective Link Prediction" (DSAA-2023
+Competition), with a corpus containing 948,233 training pairs and 238,265 pairs for public
+testing. This paper introduces an approach to link prediction in Wikipedia articles by
+formulating it as a natural language inference (NLI) task. Drawing inspiration from recent
+advancements in natural language processing and understanding, we cast link prediction as an
+NLI task, wherein the presence of a link between two articles is treated as a premise, and
+the task is to determine whether this premise holds based on the information presented in the
+articles. We implemented our system based on sentence-pair classification for the Link
+Prediction for Wikipedia Articles task. Our system achieved a 0.99996 Macro F1-score and a
+1.00000 Macro F1-score on the public and private test sets, respectively. Our team UIT-NLP
+ranked 3rd in performance on the private test set, equal to the scores of the first and
+second places. Our code is publicly available for research purposes.
+
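+
+ A minimal sketch of the sentence-pair formulation, where two article texts are scored as a
+premise/hypothesis pair; the checkpoint name, input texts, and label order are assumptions,
+and the actual system is fine-tuned on the competition data:
+
+   import torch
+   from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+   tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+   model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+
+   premise = "Article A: lead paragraph of the source Wikipedia article ..."
+   hypothesis = "Article B: lead paragraph of the candidate target article ..."
+   inputs = tok(premise, hypothesis, truncation=True, return_tensors="pt")
+   with torch.no_grad():
+       logits = model(**inputs).logits
+   print(torch.softmax(logits, dim=-1))  # read as P(no link), P(link) after fine-tuning
+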
+
+ comment: Accepted at the 10th IEEE International Conference On Data Science + And Advanced Analytics (DSAA 2023) +
+
+
+
+
+ + ♻ ☆ Reasoning before Responding: Integrating Commonsense-based Causality + Explanation for Empathetic Response Generation SIGDIAL 2023 + + +
+ Recent approaches to empathetic response generation try to incorporate +commonsense knowledge or reasoning about the causes of emotions to better +understand the user's experiences and feelings. However, these approaches +mainly focus on understanding the causalities of context from the user's +perspective, ignoring the system's perspective. In this paper, we propose a +commonsense-based causality explanation approach for diverse empathetic +response generation that considers both the user's perspective (user's desires +and reactions) and the system's perspective (system's intentions and +reactions). We enhance ChatGPT's ability to reason for the system's perspective +by integrating in-context learning with commonsense knowledge. Then, we +integrate the commonsense-based causality explanation with both ChatGPT and a +T5-based model. Experimental evaluations demonstrate that our method +outperforms other comparable methods on both automatic and human evaluations. + +
+
+ comment: Accepted by the 24th Meeting of the Special Interest Group on + Discourse and Dialogue (SIGDIAL 2023) +
+
+
+
+
+ + ♻ ☆ Towards Generalist Foundation Model for Radiology + + +
+ In this study, we aim to initiate the development of the Radiology Foundation Model, termed
+RadFM. We consider the construction of foundational models from the perspectives of dataset
+construction, model design, and thorough evaluation. Our contributions can be summarized as
+follows: (i) we construct a large-scale Medical Multi-modal Dataset, MedMD, which consists of
+16M 2D and 3D medical scans with high-quality text descriptions or reports across various
+data formats, modalities, and tasks, covering over 5,000 distinct diseases. To the best of
+our knowledge, this is the first large-scale, high-quality, medical visual-language dataset
+with both 2D and 3D scans; (ii) we propose an architecture that enables visually conditioned
+generative pre-training, i.e., allowing for the integration of text input with 2D or 3D
+medical scans, and generates responses for diverse radiologic tasks. The model was initially
+pre-trained on MedMD and subsequently fine-tuned on a domain-specific dataset, a
+radiologically cleaned version of MedMD containing 3M radiologic visual-language pairs,
+termed RadMD; (iii) we propose a new evaluation benchmark, RadBench, that comprises five
+tasks, including modality recognition, disease diagnosis, visual question answering, report
+generation, and rationale diagnosis, aiming to comprehensively assess the capability of
+foundation models in handling practical clinical problems. We conduct both automatic and
+human evaluation on RadBench; in both cases, RadFM significantly outperforms existing
+multi-modal foundation models. The codes, data, and model checkpoint will all be made
+publicly available to promote further research and development in the field.
+
+
+
+
+
+ + ♻ ☆ LLaMA-Reviewer: Advancing Code Review Automation with Large Language + Models through Parameter-Efficient Fine-Tuning + + +
+ The automation of code review activities, a long-standing pursuit in software +engineering, has been primarily addressed by numerous domain-specific +pre-trained models. Despite their success, these models frequently demand +extensive resources for pre-training from scratch. In contrast, Large Language +Models (LLMs) provide an intriguing alternative, given their remarkable +capabilities when supplemented with domain-specific knowledge. However, their +potential for automating code review tasks remains largely unexplored. + In response to this research gap, we present LLaMA-Reviewer, an innovative +framework that leverages the capabilities of LLaMA, a popular LLM, in the realm +of code review. Mindful of resource constraints, this framework employs +parameter-efficient fine-tuning (PEFT) methods, delivering high performance +while using less than 1% of trainable parameters. + An extensive evaluation of LLaMA-Reviewer is conducted on two diverse, +publicly available datasets. Notably, even with the smallest LLaMA base model +consisting of 6.7B parameters and a limited number of tuning epochs, +LLaMA-Reviewer equals the performance of existing code-review-focused models. + The ablation experiments provide insights into the influence of various +fine-tuning process components, including input representation, instruction +tuning, and different PEFT methods. To foster continuous progress in this +field, the code and all PEFT-weight plugins have been made open-source. + +
+
+ comment: Accepted to the 34th IEEE International Symposium on Software + Reliability Engineering (ISSRE 2023) +
+
+
+
+
+ + ♻ ☆ Is the U.S. Legal System Ready for AI's Challenges to Human Values? + + +
+ Our interdisciplinary study investigates how effectively U.S. laws confront +the challenges posed by Generative AI to human values. Through an analysis of +diverse hypothetical scenarios crafted during an expert workshop, we have +identified notable gaps and uncertainties within the existing legal framework +regarding the protection of fundamental values, such as privacy, autonomy, +dignity, diversity, equity, and physical/mental well-being. Constitutional and +civil rights, it appears, may not provide sufficient protection against +AI-generated discriminatory outputs. Furthermore, even if we exclude the +liability shield provided by Section 230, proving causation for defamation and +product liability claims is a challenging endeavor due to the intricate and +opaque nature of AI systems. To address the unique and unforeseeable threats +posed by Generative AI, we advocate for legal frameworks that evolve to +recognize new threats and provide proactive, auditable guidelines to industry +stakeholders. Addressing these issues requires deep interdisciplinary +collaborations to identify harms, values, and mitigation strategies. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ BatchPrompt: Accomplish more with less + + +
+ As the ever-increasing token limits of large language models (LLMs) have enabled long
+context as input, prompting with a single data sample per call may no longer be efficient. A
+straightforward strategy for improving efficiency is to batch data within the token limit
+(e.g., 8k for gpt-3.5-turbo; 32k for GPT-4), which we call BatchPrompt. We have two initial
+observations for prompting with batched data. First, we find that prompting with batched data
+in longer contexts will inevitably lead to worse performance compared to single-data
+prompting. Second, the performance of the language model is significantly correlated with the
+positions and order of the batched data, due to the corresponding change in decoder context.
+To retain efficiency and overcome the performance loss, we propose Batch Permutation and
+Ensembling (BPE) and a novel Self-reflection-guided EArly Stopping (SEAS) technique. Our
+comprehensive experimental evaluation demonstrates that BPE can boost the performance of
+BatchPrompt by a striking margin on a range of popular NLP tasks, including question
+answering (BoolQ), textual entailment (RTE), and duplicate question identification (QQP).
+These results are even competitive with, or higher than, single-data prompting
+(SinglePrompt), while BatchPrompt requires far fewer LLM calls and input tokens (for
+SinglePrompt vs. BatchPrompt with batch size 32, using just 9%-16% of the number of LLM
+calls: BoolQ accuracy 90.6% to 90.9% with 27.4% of the tokens, QQP accuracy 87.2% to 88.4%
+with 18.6% of the tokens, RTE accuracy 91.5% to 91.1% with 30.8% of the tokens). To the best
+of our knowledge, this is the first work to technically improve the prompting efficiency of
+large language models. We hope our simple yet effective approach will shed light on future
+research on large language models. The code will be released.
+
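+
+ A schematic sketch of batch prompting with permutation-based ensembling (a BPE-style
+majority vote); `call_llm` is a placeholder for whatever completion API is used and, like the
+prompt wording, is an assumption rather than part of the paper's code:
+
+   import random
+   from collections import Counter
+
+   def build_batched_prompt(items):
+       lines = ["Answer each question with yes or no."]
+       lines += [f"{i + 1}. {q}" for i, q in enumerate(items)]
+       return "\n".join(lines)
+
+   def batch_prompt_with_permutations(items, call_llm, n_perms=3):
+       votes = {q: [] for q in items}
+       for _ in range(n_perms):
+           order = random.sample(items, len(items))         # permute the batch order
+           answers = call_llm(build_batched_prompt(order))  # assumed to return aligned answers
+           for q, a in zip(order, answers):
+               votes[q].append(a)
+       # Majority vote per item across permutations.
+       return {q: Counter(v).most_common(1)[0][0] for q, v in votes.items()}
+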
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Learning to Select from Multiple Options AAAI 2023 + + +
+ Many NLP tasks can be regarded as a selection problem from a set of options, +such as classification tasks, multi-choice question answering, etc. Textual +entailment (TE) has been shown as the state-of-the-art (SOTA) approach to +dealing with those selection problems. TE treats input texts as premises (P), +options as hypotheses (H), then handles the selection problem by modeling (P, +H) pairwise. Two limitations: first, the pairwise modeling is unaware of other +options, which is less intuitive since humans often determine the best options +by comparing competing candidates; second, the inference process of pairwise TE +is time-consuming, especially when the option space is large. To deal with the +two issues, this work first proposes a contextualized TE model (Context-TE) by +appending other k options as the context of the current (P, H) modeling. +Context-TE is able to learn more reliable decision for the H since it considers +various context. Second, we speed up Context-TE by coming up with Parallel-TE, +which learns the decisions of multiple options simultaneously. Parallel-TE +significantly improves the inference speed while keeping comparable performance +with Context-TE. Our methods are evaluated on three tasks (ultra-fine entity +typing, intent detection and multi-choice QA) that are typical selection +problems with different sizes of options. Experiments show our models set new +SOTA performance; particularly, Parallel-TE is faster than the pairwise TE by k +times in inference. Our code is publicly available at +https://github.com/jiangshdd/LearningToSelect. + +
+
+ comment: Accepted by AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Large-scale Language Model Rescoring on Long-form Data ICASSP 2023 + + +
+ In this work, we study the impact of Large-scale Language Models (LLM) on Automated Speech
+Recognition (ASR) of YouTube videos, which we use as a source for long-form ASR. We
+demonstrate up to 8\% relative reduction in Word Error Rate (WER) on US English (en-us) and
+code-switched Indian English (en-in) long-form ASR test sets, and a reduction of up to 30\%
+relative in Salient Term Error Rate (STER), over a strong first-pass baseline that uses a
+maximum-entropy based language model. Improved lattice processing that results in a lattice
+with a proper (non-tree) digraph topology and carrying context from the 1-best hypothesis of
+the previous segment(s) results in significant wins in rescoring with LLMs. We also find that
+the gains in performance from the combination of LLMs trained on vast quantities of available
+data (such as C4) and conventional neural LMs are additive and significantly outperform a
+strong first-pass baseline with a maximum entropy LM.
+ Copyright 2023 IEEE. Personal use of this material is permitted. Permission from IEEE must
+be obtained for all other uses, in any current or future media, including
+reprinting/republishing this material for advertising or promotional purposes, creating new
+collective works, for resale or redistribution to servers or lists, or reuse of any
+copyrighted component of this work in other works.
+
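+
+ The paper rescores full lattices; as a much-simplified illustration, second-pass rescoring
+of an n-best list interpolates the first-pass score with an LLM log-probability (the weights,
+names, and toy scores below are assumptions):
+
+   def rescore_nbest(nbest, llm_logprob, lam=0.5):
+       # nbest: list of (hypothesis_text, first_pass_score); llm_logprob: text -> log p(text).
+       return max(nbest, key=lambda h: (1.0 - lam) * h[1] + lam * llm_logprob(h[0]))
+
+   nbest = [("i scream for ice cream", -12.3), ("eye scream for ice cream", -12.1)]
+   toy_lm = {"i scream for ice cream": -20.0, "eye scream for ice cream": -35.0}
+   print(rescore_nbest(nbest, toy_lm.get))  # the LLM score flips the first-pass ranking
+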
+
+ comment: 5 pages, accepted in ICASSP 2023 +
+
+
+
+
+ + ♻ ☆ ChatGPT is on the Horizon: Could a Large Language Model be Suitable for + Intelligent Traffic Safety Research and Applications? + + +
+ ChatGPT marks the beginning of a new era of artificial intelligence and will revolutionize
+the way we approach intelligent traffic safety systems. This paper begins with a brief
+introduction to the development of large language models (LLMs). Next, we exemplify using
+ChatGPT to address key traffic safety issues. Furthermore, we discuss the controversies
+surrounding LLMs, raise critical questions for their deployment, and provide our solutions.
+Moreover, we propose an idea of multi-modality representation learning for smarter traffic
+safety decision-making and open more questions for application improvement. We believe that
+LLMs will both shape and potentially facilitate components of traffic safety research.
+
+
+ comment: Submitted to Nature - Machine Intelligence (Revised and Extended) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 123 + +
+
+
+ + ☆ GO-SLAM: Global Optimization for Consistent 3D Instant Reconstruction ICCV 2023 + + +
+ Neural implicit representations have recently demonstrated compelling results +on dense Simultaneous Localization And Mapping (SLAM) but suffer from the +accumulation of errors in camera tracking and distortion in the reconstruction. +Purposely, we present GO-SLAM, a deep-learning-based dense visual SLAM +framework globally optimizing poses and 3D reconstruction in real-time. Robust +pose estimation is at its core, supported by efficient loop closing and online +full bundle adjustment, which optimize per frame by utilizing the learned +global geometry of the complete history of input frames. Simultaneously, we +update the implicit and continuous surface representation on-the-fly to ensure +global consistency of 3D reconstruction. Results on various synthetic and +real-world datasets demonstrate that GO-SLAM outperforms state-of-the-art +approaches at tracking robustness and reconstruction accuracy. Furthermore, +GO-SLAM is versatile and can run with monocular, stereo, and RGB-D input. + +
+
+ comment: ICCV 2023. Code: https://github.com/youmi-zym/GO-SLAM - Project Page: + https://youmi-zym.github.io/projects/GO-SLAM/ +
+
+
+
+
+ + ☆ Efficient RL via Disentangled Environment and Agent Representations ICML 2023 + + +
+ Agents that are aware of the separation between themselves and their +environments can leverage this understanding to form effective representations +of visual input. We propose an approach for learning such structured +representations for RL algorithms, using visual knowledge of the agent, such as +its shape or mask, which is often inexpensive to obtain. This is incorporated +into the RL objective using a simple auxiliary loss. We show that our method, +Structured Environment-Agent Representations, outperforms state-of-the-art +model-free approaches over 18 different challenging visual simulation +environments spanning 5 different robots. Website at https://sear-rl.github.io/ + +
+
+ comment: ICML 2023. Website at https://sear-rl.github.io/ +
+
+
+
+
+ + ☆ ReliTalk: Relightable Talking Portrait Generation from a Single Video + + +
+ Recent years have witnessed great progress in creating vivid audio-driven +portraits from monocular videos. However, how to seamlessly adapt the created +video avatars to other scenarios with different backgrounds and lighting +conditions remains unsolved. On the other hand, existing relighting studies +mostly rely on dynamically lighted or multi-view data, which are too expensive +for creating video portraits. To bridge this gap, we propose ReliTalk, a novel +framework for relightable audio-driven talking portrait generation from +monocular videos. Our key insight is to decompose the portrait's reflectance +from implicitly learned audio-driven facial normals and images. Specifically, +we involve 3D facial priors derived from audio features to predict delicate +normal maps through implicit functions. These initially predicted normals then +take a crucial part in reflectance decomposition by dynamically estimating the +lighting condition of the given video. Moreover, the stereoscopic face +representation is refined using the identity-consistent loss under simulated +multiple lighting conditions, addressing the ill-posed problem caused by +limited views available from a single monocular video. Extensive experiments +validate the superiority of our proposed framework on both real and synthetic +datasets. Our code is released in https://github.com/arthur-qiu/ReliTalk. + +
+
+
+
+
+ + ☆ Building a Winning Team: Selecting Source Model Ensembles using a + Submodular Transferability Estimation Approach ICCV 2023 + + +
+ Estimating the transferability of publicly available pretrained models to a +target task has assumed an important place for transfer learning tasks in +recent years. Existing efforts propose metrics that allow a user to choose one +model from a pool of pre-trained models without having to fine-tune each model +individually and identify one explicitly. With the growth in the number of +available pre-trained models and the popularity of model ensembles, it also +becomes essential to study the transferability of multiple-source models for a +given target task. The few existing efforts study transferability in such +multi-source ensemble settings using just the outputs of the classification +layer and neglect possible domain or task mismatch. Moreover, they overlook the +most important factor while selecting the source models, viz., the cohesiveness +factor between them, which can impact the performance and confidence in the +prediction of the ensemble. To address these gaps, we propose a novel Optimal +tranSport-based suBmOdular tRaNsferability metric (OSBORN) to estimate the +transferability of an ensemble of models to a downstream task. OSBORN +collectively accounts for image domain difference, task difference, and +cohesiveness of models in the ensemble to provide reliable estimates of +transferability. We gauge the performance of OSBORN on both image +classification and semantic segmentation tasks. Our setup includes 28 source +datasets, 11 target datasets, 5 model architectures, and 2 pre-training +methods. We benchmark our method against current state-of-the-art metrics +MS-LEEP and E-LEEP, and outperform them consistently using the proposed +approach. + +
+
+ comment: To appear at ICCV 2023 +
+
+
+
+
+ + ☆ EgoPCA: A New Framework for Egocentric Hand-Object Interaction + Understanding ICCV 2023 + + +
+ With the surge in attention to Egocentric Hand-Object Interaction (Ego-HOI), +large-scale datasets such as Ego4D and EPIC-KITCHENS have been proposed. +However, most current research is built on resources derived from third-person +video action recognition. This inherent domain gap between first- and +third-person action videos, which have not been adequately addressed before, +makes current Ego-HOI suboptimal. This paper rethinks and proposes a new +framework as an infrastructure to advance Ego-HOI recognition by Probing, +Curation and Adaption (EgoPCA). We contribute comprehensive pre-train sets, +balanced test sets and a new baseline, which are complete with a +training-finetuning strategy. With our new framework, we not only achieve +state-of-the-art performance on Ego-HOI benchmarks but also build several new +and effective mechanisms and settings to advance further research. We believe +our data and the findings will pave a new way for Ego-HOI understanding. Code +and data are available at https://mvig-rhos.com/ego_pca + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Doppelgangers: Learning to Disambiguate Images of Similar Structures ICCV 2023 + + +
+ We consider the visual disambiguation task of determining whether a pair of +visually similar images depict the same or distinct 3D surfaces (e.g., the same +or opposite sides of a symmetric building). Illusory image matches, where two +images observe distinct but visually similar 3D surfaces, can be challenging +for humans to differentiate, and can also lead 3D reconstruction algorithms to +produce erroneous results. We propose a learning-based approach to visual +disambiguation, formulating it as a binary classification task on image pairs. +To that end, we introduce a new dataset for this problem, Doppelgangers, which +includes image pairs of similar structures with ground truth labels. We also +design a network architecture that takes the spatial distribution of local +keypoints and matches as input, allowing for better reasoning about both local +and global cues. Our evaluation shows that our method can distinguish illusory +matches in difficult cases, and can be integrated into SfM pipelines to produce +correct, disambiguated 3D reconstructions. See our project page for our code, +datasets, and more results: http://doppelgangers-3d.github.io/. + +
+
+ comment: Published in ICCV 2023 (Oral); Project page: + http://doppelgangers-3d.github.io/ +
+
+
+
+
+ + ☆ Generating Realistic Images from In-the-wild Sounds ICCV 2023 + + +
+ Representing wild sounds as images is an important but challenging task due +to the lack of paired datasets between sound and images and the significant +differences in the characteristics of these two modalities. Previous studies +have focused on generating images from sound in limited categories or music. In +this paper, we propose a novel approach to generate images from in-the-wild +sounds. First, we convert sound into text using audio captioning. Second, we +propose audio attention and sentence attention to represent the rich +characteristics of sound and visualize the sound. Lastly, we propose a direct +sound optimization with CLIPscore and AudioCLIP and generate images with a +diffusion-based model. In experiments, it shows that our model is able to +generate high quality images from wild sounds and outperforms baselines in both +quantitative and qualitative evaluations on wild audio datasets. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Voice Morphing: Two Identities in One Voice + + +
+ In a biometric system, each biometric sample or template is typically +associated with a single identity. However, recent research has demonstrated +the possibility of generating "morph" biometric samples that can successfully +match more than a single identity. Morph attacks are now recognized as a +potential security threat to biometric systems. However, most morph attacks +have been studied on biometric modalities operating in the image domain, such +as face, fingerprint, and iris. In this preliminary work, we introduce Voice +Identity Morphing (VIM) - a voice-based morph attack that can synthesize speech +samples that impersonate the voice characteristics of a pair of individuals. +Our experiments evaluate the vulnerabilities of two popular speaker recognition +systems, ECAPA-TDNN and x-vector, to VIM, with a success rate (MMPMR) of over +80% at a false match rate of 1% on the Librispeech dataset. + +
+
+ comment: Accepted oral paper at BIOSIG 2023 +
+
+
+
+
+ + ☆ Prototype-based Dataset Comparison ICCV 2023 + + +
+ Dataset summarisation is a fruitful approach to dataset inspection. However, +when applied to a single dataset the discovery of visual concepts is restricted +to those most prominent. We argue that a comparative approach can expand upon +this paradigm to enable richer forms of dataset inspection that go beyond the +most prominent concepts. To enable dataset comparison we present a module that +learns concept-level prototypes across datasets. We leverage self-supervised +learning to discover these prototypes without supervision, and we demonstrate +the benefits of our approach in two case-studies. Our findings show that +dataset comparison extends dataset inspection and we hope to encourage more +works in this direction. Code and usage instructions available at +https://github.com/Nanne/ProtoSim + +
+
+ comment: To be presented at ICCV 2023 +
+
+
+
+
+ + ☆ STEP -- Towards Structured Scene-Text Spotting + + +
+ We introduce the structured scene-text spotting task, which requires a +scene-text OCR system to spot text in the wild according to a query regular +expression. Contrary to generic scene text OCR, structured scene-text spotting +seeks to dynamically condition both scene text detection and recognition on +user-provided regular expressions. To tackle this task, we propose the +Structured TExt sPotter (STEP), a model that exploits the provided text +structure to guide the OCR process. STEP is able to deal with regular +expressions that contain spaces and it is not bound to detection at the +word-level granularity. Our approach enables accurate zero-shot structured text +spotting in a wide variety of real-world reading scenarios and is solely +trained on publicly available data. To demonstrate the effectiveness of our +approach, we introduce a new challenging test dataset that contains several +types of out-of-vocabulary structured text, reflecting important reading +applications of fields such as prices, dates, serial numbers, license plates +etc. We demonstrate that STEP can provide specialised OCR performance on demand +in all tested scenarios. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ☆ Generating Infinite-Resolution Texture using GANs with Patch-by-Patch + Paradigm + + +
+ In this paper, we introduce a novel approach for generating texture images of infinite
+resolution using Generative Adversarial Networks (GANs) based on a patch-by-patch paradigm.
+Existing texture synthesis techniques often rely on generating a large-scale texture in a
+single forward pass through the generating model, which limits the scalability and
+flexibility of the generated images. In contrast, the proposed approach trains GAN models on
+a single texture image to generate relatively small patches that are locally correlated and
+can be seamlessly concatenated to form a larger image, while using a constant GPU memory
+footprint. Our method learns the local texture structure and is able to generate
+arbitrary-size textures while maintaining coherence and diversity. The proposed method relies
+on local padding in the generator to ensure consistency between patches and utilizes spatial
+stochastic modulation to allow for local variations and diversity within the large-scale
+image. Experimental results demonstrate superior scalability compared to existing approaches
+while maintaining the visual coherence of the generated textures.
+
+
+
+
+
+ + ☆ DEEPBEAS3D: Deep Learning and B-Spline Explicit Active Surfaces + + +
+ Deep learning-based automatic segmentation methods have become state-of-the-art. However,
+they are often not robust enough for direct clinical application, as domain shifts between
+training and testing data affect their performance. Failure in automatic segmentation can
+cause sub-optimal results that require correction. To address these problems, we propose a
+novel 3D extension of an interactive segmentation framework that represents a segmentation
+from a convolutional neural network (CNN) as a B-spline explicit active surface (BEAS). BEAS
+ensures segmentations are smooth in 3D space, increasing anatomical plausibility, while
+allowing the user to precisely edit the 3D surface. We apply this framework to the task of 3D
+segmentation of the anal sphincter complex (AS) from transperineal ultrasound (TPUS) images,
+and compare it to the clinical tool used in the pelvic floor disorder clinic (4D View VOCAL,
+GE Healthcare; Zipf, Austria). Experimental results show that: 1) the proposed framework
+gives the user explicit control of the surface contour; 2) the perceived workload calculated
+via the NASA-TLX index was reduced by 30% compared to VOCAL; and 3) it required 70% (170
+seconds) less user time than VOCAL (p < 0.00001).
+
+
+ comment: 4 pages, 3 figures, 1 table, conference +
+
+
+
+
+ + ☆ TiAVox: Time-aware Attenuation Voxels for Sparse-view 4D DSA + Reconstruction + + +
+ Four-dimensional Digital Subtraction Angiography (4D DSA) plays a critical role in the
+diagnosis of many medical diseases, such as Arteriovenous Malformations (AVM) and
+Arteriovenous Fistulas (AVF). Despite its significant application value, the reconstruction
+of 4D DSA demands numerous views to effectively model the intricate vessels and radiocontrast
+flow, thereby implying a significant radiation dose. To address this high radiation issue, we
+propose a Time-aware Attenuation Voxel (TiAVox) approach for sparse-view 4D DSA
+reconstruction, which paves the way for high-quality 4D imaging. Additionally, 2D and 3D DSA
+imaging results can be generated from the reconstructed 4D DSA images. TiAVox introduces 4D
+attenuation voxel grids, which reflect attenuation properties from both spatial and temporal
+dimensions. It is optimized by minimizing discrepancies between the rendered images and
+sparse 2D DSA images. Without any neural network involved, TiAVox enjoys specific physical
+interpretability. The parameters of each learnable voxel represent the attenuation
+coefficients. We validated the TiAVox approach on both clinical and simulated datasets,
+achieving a 31.23 Peak Signal-to-Noise Ratio (PSNR) for novel view synthesis using only 30
+views on the clinically sourced dataset, whereas traditional Feldkamp-Davis-Kress methods
+required 133 views. Similarly, with merely 10 views from the synthetic dataset, TiAVox
+yielded a PSNR of 34.32 for novel view synthesis and 41.40 for 3D reconstruction. We also
+executed ablation studies to corroborate the essential components of TiAVox. The code will be
+publicly available.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ CIEM: Contrastive Instruction Evaluation Method for Better Instruction + Tuning + + +
+ Nowadays, the research on Large Vision-Language Models (LVLMs) has been
+significantly promoted thanks to the success of Large Language Models (LLMs).
+Nevertheless, these Vision-Language Models (VLMs) suffer from the drawback of
+hallucination -- due to insufficient understanding of vision and language
+modalities, VLMs may generate incorrect perception information when doing
+downstream applications, for example, captioning a non-existent entity. To
+address the hallucination phenomenon, on the one hand, we introduce a
+Contrastive Instruction Evaluation Method (CIEM), which is an automatic
+pipeline that leverages an annotated image-text dataset coupled with an LLM to
+generate factual/contrastive question-answer pairs for the evaluation of the
+hallucination of VLMs. On the other hand, based on CIEM, we further propose a
+new instruction tuning method called CIT (the abbreviation of Contrastive
+Instruction Tuning) to alleviate the hallucination of VLMs by automatically
+producing high-quality factual/contrastive question-answer pairs and
+corresponding justifications for model tuning. Through extensive experiments on
+CIEM and CIT, we pinpoint the hallucination issues commonly present in existing
+VLMs, the inability of the current instruction-tuning dataset to handle the
+hallucination phenomenon, and the superiority of CIT-tuned VLMs over both CIEM
+and public datasets.
+
+
+
+
+
+
+ + ☆ ATM: Action Temporality Modeling for Video Question Answering + + +
+ Despite significant progress in video question answering (VideoQA), existing
+methods fall short on questions that require causal/temporal reasoning across
+frames. This can be attributed to imprecise motion representations. We
+introduce Action Temporality Modeling (ATM) for temporality reasoning via
+three-fold uniqueness: (1) rethinking optical flow and realizing that it is
+effective in capturing long-horizon temporality; (2) training the visual-text
+embedding by contrastive learning in an action-centric manner, leading to
+better action representations in both vision and text modalities; and (3)
+preventing the model from answering the question given the shuffled video in
+the fine-tuning stage, to avoid spurious correlation between appearance and
+motion and hence ensure faithful temporality reasoning. In the experiments, we
+show that ATM outperforms previous approaches in terms of accuracy on multiple
+VideoQA benchmarks and exhibits better true temporality reasoning ability.
+
+
+
+
+
+
+ + ☆ Haystack: A Panoptic Scene Graph Dataset to Evaluate Rare Predicate + Classes + + +
+ Current scene graph datasets suffer from strong long-tail distributions of
+their predicate classes. Due to a very low number of some predicate classes in
+the test sets, no reliable metrics can be retrieved for the rarest classes. We
+construct a new panoptic scene graph dataset and a set of metrics that are
+designed as a benchmark for predictive performance, especially on rare
+predicate classes. To construct the new dataset, we propose a model-assisted
+annotation pipeline that efficiently finds rare predicate classes that are
+hidden in a large set of images like needles in a haystack.
+ Contrary to prior scene graph datasets, Haystack contains explicit negative
+annotations, i.e., annotations stating that a given relation does not have a
+certain predicate class. Negative annotations are helpful especially in the
+field of scene graph generation and open up a whole new set of possibilities to
+improve current scene graph generation models.
+ Haystack is 100% compatible with existing panoptic scene graph datasets and
+can easily be integrated with existing evaluation pipelines. Our dataset and
+code can be found here: https://lorjul.github.io/haystack/. It includes
+annotation files and simple-to-use scripts and utilities to help integrate our
+dataset into existing work.
+
+
+
+
+
+
+ + ☆ SAM-Deblur: Let Segment Anything Boost Image Deblurring + + +
+ Image deblurring is a critical task in the field of image restoration, aiming
+to eliminate blurring artifacts. However, the challenge of addressing
+non-uniform blurring leads to an ill-posed problem, which limits the
+generalization performance of existing deblurring models. To solve the problem,
+we propose a framework, SAM-Deblur, integrating prior knowledge from the
+Segment Anything Model (SAM) into the deblurring task for the first time. In
+particular, SAM-Deblur is divided into three stages. First, we preprocess the
+blurred images, obtain image masks via SAM, and propose a mask dropout method
+for training to enhance model robustness. Then, to fully leverage the
+structural priors generated by SAM, we propose a Mask Average Pooling (MAP)
+unit specifically designed to average SAM-generated segmented areas, serving as
+a plug-and-play component which can be seamlessly integrated into existing
+deblurring networks. Finally, we feed the fused features generated by the MAP
+Unit into the deblurring model to obtain a sharp image. Experimental results on
+the RealBlurJ, ReloBlur, and REDS datasets reveal that incorporating our
+methods improves NAFNet's PSNR by 0.05, 0.96, and 7.03, respectively. Code will
+be available at \href{https://github.com/HPLQAQ/SAM-Deblur}{SAM-Deblur}.
+
+
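+ The Mask Average Pooling idea can be sketched as follows: every pixel inside a
+SAM-generated segment is replaced by that segment's mean feature. The tensor
+shapes, the assumption of non-overlapping masks, and the surrounding fusion
+logic are illustrative assumptions rather than the paper's implementation.
+
+```python
+import torch
+
+def mask_average_pooling(features, masks):
+    """Sketch of a MAP-style unit: average the features inside each segment
+    and paste the mean back over that segment. `features` is (B, C, H, W);
+    `masks` is a (B, N, H, W) boolean tensor with one channel per segment,
+    assumed non-overlapping. Mask dropout and backbone fusion are omitted."""
+    pooled = torch.zeros_like(features)
+    for n in range(masks.shape[1]):
+        m = masks[:, n:n + 1].float()                       # (B, 1, H, W)
+        area = m.sum(dim=(2, 3), keepdim=True).clamp(min=1)
+        mean = (features * m).sum(dim=(2, 3), keepdim=True) / area
+        pooled = pooled + mean * m                          # segment-wise mean
+    return pooled
+```
+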
+
+ comment: Under review +
+
+
+
+
+ + ☆ Augmenting Chest X-ray Datasets with Non-Expert Annotations + + +
+ The advancement of machine learning algorithms in medical image analysis +requires the expansion of training datasets. A popular and cost-effective +approach is automated annotation extraction from free-text medical reports, +primarily due to the high costs associated with expert clinicians annotating +chest X-ray images. However, it has been shown that the resulting datasets are +susceptible to biases and shortcuts. Another strategy to increase the size of a +dataset is crowdsourcing, a widely adopted practice in general computer vision +with some success in medical image analysis. In a similar vein to +crowdsourcing, we enhance two publicly available chest X-ray datasets by +incorporating non-expert annotations. However, instead of using diagnostic +labels, we annotate shortcuts in the form of tubes. We collect 3.5k chest drain +annotations for CXR14, and 1k annotations for 4 different tube types in +PadChest. We train a chest drain detector with the non-expert annotations that +generalizes well to expert labels. Moreover, we compare our annotations to +those provided by experts and show "moderate" to "almost perfect" agreement. +Finally, we present a pathology agreement study to raise awareness about ground +truth annotations. We make our annotations and code available. + +
+
+
+
+
+ + ☆ DCP-Net: A Distributed Collaborative Perception Network for Remote + Sensing Semantic Segmentation + + +
+ Onboard intelligent processing is widely applied in emergency tasks in the +field of remote sensing. However, it is predominantly confined to an individual +platform with a limited observation range as well as susceptibility to +interference, resulting in limited accuracy. Considering the current state of +multi-platform collaborative observation, this article innovatively presents a +distributed collaborative perception network called DCP-Net. Firstly, the +proposed DCP-Net helps members to enhance perception performance by integrating +features from other platforms. Secondly, a self-mutual information match module +is proposed to identify collaboration opportunities and select suitable +partners, prioritizing critical collaborative features and reducing redundant +transmission cost. Thirdly, a related feature fusion module is designed to +address the misalignment between local and collaborative features, improving +the quality of fused features for the downstream task. We conduct extensive +experiments and visualization analyses using three semantic segmentation +datasets, including Potsdam, iSAID and DFC23. The results demonstrate that +DCP-Net outperforms the existing methods comprehensively, improving mIoU by +2.61%~16.89% at the highest collaboration efficiency, which promotes the +performance to a state-of-the-art level. + +
+
+
+
+
+ + ☆ Dense Object Grounding in 3D Scenes ACM MM 2023 + + +
+ Localizing objects in 3D scenes according to the semantics of a given natural +language is a fundamental yet important task in the field of multimedia +understanding, which benefits various real-world applications such as robotics +and autonomous driving. However, the majority of existing 3D object grounding +methods are restricted to a single-sentence input describing an individual +object, which cannot comprehend and reason more contextualized descriptions of +multiple objects in more practical 3D cases. To this end, we introduce a new +challenging task, called 3D Dense Object Grounding (3D DOG), to jointly +localize multiple objects described in a more complicated paragraph rather than +a single sentence. Instead of naively localizing each sentence-guided object +independently, we found that dense objects described in the same paragraph are +often semantically related and spatially located in a focused region of the 3D +scene. To explore such semantic and spatial relationships of densely referred +objects for more accurate localization, we propose a novel Stacked Transformer +based framework for 3D DOG, named 3DOGSFormer. Specifically, we first devise a +contextual query-driven local transformer decoder to generate initial grounding +proposals for each target object. Then, we employ a proposal-guided global +transformer decoder that exploits the local object features to learn their +correlation for further refining initial grounding proposals. Extensive +experiments on three challenging benchmarks (Nr3D, Sr3D, and ScanRefer) show +that our proposed 3DOGSFormer outperforms state-of-the-art 3D single-object +grounding methods and their dense-object variants by significant margins. + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ Robustness and Generalizability of Deepfake Detection: A Study with + Diffusion Models + + +
+ The rise of deepfake images, especially of well-known personalities, poses a +serious threat to the dissemination of authentic information. To tackle this, +we present a thorough investigation into how deepfakes are produced and how +they can be identified. The cornerstone of our research is a rich collection of +artificial celebrity faces, titled DeepFakeFace (DFF). We crafted the DFF +dataset using advanced diffusion models and have shared it with the community +through online platforms. This data serves as a robust foundation to train and +test algorithms designed to spot deepfakes. We carried out a thorough review of +the DFF dataset and suggest two evaluation methods to gauge the strength and +adaptability of deepfake recognition tools. The first method tests whether an +algorithm trained on one type of fake images can recognize those produced by +other methods. The second evaluates the algorithm's performance with imperfect +images, like those that are blurry, of low quality, or compressed. Given varied +results across deepfake methods and image changes, our findings stress the need +for better deepfake detectors. Our DFF dataset and tests aim to boost the +development of more effective tools against deepfakes. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Advanced Underwater Image Restoration in Complex Illumination Conditions + + +
+ Underwater image restoration has been a challenging problem for decades since
+the advent of underwater photography. Most solutions focus on shallow water
+scenarios, where the scene is uniformly illuminated by the sunlight. However,
+the vast majority of uncharted underwater terrain is located beyond 200 meters
+depth where natural light is scarce and artificial illumination is needed. In
+such cases, light sources co-moving with the camera dynamically change the
+scene appearance, which makes shallow water restoration methods inadequate. In
+particular, for multi-light source systems (composed of dozens of LEDs
+nowadays), calibrating each light is time-consuming, error-prone and tedious,
+and we observe that only the integrated illumination within the viewing volume
+of the camera is critical, rather than the individual light sources. The key
+idea of this paper is therefore to exploit the appearance changes of objects or
+the seafloor when traversing the viewing frustum of the camera. Through new
+constraints assuming Lambertian surfaces, corresponding image pixels constrain
+the light field in front of the camera, and for each voxel a signal factor and
+a backscatter value are stored in a volumetric grid that can be used for very
+efficient image restoration of camera-light platforms, which facilitates
+consistently texturing large 3D models and maps that would otherwise be
+dominated by lighting and medium artifacts. To validate the effectiveness of
+our approach, we conducted extensive experiments on simulated and real-world
+datasets. The results of these experiments demonstrate the robustness of our
+approach in restoring the true albedo of objects, while mitigating the
+influence of lighting and medium effects. Furthermore, we demonstrate our
+approach can be readily extended to other scenarios, including in-air imaging
+with artificial illumination or other similar cases.
+
+
+
+
+
+
+ + ☆ Continual Cross-Dataset Adaptation in Road Surface Classification SC 2023 + + +
+ Accurate road surface classification is crucial for autonomous vehicles (AVs)
+to optimize driving conditions, enhance safety, and enable advanced road
+mapping. However, deep learning models for road surface classification suffer
+from poor generalization when tested on unseen datasets. To update these models
+with new information, the original training dataset must also be taken into
+account in order to avoid catastrophic forgetting. This is, however,
+inefficient if not impossible, e.g., when the data is collected in streams or
+in large amounts. To overcome this limitation and enable fast and efficient
+cross-dataset adaptation, we propose to employ continual learning finetuning
+methods designed to retain past knowledge while adapting to new data, thus
+effectively avoiding forgetting. Experimental results demonstrate the
+superiority of this approach over naive finetuning, achieving performance close
+to fresh retraining. While solving this known problem, we also provide a
+general description of how the same technique can be adopted in other AV
+scenarios. We highlight the potential computational and economic benefits that
+a continual-based adaptation can bring to the AV industry, while also reducing
+greenhouse gas emissions caused by unnecessary joint retraining.
+
+
+
+ comment: To be published in Proceedings of 26th IEEE International Conference + on Intelligent Transportation Systems (ITSC 2023) +
+
+
+
+
+ + ☆ Delving into Ipsilateral Mammogram Assessment under Multi-View Network + + +
+ In recent years, multi-view mammogram analysis has been widely applied to
+AI-based cancer assessment. In this work, we aim to explore diverse fusion
+strategies (average and concatenate) and examine the model's learning behavior
+with varying individuals and fusion pathways, involving Coarse Layer and Fine
+Layer. The Ipsilateral Multi-View Network, comprising five fusion types (Pre,
+Early, Middle, Last, and Post Fusion) in ResNet-18, is employed. Notably, the
+Middle Fusion emerges as the most balanced and effective approach, enhancing
+deep-learning models' generalization performance by +5.29\% (concatenate) and
++5.9\% (average) on the VinDr-Mammo dataset and +2.03\% (concatenate) and +3\%
+(average) on the CMMD dataset in macro F1-Score. The paper emphasizes the
+crucial role of layer assignment in multi-view network extraction with various
+strategies.
+
+
+
+
+
+
+ + ☆ Exchanging-based Multimodal Fusion with Transformer + + +
+ We study the problem of multimodal fusion in this paper. Recent +exchanging-based methods have been proposed for vision-vision fusion, which aim +to exchange embeddings learned from one modality to the other. However, most of +them project inputs of multimodalities into different low-dimensional spaces +and cannot be applied to the sequential input data. To solve these issues, in +this paper, we propose a novel exchanging-based multimodal fusion model MuSE +for text-vision fusion based on Transformer. We first use two encoders to +separately map multimodal inputs into different low-dimensional spaces. Then we +employ two decoders to regularize the embeddings and pull them into the same +space. The two decoders capture the correlations between texts and images with +the image captioning task and the text-to-image generation task, respectively. +Further, based on the regularized embeddings, we present CrossTransformer, +which uses two Transformer encoders with shared parameters as the backbone +model to exchange knowledge between multimodalities. Specifically, +CrossTransformer first learns the global contextual information of the inputs +in the shallow layers. After that, it performs inter-modal exchange by +selecting a proportion of tokens in one modality and replacing their embeddings +with the average of embeddings in the other modality. We conduct extensive +experiments to evaluate the performance of MuSE on the Multimodal Named Entity +Recognition task and the Multimodal Sentiment Analysis task. Our results show +the superiority of MuSE against other competitors. Our code and data are +provided at https://github.com/RecklessRonan/MuSE. + +
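+ The inter-modal exchange step can be pictured as in the sketch below: a
+fraction of tokens in one modality has its embeddings overwritten with the
+average embedding of the other modality (and symmetrically for the other
+stream). The uniformly random token selection is an assumption made for
+illustration, not necessarily the selection rule used by MuSE.
+
+```python
+import torch
+
+def exchange_tokens(text_emb, image_emb, ratio=0.2):
+    """Replace a proportion `ratio` of text tokens with the mean image
+    embedding. Both inputs are (B, L, D) tensors from the two encoders."""
+    b, l, d = text_emb.shape
+    k = max(1, int(ratio * l))
+    idx = torch.randperm(l, device=text_emb.device)[:k]   # tokens to overwrite
+    image_mean = image_emb.mean(dim=1, keepdim=True)      # (B, 1, D)
+    out = text_emb.clone()
+    out[:, idx, :] = image_mean.expand(b, k, d)
+    return out
+```
+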
+
+
+
+
+ + ☆ AniPortraitGAN: Animatable 3D Portrait Generation from 2D Image + Collections SIGGRAPH + + +
+ Previous animatable 3D-aware GANs for human generation have primarily focused +on either the human head or full body. However, head-only videos are relatively +uncommon in real life, and full body generation typically does not deal with +facial expression control and still has challenges in generating high-quality +results. Towards applicable video avatars, we present an animatable 3D-aware +GAN that generates portrait images with controllable facial expression, head +pose, and shoulder movements. It is a generative model trained on unstructured +2D image collections without using 3D or video data. For the new task, we base +our method on the generative radiance manifold representation and equip it with +learnable facial and head-shoulder deformations. A dual-camera rendering and +adversarial learning scheme is proposed to improve the quality of the generated +faces, which is critical for portrait images. A pose deformation processing +network is developed to generate plausible deformations for challenging regions +such as long hair. Experiments show that our method, trained on unstructured 2D +images, can generate diverse and high-quality 3D portraits with desired control +over different properties. + +
+
+ comment: SIGGRAPH Asia 2023. Project Page: + https://yuewuhkust.github.io/AniPortraitGAN/ +
+
+
+
+
+ + ☆ BEVTrack: A Simple Baseline for Point Cloud Tracking in Bird's-Eye-View + + +
+ 3D single object tracking (SOT) in point clouds is still a challenging +problem due to appearance variation, distractors, and high sparsity of point +clouds. Notably, in autonomous driving scenarios, the target object typically +maintains spatial adjacency across consecutive frames, predominantly moving +horizontally. This spatial continuity offers valuable prior knowledge for +target localization. However, existing trackers, which often employ point-wise +representations, struggle to efficiently utilize this knowledge owing to the +irregular format of such representations. Consequently, they require elaborate +designs and solving multiple subtasks to establish spatial correspondence. In +this paper, we introduce BEVTrack, a simple yet strong baseline framework for +3D SOT. After converting consecutive point clouds into the common +Bird's-Eye-View representation, BEVTrack inherently encodes spatial proximity +and adeptly captures motion cues for tracking via a simple element-wise +operation and convolutional layers. Additionally, to better deal with objects +having diverse sizes and moving patterns, BEVTrack directly learns the +underlying motion distribution rather than making a fixed Laplacian or Gaussian +assumption as in previous works. Without bells and whistles, BEVTrack achieves +state-of-the-art performance on KITTI and NuScenes datasets while maintaining a +high inference speed of 122 FPS. The code will be released at +https://github.com/xmm-prio/BEVTrack. + +
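+ A minimal sketch of the Bird's-Eye-View conversion this baseline builds on is
+shown below. The grid extents, cell size, and per-cell features (point count
+and maximum height) are illustrative choices, not the paper's configuration.
+
+```python
+import numpy as np
+
+def points_to_bev(points, x_range=(-3.2, 3.2), y_range=(-3.2, 3.2),
+                  z_range=(-2.0, 2.0), voxel=0.1):
+    """Scatter an (N, 3) point cloud into a 2D BEV grid with two channels:
+    a crude point density and the maximum point height per cell."""
+    w = int((x_range[1] - x_range[0]) / voxel)
+    h = int((y_range[1] - y_range[0]) / voxel)
+    bev = np.zeros((2, h, w), dtype=np.float32)
+    keep = ((points[:, 0] >= x_range[0]) & (points[:, 0] < x_range[1]) &
+            (points[:, 1] >= y_range[0]) & (points[:, 1] < y_range[1]) &
+            (points[:, 2] >= z_range[0]) & (points[:, 2] < z_range[1]))
+    pts = points[keep]
+    xi = ((pts[:, 0] - x_range[0]) / voxel).astype(int)
+    yi = ((pts[:, 1] - y_range[0]) / voxel).astype(int)
+    for x, y, z in zip(xi, yi, pts[:, 2]):
+        bev[0, y, x] += 1.0
+        bev[1, y, x] = max(bev[1, y, x], z)
+    bev[0] = np.minimum(bev[0] / 10.0, 1.0)   # clip/normalise the density channel
+    return bev
+```
+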
+
+ comment: Technical report. Work in progress. The code will be released at + https://github.com/xmm-prio/BEVTrack +
+
+
+
+
+ + ☆ High-resolution 3D Maps of Left Atrial Displacements using an + Unsupervised Image Registration Neural Network + + +
+ Functional analysis of the left atrium (LA) plays an increasingly important +role in the prognosis and diagnosis of cardiovascular diseases. +Echocardiography-based measurements of LA dimensions and strains are useful +biomarkers, but they provide an incomplete picture of atrial deformations. +High-resolution dynamic magnetic resonance images (Cine MRI) offer the +opportunity to examine LA motion and deformation in 3D, at higher spatial +resolution and with full LA coverage. However, there are no dedicated tools to +automatically characterise LA motion in 3D. Thus, we propose a tool that +automatically segments the LA and extracts the displacement fields across the +cardiac cycle. The pipeline is able to accurately track the LA wall across the +cardiac cycle with an average Hausdorff distance of $2.51 \pm 1.3~mm$ and Dice +score of $0.96 \pm 0.02$. + +
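+ For reference, the Dice overlap quoted above (0.96 +/- 0.02) can be computed
+as in the short sketch below; this is the standard definition, not code
+released with the paper.
+
+```python
+import numpy as np
+
+def dice_score(mask_a, mask_b):
+    """Dice overlap between two boolean masks of identical shape."""
+    inter = np.logical_and(mask_a, mask_b).sum()
+    denom = mask_a.sum() + mask_b.sum()
+    return 2.0 * inter / denom if denom > 0 else 1.0
+```
+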
+
+
+
+
+ + ☆ Dual Relation Alignment for Composed Image Retrieval + + +
+ Composed image retrieval, a task involving the search for a target image
+using a reference image and a complementary text as the query, has witnessed
+significant advancements owing to the progress made in cross-modal modeling.
+Unlike the general image-text retrieval problem with only one alignment
+relation, i.e., image-text, we argue for the existence of two types of
+relations in composed image retrieval. The explicit relation pertains to the
+reference image & complementary text-target image, which is commonly exploited
+by existing methods. Besides this intuitive relation, the observations during
+our practice have uncovered another implicit yet crucial relation, i.e.,
+reference image & target image-complementary text, since we found that the
+complementary text can be inferred by studying the relation between the target
+image and the reference image. Regrettably, existing methods largely focus on
+leveraging the explicit relation to learn their networks, while overlooking the
+implicit relation. In response to this weakness, we propose a new framework for
+composed image retrieval, termed dual relation alignment, which integrates both
+explicit and implicit relations to fully exploit the correlations among the
+triplets. Specifically, we first design a vision compositor to fuse the
+reference image and the target image; the resulting representation then serves
+two roles: (1) counterpart for semantic alignment with the complementary text
+and (2) compensation for the complementary text to boost the explicit relation
+modeling, thereby implanting the implicit relation into the alignment learning.
+Our method is evaluated on two popular datasets, CIRR and FashionIQ, through
+extensive experiments. The results confirm the effectiveness of our
+dual-relation learning in substantially enhancing composed image retrieval
+performance.
+
+
+
+
+
+
+ + ☆ PCFGaze: Physics-Consistent Feature for Appearance-based Gaze Estimation + + +
+ Although recent deep learning based gaze estimation approaches have achieved
+much improvement, we still know little about how gaze features are connected to
+the physics of gaze. In this paper, we try to answer this question by analyzing
+the gaze feature manifold. Our analysis revealed the insight that the geodesic
+distance between gaze features is consistent with the gaze differences between
+samples. According to this finding, we construct the Physics-Consistent
+Feature (PCF) in an analytical way, which connects gaze feature to the physical
+definition of gaze. We further propose the PCFGaze framework that directly
+optimizes gaze feature space by the guidance of PCF. Experimental results
+demonstrate that the proposed framework alleviates the overfitting problem and
+significantly improves cross-domain gaze estimation accuracy without extra
+training data. The insight of gaze feature has the potential to benefit other
+regression tasks with physical meanings.
+
+
+
+
+
+
+ + ☆ The Adversarial Implications of Variable-Time Inference + + +
+ Machine learning (ML) models are known to be vulnerable to a number of +attacks that target the integrity of their predictions or the privacy of their +training data. To carry out these attacks, a black-box adversary must typically +possess the ability to query the model and observe its outputs (e.g., labels). +In this work, we demonstrate, for the first time, the ability to enhance such +decision-based attacks. To accomplish this, we present an approach that +exploits a novel side channel in which the adversary simply measures the +execution time of the algorithm used to post-process the predictions of the ML +model under attack. The leakage of inference-state elements into algorithmic +timing side channels has never been studied before, and we have found that it +can contain rich information that facilitates superior timing attacks that +significantly outperform attacks based solely on label outputs. In a case +study, we investigate leakage from the non-maximum suppression (NMS) algorithm, +which plays a crucial role in the operation of object detectors. In our +examination of the timing side-channel vulnerabilities associated with this +algorithm, we identified the potential to enhance decision-based attacks. We +demonstrate attacks against the YOLOv3 detector, leveraging the timing leakage +to successfully evade object detection using adversarial examples, and perform +dataset inference. Our experiments show that our adversarial examples exhibit +superior perturbation quality compared to a decision-based attack. In addition, +we present a new threat model in which dataset inference based solely on timing +leakage is performed. To address the timing leakage vulnerability inherent in +the NMS algorithm, we explore the potential and limitations of implementing +constant-time inference passes as a mitigation strategy. + +
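+ The sketch below only illustrates the observable quantity such an attack
+exploits: the wall-clock time of the NMS post-processing step, measured here
+with torchvision's stock NMS on synthetic boxes. The attack itself, the YOLOv3
+pipeline, and the threat model are not reproduced.
+
+```python
+import time
+import torch
+from torchvision.ops import nms
+
+def timed_nms(boxes, scores, iou_thr=0.5):
+    """Run NMS and return the kept indices plus the elapsed wall-clock time."""
+    t0 = time.perf_counter()
+    keep = nms(boxes, scores, iou_thr)
+    return keep, time.perf_counter() - t0
+
+# Densely overlapping candidates typically make NMS do more work than a few
+# well-separated ones, so the timing leaks information about the predictions.
+xy1 = torch.rand(5000, 2) * 400
+boxes = torch.cat([xy1, xy1 + 20 + torch.rand(5000, 2) * 5], dim=1)  # x1y1x2y2
+scores = torch.rand(5000)
+_, t = timed_nms(boxes, scores)
+print(f"NMS took {t * 1e3:.2f} ms for 5000 candidates")
+```
+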
+
+
+
+
+ + ☆ Traffic Light Recognition using Convolutional Neural Networks: A Survey SC2023 + + +
+ Real-time traffic light recognition is essential for autonomous driving. Yet, +a cohesive overview of the underlying model architectures for this task is +currently missing. In this work, we conduct a comprehensive survey and analysis +of traffic light recognition methods that use convolutional neural networks +(CNNs). We focus on two essential aspects: datasets and CNN architectures. +Based on an underlying architecture, we cluster methods into three major +groups: (1) modifications of generic object detectors which compensate for +specific task characteristics, (2) multi-stage approaches involving both +rule-based and CNN components, and (3) task-specific single-stage methods. We +describe the most important works in each cluster, discuss the usage of the +datasets, and identify research gaps. + +
+
+ comment: Accepted for publication at ITSC2023 +
+
+
+
+
+ + ☆ S3C: Semi-Supervised VQA Natural Language Explanation via Self-Critical + Learning CVPR2023 + + +
+ The VQA Natural Language Explanation (VQA-NLE) task aims to explain the
+decision-making process of VQA models in natural language. Unlike traditional
+attention or gradient analysis, free-text rationales can be easier to
+understand and gain users' trust. Existing methods mostly use post-hoc or
+self-rationalization models to obtain a plausible explanation. However, these
+frameworks are bottlenecked by the following challenges: 1) the reasoning
+process cannot be faithfully reflected and suffers from the problem of
+logical inconsistency; 2) human-annotated explanations are expensive and
+time-consuming to collect. In this paper, we propose a new Semi-Supervised
+VQA-NLE via Self-Critical Learning (S3C), which evaluates the candidate
+explanations with answering rewards to improve the logical consistency between
+answers and rationales. With a semi-supervised learning framework, the S3C can
+benefit from a tremendous amount of samples without human-annotated
+explanations. A large number of automatic measures and human evaluations all
+show the effectiveness of our method. Meanwhile, the framework achieves a new
+state-of-the-art performance on the two VQA-NLE datasets.
+
+
+
+ comment: CVPR2023 +
+
+
+
+
+ + ☆ Domain Adaptation for Satellite-Borne Hyperspectral Cloud Detection + + +
+ The advent of satellite-borne machine learning hardware accelerators has +enabled the on-board processing of payload data using machine learning +techniques such as convolutional neural networks (CNN). A notable example is +using a CNN to detect the presence of clouds in hyperspectral data captured on +Earth observation (EO) missions, whereby only clear sky data is downlinked to +conserve bandwidth. However, prior to deployment, new missions that employ new +sensors will not have enough representative datasets to train a CNN model, +while a model trained solely on data from previous missions will underperform +when deployed to process the data on the new missions. This underperformance +stems from the domain gap, i.e., differences in the underlying distributions of +the data generated by the different sensors in previous and future missions. In +this paper, we address the domain gap problem in the context of on-board +hyperspectral cloud detection. Our main contributions lie in formulating new +domain adaptation tasks that are motivated by a concrete EO mission, developing +a novel algorithm for bandwidth-efficient supervised domain adaptation, and +demonstrating test-time adaptation algorithms on space deployable neural +network accelerators. Our contributions enable minimal data transmission to be +invoked (e.g., only 1% of the weights in ResNet50) to achieve domain +adaptation, thereby allowing more sophisticated CNN models to be deployed and +updated on satellites without being hampered by domain gap and bandwidth +limitations. + +
+
+
+
+
+ + ☆ INCEPTNET: Precise And Early Disease Detection Application For Medical + Images Analyses + + +
+ In view of the recent paradigm shift in deep AI-based image processing
+methods, medical image processing has advanced considerably. In this study, we
+propose a novel deep neural network (DNN), entitled InceptNet, in the scope of
+medical image processing, for early disease detection and segmentation of
+medical images in order to enhance precision and performance. We also
+investigate the interaction of users with the InceptNet application to present
+a comprehensive application including the background processes and foreground
+interactions with users. Fast InceptNet is shaped by the prominent Unet
+architecture, and it seizes the power of an Inception module to be fast and
+cost effective while aiming to approximate an optimal local sparse structure.
+Adding Inception modules with various parallel kernel sizes can improve the
+network's ability to capture the variations in the scaled regions of interest.
+In our experiments, the model is tested on four benchmark datasets, including
+retina blood vessel segmentation, lung nodule segmentation, skin lesion
+segmentation, and breast cancer cell detection. The improvement was more
+significant on images with small-scale structures. The proposed method improved
+the accuracy from 0.9531, 0.8900, 0.9872, and 0.9881 to 0.9555, 0.9510, 0.9945,
+and 0.9945 on the mentioned datasets, respectively, which shows that the
+proposed method outperforms previous works. Furthermore, by exploring the
+procedure from start to end, individuals who have utilized a trial edition of
+InceptNet, in the form of a complete application, are presented with thirteen
+multiple-choice questions in order to assess the proposed method. The outcomes
+are evaluated by means of Human-Computer Interaction.
+
+
+
+
+
+
+ + ☆ A Lightweight, Rapid and Efficient Deep Convolutional Network for Chest + X-Ray Tuberculosis Detection + + +
+ Tuberculosis (TB) is still recognized as one of the leading causes of death +worldwide. Recent advances in deep learning (DL) have shown to enhance +radiologists' ability to interpret chest X-ray (CXR) images accurately and with +fewer errors, leading to a better diagnosis of this disease. However, little +work has been done to develop models capable of diagnosing TB that offer good +performance while being efficient, fast and computationally inexpensive. In +this work, we propose LightTBNet, a novel lightweight, fast and efficient deep +convolutional network specially customized to detect TB from CXR images. Using +a total of 800 frontal CXR images from two publicly available datasets, our +solution yielded an accuracy, F1 and area under the ROC curve (AUC) of 0.906, +0.907 and 0.961, respectively, on an independent test subset. The proposed +model demonstrates outstanding performance while delivering a rapid prediction, +with minimal computational and memory requirements, making it highly suitable +for deployment in handheld devices that can be used in low-resource areas with +high TB prevalence. Code publicly available at +https://github.com/dani-capellan/LightTBNet. + +
+
+ comment: 5 pages, 3 figures, 3 tables. This paper has been accepted at ISBI + 2023 +
+
+
+
+
+ + ☆ Self-Supervised Pre-Training Boosts Semantic Scene Segmentation on LiDAR + data + + +
+ Airborne LiDAR systems have the capability to capture the Earth's surface by +generating extensive point cloud data comprised of points mainly defined by 3D +coordinates. However, labeling such points for supervised learning tasks is +time-consuming. As a result, there is a need to investigate techniques that can +learn from unlabeled data to significantly reduce the number of annotated +samples. In this work, we propose to train a self-supervised encoder with +Barlow Twins and use it as a pre-trained network in the task of semantic scene +segmentation. The experimental results demonstrate that our unsupervised +pre-training boosts performance once fine-tuned on the supervised task, +especially for under-represented categories. + +
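+ For context, the Barlow Twins objective used for pre-training pushes the
+cross-correlation matrix of the embeddings of two augmented views towards the
+identity. The sketch below is the standard formulation of that loss, not the
+paper's training code.
+
+```python
+import torch
+
+def barlow_twins_loss(z_a, z_b, lambd=5e-3):
+    """Barlow Twins loss for two (N, D) batches of embeddings of the same
+    samples under different augmentations."""
+    n, d = z_a.shape
+    z_a = (z_a - z_a.mean(0)) / (z_a.std(0) + 1e-6)    # standardise per dimension
+    z_b = (z_b - z_b.mean(0)) / (z_b.std(0) + 1e-6)
+    c = (z_a.T @ z_b) / n                              # (D, D) cross-correlation
+    on_diag = (torch.diagonal(c) - 1).pow(2).sum()     # invariance term
+    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # redundancy term
+    return on_diag + lambd * off_diag
+```
+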
+
+ comment: International conference Machine Vision Applications 2023 +
+
+
+
+
+ + ☆ Multi-label affordance mapping from egocentric vision ICCV + + +
+ Accurate affordance detection and segmentation with pixel precision is an
+important piece in many complex systems based on interactions, such as robots
+and assistive devices. We present a new approach to affordance perception which
+enables accurate multi-label segmentation. Our approach can be used to
+automatically extract grounded affordances from first person videos of
+interactions using a 3D map of the environment providing pixel level precision
+for the affordance location. We use this method to build the largest and most
+complete dataset on affordances based on the EPIC-Kitchen dataset, EPIC-Aff,
+which provides interaction-grounded, multi-label, metric and spatial affordance
+annotations. Then, we propose a new approach to affordance segmentation based
+on multi-label detection which enables multiple affordances to co-exist in the
+same space, for example if they are associated with the same object. We present
+several strategies for multi-label detection using different segmentation
+architectures. The experimental results highlight the importance of the
+multi-label detection. Finally, we show how our metric representation can be
+exploited to build a map of interaction hotspots in spatial action-centric
+zones and use that representation to perform task-oriented navigation.
+
+
+
+ comment: International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Hierarchical Masked 3D Diffusion Model for Video Outpainting ACM MM 2023 + + +
+ Video outpainting aims to adequately complete missing areas at the edges of +video frames. Compared to image outpainting, it presents an additional +challenge as the model should maintain the temporal consistency of the filled +area. In this paper, we introduce a masked 3D diffusion model for video +outpainting. We use the technique of mask modeling to train the 3D diffusion +model. This allows us to use multiple guide frames to connect the results of +multiple video clip inferences, thus ensuring temporal consistency and reducing +jitter between adjacent frames. Meanwhile, we extract the global frames of the +video as prompts and guide the model to obtain information other than the +current video clip using cross-attention. We also introduce a hybrid +coarse-to-fine inference pipeline to alleviate the artifact accumulation +problem. The existing coarse-to-fine pipeline only uses the infilling strategy, +which brings degradation because the time interval of the sparse frames is too +large. Our pipeline benefits from bidirectional learning of the mask modeling +and thus can employ a hybrid strategy of infilling and interpolation when +generating sparse frames. Experiments show that our method achieves +state-of-the-art results in video outpainting tasks. More results are provided +at our https://fanfanda.github.io/M3DDM/. + +
+
+ comment: ACM MM 2023 accepted +
+
+
+
+
+ + ☆ Iterative Superquadric Recomposition of 3D Objects from Multiple Views ICCV 2023 + + +
+ Humans are good at recomposing novel objects, i.e. they can identify +commonalities between unknown objects from general structure to finer detail, +an ability difficult to replicate by machines. We propose a framework, ISCO, to +recompose an object using 3D superquadrics as semantic parts directly from 2D +views without training a model that uses 3D supervision. To achieve this, we +optimize the superquadric parameters that compose a specific instance of the +object, comparing its rendered 3D view and 2D image silhouette. Our ISCO +framework iteratively adds new superquadrics wherever the reconstruction error +is high, abstracting first coarse regions and then finer details of the target +object. With this simple coarse-to-fine inductive bias, ISCO provides +consistent superquadrics for related object parts, despite not having any +semantic supervision. Since ISCO does not train any neural network, it is also +inherently robust to out-of-distribution objects. Experiments show that, +compared to recent single instance superquadrics reconstruction approaches, +ISCO provides consistently more accurate 3D reconstructions, even from images +in the wild. Code available at https://github.com/ExplainableML/ISCO . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Towards Diverse and Consistent Typography Generation + + +
+ In this work, we consider the typography generation task that aims at +producing diverse typographic styling for the given graphic document. We +formulate typography generation as a fine-grained attribute generation for +multiple text elements and build an autoregressive model to generate diverse +typography that matches the input design context. We further propose a simple +yet effective sampling approach that respects the consistency and distinction +principle of typography so that generated examples share consistent typographic +styling across text elements. Our empirical study shows that our model +successfully generates diverse typographic designs while preserving a +consistent typographic structure. + +
+
+
+
+
+ + ☆ DeNISE: Deep Networks for Improved Segmentation Edges + + +
+ This paper presents Deep Networks for Improved Segmentation Edges (DeNISE), a +novel data enhancement technique using edge detection and segmentation models +to improve the boundary quality of segmentation masks. DeNISE utilizes the +inherent differences in two sequential deep neural architectures to improve the +accuracy of the predicted segmentation edge. DeNISE applies to all types of +neural networks and is not trained end-to-end, allowing rapid experiments to +discover which models complement each other. We test and apply DeNISE for +building segmentation in aerial images. Aerial images are known for difficult +conditions as they have a low resolution with optical noise, such as +reflections, shadows, and visual obstructions. Overall the paper demonstrates +the potential for DeNISE. Using the technique, we improve the baseline results +with a building IoU of 78.9%. + +
+
+
+
+
+ + ☆ Dual Adversarial Alignment for Realistic Support-Query Shift Few-shot + Learning PAKDD 2022 + + +
+ Support-query shift few-shot learning aims to classify unseen examples (query
+set) to labeled data (support set) based on the learned embedding in a
+low-dimensional space under a distribution shift between the support set and
+the query set. However, in real-world scenarios the shifts are usually unknown
+and varied, making them difficult to estimate in advance. Therefore, in this
+paper, we propose a novel but more difficult challenge, RSQS, focusing on
+Realistic Support-Query Shift few-shot learning. The key feature of RSQS is
+that the individual samples in each meta-task are subjected to multiple
+distribution shifts. In addition, we propose a unified adversarial feature
+alignment method called DUal adversarial ALignment framework (DuaL) to address
+RSQS from two aspects, i.e., inter-domain bias and intra-domain variance. On
+the one hand, for the inter-domain bias, we corrupt the original data in
+advance and use the synthesized perturbed inputs to train the repairer network
+by minimizing distance at the feature level. On the other hand, for
+intra-domain variance, we propose a generator network to synthesize hard, i.e.,
+less similar, examples from the support set in a self-supervised manner and
+introduce regularized optimal transportation to derive a smooth optimal
+transportation plan. Lastly, a benchmark of RSQS is built with several
+state-of-the-art baselines across three datasets (CIFAR100, mini-ImageNet, and
+Tiered-Imagenet). Experiment results show that DuaL significantly outperforms
+the state-of-the-art methods in our benchmark.
+
+
+
+ comment: Best student paper in PAKDD 2022 +
+
+
+
+
+ + ☆ Histograms of Points, Orientations, and Dynamics of Orientations + Features for Hindi Online Handwritten Character Recognition + + +
+ A set of features independent of character stroke direction and order
+variations is proposed for online handwritten character recognition. A method
+is developed that maps features like co-ordinates of points, orientations of
+strokes at points, and dynamics of orientations of strokes at points spatially
+as a function of co-ordinate values of the points and computes histograms of
+these features from different regions in the spatial map.
+ Different features like spatio-temporal, discrete Fourier transform, discrete
+cosine transform, discrete wavelet transform, spatial, and histograms of
+oriented gradients used in other studies for training classifiers for character
+recognition are considered. The classifier chosen for classification
+performance comparison, when trained with different features, is the support
+vector machine (SVM).
+ The character datasets used for training and testing the classifiers consist
+of online handwritten samples of 96 different Hindi characters. There are 12832
+and 2821 samples in the training and testing datasets, respectively.
+ SVM classifiers trained with the proposed features have the highest
+classification accuracy of 92.9\% when compared to the performances of SVM
+classifiers trained with the other features and tested on the same testing
+dataset. Therefore, the proposed features have better character discriminative
+capability than the other features considered for comparison.
+
+
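+ The sketch below illustrates the flavour of such features: stroke-segment
+orientations, folded so that writing direction does not matter, histogrammed
+per spatial cell of the normalized character. The grid size, bin count, and
+folding rule are illustrative assumptions, not the paper's exact feature
+definition.
+
+```python
+import numpy as np
+
+def regional_orientation_histograms(points, grid=(4, 4), bins=8):
+    """Orientation histograms per spatial region of an online character.
+    `points` is an (N, 2) array of pen coordinates in drawing order."""
+    pts = (points - points.min(0)) / (np.ptp(points, 0) + 1e-6)   # normalise to [0, 1]
+    deltas = np.diff(pts, axis=0)
+    angles = np.arctan2(deltas[:, 1], deltas[:, 0]) % np.pi       # fold opposite directions
+    mid = (pts[:-1] + pts[1:]) / 2                                # where each segment lives
+    gx = np.minimum((mid[:, 0] * grid[0]).astype(int), grid[0] - 1)
+    gy = np.minimum((mid[:, 1] * grid[1]).astype(int), grid[1] - 1)
+    feats = np.zeros((grid[0], grid[1], bins))
+    for x, y, a in zip(gx, gy, angles):
+        feats[x, y, int(a / np.pi * bins) % bins] += 1
+    return feats.ravel()                                          # fixed-length feature vector
+```
+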
+
+ comment: 21 pages, 12 jpg figures +
+
+
+
+
+ + ☆ An Adaptive Spatial-Temporal Local Feature Difference Method for + Infrared Small-moving Target Detection + + +
+ Detecting small moving targets accurately in infrared (IR) image sequences is +a significant challenge. To address this problem, we propose a novel method +called spatial-temporal local feature difference (STLFD) with adaptive +background suppression (ABS). Our approach utilizes filters in the spatial and +temporal domains and performs pixel-level ABS on the output to enhance the +contrast between the target and the background. The proposed method comprises +three steps. First, we obtain three temporal frame images based on the current +frame image and extract two feature maps using the designed spatial domain and +temporal domain filters. Next, we fuse the information of the spatial domain +and temporal domain to produce the spatial-temporal feature maps and suppress +noise using our pixel-level ABS module. Finally, we obtain the segmented binary +map by applying a threshold. Our experimental results demonstrate that the +proposed method outperforms existing state-of-the-art methods for infrared +small-moving target detection. + +
+
+
+
+
+ + ☆ Diffusion-based 3D Object Detection with Random Boxes + + +
+ 3D object detection is an essential task for achieving autonomous driving. +Existing anchor-based detection methods rely on empirical heuristics setting of +anchors, which makes the algorithms lack elegance. In recent years, we have +witnessed the rise of several generative models, among which diffusion models +show great potential for learning the transformation of two distributions. Our +proposed Diff3Det migrates the diffusion model to proposal generation for 3D +object detection by considering the detection boxes as generative targets. +During training, the object boxes diffuse from the ground truth boxes to the +Gaussian distribution, and the decoder learns to reverse this noise process. In +the inference stage, the model progressively refines a set of random boxes to +the prediction results. We provide detailed experiments on the KITTI benchmark +and achieve promising performance compared to classical anchor-based 3D +detection methods. + +
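+ A minimal sketch of the forward (noising) step such a detector trains against
+is shown below; the cosine noise schedule and the raw box parameterization are
+common-practice assumptions rather than details taken from the paper.
+
+```python
+import torch
+
+def diffuse_boxes(gt_boxes, t, T=1000):
+    """DDPM-style forward process on box parameters: ground-truth boxes drift
+    towards a Gaussian as t grows; the detector learns to reverse this."""
+    s = 0.008
+    steps = torch.arange(T + 1, dtype=torch.float32)
+    f = torch.cos((steps / T + s) / (1 + s) * torch.pi / 2) ** 2
+    alpha_bar = (f / f[0])[t]                       # cumulative signal fraction at step t
+    noise = torch.randn_like(gt_boxes)
+    noisy = alpha_bar.sqrt() * gt_boxes + (1 - alpha_bar).sqrt() * noise
+    return noisy, noise
+
+# At inference, a set of purely random boxes plays the role of the fully-noised
+# state and is iteratively refined back towards the predictions.
+```
+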
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+ + ☆ Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth + Completion + + +
+ RGB-guided depth completion aims at predicting dense depth maps from sparse +depth measurements and corresponding RGB images, where how to effectively and +efficiently exploit the multi-modal information is a key issue. Guided dynamic +filters, which generate spatially-variant depth-wise separable convolutional +filters from RGB features to guide depth features, have been proven to be +effective in this task. However, the dynamically generated filters require +massive model parameters, computational costs and memory footprints when the +number of feature channels is large. In this paper, we propose to decompose the +guided dynamic filters into a spatially-shared component multiplied by +content-adaptive adaptors at each spatial location. Based on the proposed idea, +we introduce two decomposition schemes A and B, which decompose the filters by +splitting the filter structure and using spatial-wise attention, respectively. +The decomposed filters not only maintain the favorable properties of guided +dynamic filters as being content-dependent and spatially-variant, but also +reduce model parameters and hardware costs, as the learned adaptors are +decoupled with the number of feature channels. Extensive experimental results +demonstrate that the methods using our schemes outperform state-of-the-art +methods on the KITTI dataset, and rank 1st and 2nd on the KITTI benchmark at +the time of submission. Meanwhile, they also achieve comparable performance on +the NYUv2 dataset. In addition, our proposed methods are general and could be +employed as plug-and-play feature fusion blocks in other multi-modal fusion +tasks such as RGB-D salient object detection. + +
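+ The sketch below illustrates the decomposition idea in isolation: a
+spatially-shared depthwise kernel modulated by a per-pixel adaptor predicted
+from the RGB guidance. Class and variable names are hypothetical, and this is
+only a loose analogue of the paper's schemes A and B.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DecomposedGuidedFilter(nn.Module):
+    """Hypothetical sketch: one shared k*k depthwise kernel per channel,
+    scaled at every pixel by a content-adaptive adaptor from the RGB branch."""
+    def __init__(self, channels, k=3):
+        super().__init__()
+        self.k = k
+        self.shared = nn.Parameter(torch.randn(channels, k * k) * 0.01)
+        self.adaptor = nn.Conv2d(channels, 1, 1)   # one adaptor value per pixel
+
+    def forward(self, depth_feat, rgb_feat):
+        b, c, h, w = depth_feat.shape
+        patches = F.unfold(depth_feat, self.k, padding=self.k // 2)
+        patches = patches.view(b, c, self.k * self.k, h * w)
+        a = self.adaptor(rgb_feat).view(b, 1, 1, h * w)           # (B,1,1,HW)
+        kernel = self.shared.view(1, c, self.k * self.k, 1) * a   # modulated filter
+        out = (patches * kernel).sum(dim=2)                       # apply per location
+        return out.view(b, c, h, w)
+```
+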
+
+
+
+
+ + ☆ Learning Cross-Modal Affinity for Referring Video Object Segmentation + Targeting Limited Samples ICCV2023 + + +
+ Referring video object segmentation (RVOS), as a supervised learning task, +relies on sufficient annotated data for a given scene. However, in more +realistic scenarios, only minimal annotations are available for a new scene, +which poses significant challenges to existing RVOS methods. With this in mind, +we propose a simple yet effective model with a newly designed cross-modal +affinity (CMA) module based on a Transformer architecture. The CMA module +builds multimodal affinity with a few samples, thus quickly learning new +semantic information, and enabling the model to adapt to different scenarios. +Since the proposed method targets limited samples for new scenes, we generalize +the problem as - few-shot referring video object segmentation (FS-RVOS). To +foster research in this direction, we build up a new FS-RVOS benchmark based on +currently available datasets. The benchmark covers a wide range and includes +multiple situations, which can maximally simulate real-world scenarios. +Extensive experiments show that our model adapts well to different scenarios +with only a few samples, reaching state-of-the-art performance on the +benchmark. On Mini-Ref-YouTube-VOS, our model achieves an average performance +of 53.1 J and 54.8 F, which are 10% better than the baselines. Furthermore, we +show impressive results of 77.7 J and 74.8 F on Mini-Ref-SAIL-VOS, which are +significantly better than the baselines. Code is publicly available at +https://github.com/hengliusky/Few_shot_RVOS. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ A survey on efficient vision transformers: algorithms, techniques, and + performance benchmarking + + +
+ Vision Transformer (ViT) architectures are becoming increasingly popular and
+widely employed to tackle computer vision applications. Their main feature is
+the capacity to extract global information through the self-attention
+mechanism, outperforming earlier convolutional neural networks. However, ViT
+deployment and performance have grown steadily with their size, number of
+trainable parameters, and operations. Furthermore, self-attention's
+computational and memory cost increases quadratically with the image
+resolution. Generally speaking, it is challenging to employ these architectures
+in real-world applications due to many hardware and environmental restrictions,
+such as processing and computational capabilities. Therefore, this survey
+investigates the most efficient methodologies to ensure sub-optimal estimation
+performances. In more detail, four categories of efficient methods will be
+analyzed: compact architecture, pruning, knowledge distillation, and
+quantization strategies. Moreover, a new metric called Efficient Error Rate has
+been introduced in order to normalize and compare models' features that affect
+hardware devices at inference time, such as the number of parameters, bits,
+FLOPs, and model size. In summary, this paper first mathematically defines the
+strategies used to make Vision Transformer efficient, describes and discusses
+state-of-the-art methodologies, and analyzes their performance over different
+application scenarios. Toward the end of this paper, we also discuss open
+challenges and promising research directions.
+
+
+
+
+
+
+ + ☆ Dynamic Early Exiting Predictive Coding Neural Networks + + +
+ Internet of Things (IoT) sensors are nowadays heavily utilized in various +real-world applications ranging from wearables to smart buildings passing by +agrotechnology and health monitoring. With the huge amounts of data generated +by these tiny devices, Deep Learning (DL) models have been extensively used to +enhance them with intelligent processing. However, with the urge for smaller +and more accurate devices, DL models became too heavy to deploy. It is thus +necessary to incorporate the hardware's limited resources in the design +process. Therefore, inspired by the human brain known for its efficiency and +low power consumption, we propose a shallow bidirectional network based on +predictive coding theory and dynamic early exiting for halting further +computations when a performance threshold is surpassed. We achieve comparable +accuracy to VGG-16 in image classification on CIFAR-10 with fewer parameters +and less computational complexity. + +
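+ A minimal sketch of threshold-based dynamic early exiting is given below; the
+confidence criterion (maximum softmax probability) and the block/exit-head
+interface are assumptions for illustration, not the paper's predictive-coding
+architecture.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def forward_with_early_exit(blocks, exit_heads, x, threshold=0.9):
+    """Run `blocks` in sequence; after each one, an auxiliary exit head maps
+    the current features to class logits (e.g. pooling + linear), and
+    computation halts once the softmax confidence passes the threshold.
+    Batch size 1 is assumed for clarity."""
+    pred, probs = None, None
+    for block, head in zip(blocks, exit_heads):
+        x = block(x)
+        probs = F.softmax(head(x), dim=-1)
+        conf, pred = probs.max(dim=-1)
+        if conf.item() >= threshold:          # confident enough: stop early
+            return pred, probs
+    return pred, probs                        # fall through to the final exit
+```
+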
+
+
+
+
+ + ☆ RawHDR: High Dynamic Range Image Reconstruction from a Single Raw Image ICCV 2023 + + +
+ High dynamic range (HDR) images capture much more intensity levels than +standard ones. Current methods predominantly generate HDR images from 8-bit low +dynamic range (LDR) sRGB images that have been degraded by the camera +processing pipeline. However, it becomes a formidable task to retrieve +extremely high dynamic range scenes from such limited bit-depth data. Unlike +existing methods, the core idea of this work is to incorporate more informative +Raw sensor data to generate HDR images, aiming to recover scene information in +hard regions (the darkest and brightest areas of an HDR scene). To this end, we +propose a model tailor-made for Raw images, harnessing the unique features of +Raw data to facilitate the Raw-to-HDR mapping. Specifically, we learn exposure +masks to separate the hard and easy regions of a high dynamic scene. Then, we +introduce two important guidances, dual intensity guidance, which guides less +informative channels with more informative ones, and global spatial guidance, +which extrapolates scene specifics over an extended spatial domain. To verify +our Raw-to-HDR approach, we collect a large Raw/HDR paired dataset for both +training and testing. Our empirical evaluations validate the superiority of the +proposed Raw-to-HDR reconstruction model, as well as our newly captured dataset +in the experiments. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Logarithmic Mathematical Morphology: theory and applications + + +
+ Classically, in Mathematical Morphology, an image (i.e., a grey-level +function) is analysed by another image which is named the structuring element +or the structuring function. This structuring function is moved over the image +domain and summed to the image. However, in an image presenting lighting +variations, the analysis by a structuring function should require that its +amplitude varies according to the image intensity. Such a property is not +verified in Mathematical Morphology for grey level functions, when the +structuring function is summed to the image with the usual additive law. In +order to address this issue, a new framework is defined with an additive law +for which the amplitude of the structuring function varies according to the +image amplitude. This additive law is chosen within the Logarithmic Image +Processing framework and models the lighting variations with a physical cause +such as a change of light intensity or a change of camera exposure-time. The +new framework is named Logarithmic Mathematical Morphology (LMM) and allows the +definition of operators which are robust to such lighting variations. In images +with uniform lighting variations, those new LMM operators perform better than +usual morphological operators. In eye-fundus images with non-uniform lighting +variations, a LMM method for vessel segmentation is compared to three +state-of-the-art approaches. Results show that the LMM approach has a better +robustness to such variations than the three others. + +
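+ For concreteness, the sketch below replaces the usual sum of image and
+structuring function in a grey-level dilation with the Logarithmic Image
+Processing (LIP) addition f (+) g = f + g - fg/M, which is the kind of
+substitution the abstract describes. It is an illustrative approximation, not
+the authors' implementation (structuring-function reflection and LIP sign
+conventions are glossed over).
+
+```python
+import numpy as np
+
+def lip_add(f, g, M=256.0):
+    """LIP additive law for grey levels in [0, M): the effective amplitude of
+    the structuring function scales with the local image intensity."""
+    return f + g - f * g / M
+
+def lmm_dilation(image, struct_fn, M=256.0):
+    """Grey-level dilation where the usual addition is replaced by lip_add.
+    `struct_fn` is a small 2D structuring function; borders use edge padding."""
+    kh, kw = struct_fn.shape
+    ph, pw = kh // 2, kw // 2
+    padded = np.pad(image.astype(float), ((ph, ph), (pw, pw)), mode="edge")
+    out = np.full(image.shape, -np.inf, dtype=float)
+    for dy in range(kh):
+        for dx in range(kw):
+            shifted = padded[dy:dy + image.shape[0], dx:dx + image.shape[1]]
+            out = np.maximum(out, lip_add(shifted, struct_fn[dy, dx], M))
+    return out
+```
+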
+
+
+
+
+ + ☆ Analyzing domain shift when using additional data for the MICCAI KiTS23 + Challenge + + +
+ Using additional training data is known to improve the results, especially
+for medical image 3D segmentation where there is a lack of training material
+and the model needs to generalize well from the few available data. However,
+the new data could have been acquired using other instruments and preprocessed
+such that its distribution is significantly different from the original
+training data. Therefore, we study techniques which ameliorate domain shift
+during training so that the additional data becomes more usable for
+preprocessing and training together with the original data. Our results show
+that transforming the additional data using histogram matching yields better
+results than using simple normalization.
+
+
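+ A histogram-matching step of the kind studied here can be prototyped with
+scikit-image; the array names below are placeholders standing in for volumes
+loaded from the additional cohort and from the original training data.
+
+```python
+import numpy as np
+from skimage.exposure import match_histograms
+
+# Placeholder volumes; in practice these would be loaded from the respective
+# datasets before joint preprocessing.
+extra_volume = np.random.normal(40, 120, size=(64, 256, 256))
+reference_volume = np.random.normal(60, 100, size=(64, 256, 256))
+
+# Align the intensity distribution of the additional volume with the reference.
+matched = match_histograms(extra_volume, reference_volume)
+```
+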
+
+ comment: This preprint has not undergone peer review or any post-submission + improvements or corrections. The Version of Record of this contribution is + published in [TODO], and is available online at https://doi.org/[TODO] +
+
+
+
+
+ + ☆ NICE 2023 Zero-shot Image Captioning Challenge + + +
+ In this report, we introduce the NICE project\footnote{\url{https://nice.lgresearch.ai/}} and share the results and outcomes of the NICE challenge 2023. This project is designed to challenge the computer vision community to develop robust image captioning models that advance the state of the art in terms of both accuracy and fairness. Through the challenge, the image captioning models were tested on a new evaluation dataset that includes a large variety of visual concepts from many domains. No specific training data was provided for the challenge, so the challenge entries were required to adapt to new types of image descriptions that had not been seen during training. This report includes information on the newly proposed NICE dataset, evaluation methods, challenge results, and technical details of the top-ranking entries. We expect that the outcomes of the challenge will contribute to the improvement of AI models on various vision-language tasks.
+
+ comment: Tech report +
+
+
+
+
+ + ☆ Empowering Low-Light Image Enhancer through Customized Learnable Priors ICCV 2023 + + +
+ Deep neural networks have achieved remarkable progress in enhancing low-light images by improving their brightness and eliminating noise. However, most existing methods construct end-to-end mapping networks heuristically, neglecting the intrinsic priors of the image enhancement task and lacking transparency and interpretability. Although some unfolding solutions have been proposed to relieve these issues, they rely on proximal operator networks that deliver ambiguous and implicit priors. In this work, we propose a paradigm for low-light image enhancement that explores the potential of customized learnable priors to improve the transparency of the deep unfolding paradigm. Motivated by the powerful feature representation capability of the Masked Autoencoder (MAE), we customize MAE-based illumination and noise priors and redevelop them from two perspectives: 1) \textbf{structure flow}: we train the MAE from a normal-light image to its illumination properties and then embed it into the proximal operator design of the unfolding architecture; and 2) \textbf{optimization flow}: we train the MAE from a normal-light image to its gradient representation and then employ it as a regularization term to constrain noise in the model output. These designs improve the interpretability and representation capability of the model. Extensive experiments on multiple low-light image enhancement datasets demonstrate the superiority of our proposed paradigm over state-of-the-art methods. Code is available at https://github.com/zheng980629/CUE.
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ RADIO: Reference-Agnostic Dubbing Video Synthesis + + +
+ One of the most challenging problems in audio-driven talking head generation is achieving high-fidelity detail while ensuring precise synchronization. Given only a single reference image, extracting meaningful identity attributes becomes even more challenging, often causing the network to mirror the facial and lip structures too closely. To address these issues, we introduce RADIO, a framework engineered to yield high-quality dubbed videos regardless of the pose or expression in the reference image. The key is to modulate the decoder layers using a latent space composed of audio and reference features. Additionally, we incorporate ViT blocks into the decoder to emphasize high-fidelity details, especially in the lip region. Our experimental results demonstrate that RADIO achieves high synchronization without loss of fidelity. Especially in harsh scenarios where the reference frame deviates significantly from the ground truth, our method outperforms state-of-the-art methods, highlighting its robustness. The pre-trained model and code will be made public after the review.
+
+ comment: Under review +
+
+
+
+
+ + ☆ Efficient Bayesian Computational Imaging with a Surrogate Score-Based + Prior + + +
+ We propose a surrogate function for efficient use of score-based priors for +Bayesian inverse imaging. Recent work turned score-based diffusion models into +probabilistic priors for solving ill-posed imaging problems by appealing to an +ODE-based log-probability function. However, evaluating this function is +computationally inefficient and inhibits posterior estimation of +high-dimensional images. Our proposed surrogate prior is based on the evidence +lower-bound of a score-based diffusion model. We demonstrate the surrogate +prior on variational inference for efficient approximate posterior sampling of +large images. Compared to the exact prior in previous work, our surrogate prior +accelerates optimization of the variational image distribution by at least two +orders of magnitude. We also find that our principled approach achieves +higher-fidelity images than non-Bayesian baselines that involve +hyperparameter-tuning at inference. Our work establishes a practical path +forward for using score-based diffusion models as general-purpose priors for +imaging. + +
+
+
+
+
+ + ☆ Extract-and-Adaptation Network for 3D Interacting Hand Mesh Recovery ICCV + + +
+ Understanding how two hands interact with each other is a key component of accurate 3D interacting hand mesh recovery. However, recent Transformer-based methods struggle to learn the interaction between two hands as they directly use two hand features as input tokens, which results in the distant token problem. The distant token problem means that the input tokens lie in heterogeneous spaces, causing the Transformer to fail to capture the correlation between them. Previous Transformer-based methods suffer from this problem especially when the poses of the two hands are very different, as they project features from a backbone into separate left- and right-hand-dedicated features. We present EANet, an extract-and-adaptation network, with EABlock, the main component of our network. Rather than directly using two hand features as input tokens, our EABlock uses two complementary types of novel tokens, SimToken and JoinToken, as input tokens. Our two novel tokens are derived from a combination of the two separated hand features; hence, they are much more robust to the distant token problem. Using the two types of tokens, our EABlock effectively extracts the interaction feature and adapts it to each hand. The proposed EANet achieves state-of-the-art performance on 3D interacting hands benchmarks. The codes are available at https://github.com/jkpark0825/EANet.
+
+ comment: Accepted at ICCVW 2023 +
+
+
+
+
+ + ☆ DR-Pose: A Two-stage Deformation-and-Registration Pipeline for + Category-level 6D Object Pose Estimation IROS 2023 + + +
+ Category-level object pose estimation involves estimating the 6D pose and the 3D metric size of objects from predetermined categories. While recent approaches take categorical shape prior information as a reference to improve pose estimation accuracy, the single-stage network design and training manner lead to sub-optimal performance since there are two distinct tasks in the pipeline. In this paper, the advantage of a two-stage pipeline over a single-stage design is discussed. To this end, we propose a two-stage deformation-and-registration pipeline called DR-Pose, which consists of a completion-aided deformation stage and a scaled registration stage. The first stage uses a point cloud completion method to generate the unseen parts of the target object, guiding subsequent deformation of the shape prior. In the second stage, a novel registration network is designed to extract pose-sensitive features and predict the representation of the object's partial point cloud in canonical space based on the deformation results from the first stage. DR-Pose produces superior results to the state-of-the-art shape prior-based methods on both the CAMERA25 and REAL275 benchmarks. Codes are available at https://github.com/Zray26/DR-Pose.git.
+
+ comment: Camera-ready version accepted to IROS 2023 +
+
+
+
+
+ + ☆ Causal Scoring Medical Image Explanations: A Case Study On Ex-vivo + Kidney Stone Images + + +
+ On the premise that if human users know the cause of an output, they can grasp the process responsible for that output and hence gain understanding, many explainable methods have been proposed to indicate the cause of a model's output based on its input. Nonetheless, little has been reported on quantitative measurements of such causal relationships between the inputs, the explanations, and the outputs of a model, leaving the assessment to the user, independent of their level of expertise in the subject. To address this situation, we explore a technique for measuring the causal relationship between the features from the area of the object of interest in the images of a class and the output of a classifier. Our experiments indicate stronger measured causal relationships when the area of the object of interest per class is indicated by a mask from an explainable method than when it is indicated by human annotators. Hence the chosen name of Causal Explanation Score (CaES).
+
+
+
+
+ + ☆ SyntheWorld: A Large-Scale Synthetic Dataset for Land Cover Mapping and + Building Change Detection WACV 2024 + + +
+ Synthetic datasets, recognized for their cost-effectiveness, play a pivotal role in advancing computer vision tasks and techniques. However, when it comes to remote sensing image processing, the creation of synthetic datasets becomes challenging due to the demand for larger-scale and more diverse 3D models. This complexity is compounded by the difficulties associated with real remote sensing datasets, including limited data acquisition and high annotation costs, which amplifies the need for high-quality synthetic alternatives. To address this, we present SyntheWorld, a synthetic dataset unparalleled in quality, diversity, and scale. It includes 40,000 images with submeter-level pixels and fine-grained land cover annotations of eight categories, and it also provides 40,000 bitemporal image pairs with building change annotations for the building change detection task. We conduct experiments on multiple benchmark remote sensing datasets to verify the effectiveness of SyntheWorld and to investigate the conditions under which our synthetic data yield advantages. We will release SyntheWorld to facilitate remote sensing image processing research.
+
+ comment: Accepted by WACV 2024 +
+
+
+
+
+ + ☆ Improving Drone Imagery For Computer Vision/Machine Learning in + Wilderness Search and Rescue + + +
+ This paper describes gaps in the acquisition of drone imagery that impair its use with computer vision/machine learning (CV/ML) models and makes five recommendations to maximize image suitability for CV/ML post-processing. It describes a notional work process for the use of drones in wilderness search and rescue incidents. The large volume of data from the wide area search phase offers the greatest opportunity for CV/ML techniques because of the large number of images that would otherwise have to be manually inspected. The 2023 Wu-Murad search in Japan, one of the largest missing person searches conducted in that area, serves as a case study. Although drone teams conducting wide area searches may not know in advance whether the data they collect will be used for CV/ML post-processing, there are data collection procedures that can improve the search in general with automated collection software. If the drone teams do expect to use CV/ML, then they can exploit knowledge about the model to further optimize flights.
+
+ comment: 6 pages, 4 figures +
+
+
+
+
+ + ☆ Towards Robust Plant Disease Diagnosis with Hard-sample Re-mining + Strategy + + +
+ With rich annotation information, object detection-based automated plant disease diagnosis systems (e.g., YOLO-based systems) often provide advantages over classification-based systems (e.g., EfficientNet-based), such as the ability to detect disease locations and superior classification performance. One drawback of these detection systems is dealing with unannotated healthy data in which no real symptoms are present. In practice, healthy plant data appear very similar to many disease data. Thus, those models often produce mis-detected boxes on healthy images. In addition, labeling new data for detection models is typically time-consuming. Hard-sample mining (HSM) is a common technique for re-training a model by using the mis-detected boxes as new training samples. However, blindly selecting an arbitrary number of hard samples for re-training will degrade the diagnostic performance for other diseases due to the high similarity between disease and healthy data. In this paper, we propose a simple but effective training strategy called hard-sample re-mining (HSReM), which is designed to enhance the diagnostic performance on healthy data and simultaneously improve the performance on disease data by strategically selecting hard-sample training images at an appropriate level. Experiments based on two practical in-field eight-class cucumber and ten-class tomato datasets (42.7K and 35.6K images) show that our HSReM training strategy leads to a substantial improvement in overall diagnostic performance on large-scale unseen data. Specifically, the object detection model trained using the HSReM strategy not only achieved superior results compared to the classification-based state-of-the-art EfficientNetV2-Large model and the original object detection model, but also outperformed the model using the HSM strategy.
+
+
+
+
+ + ☆ Unsupervised Skin Lesion Segmentation via Structural Entropy + Minimization on Multi-Scale Superpixel Graphs ICDM 2023 + + +
+ Skin lesion segmentation is a fundamental task in dermoscopic image analysis. The complex features of pixels in the lesion region impede lesion segmentation accuracy, and existing deep learning-based methods often lack interpretability for this problem. In this work, we propose a novel unsupervised Skin Lesion sEgmentation framework based on structural entropy and isolation forest outlier Detection, namely SLED. Specifically, skin lesions are segmented by minimizing the structural entropy of a superpixel graph constructed from the dermoscopic image. Then, we characterize the consistency of healthy skin features and devise a novel multi-scale segmentation mechanism based on outlier detection, which enhances the segmentation accuracy by leveraging superpixel features from multiple scales. We conduct experiments on four skin lesion benchmarks and compare SLED with nine representative unsupervised segmentation methods. Experimental results demonstrate the superiority of the proposed framework. Additionally, several case studies are analyzed to demonstrate the effectiveness of SLED.
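The sketch below only illustrates the superpixel-plus-outlier-detection half of the idea: lesion superpixels are flagged as outliers relative to the majority healthy-skin features using an isolation forest. It does not implement the paper's structural-entropy objective or its multi-scale mechanism, and the mean-color features, contamination rate, and function name are assumptions.

```python
import numpy as np
from skimage.segmentation import slic
from sklearn.ensemble import IsolationForest

def lesion_mask_from_outliers(image, n_segments=200, contamination=0.2):
    # Superpixels stand in for the graph nodes; lesion superpixels are treated
    # as outliers with respect to the (majority) healthy-skin features.
    segments = slic(image, n_segments=n_segments, compactness=10, start_label=0)
    n_superpixels = segments.max() + 1
    feats = np.stack([image[segments == s].mean(axis=0)
                      for s in range(n_superpixels)])
    scores = IsolationForest(contamination=contamination,
                             random_state=0).fit_predict(feats)
    outlier_ids = np.flatnonzero(scores == -1)
    return np.isin(segments, outlier_ids)  # boolean lesion mask
```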
+
+ comment: 10 pages, 8 figures, conference. Accepted by IEEE ICDM 2023 +
+
+
+
+
+ + ☆ Gradient Domain Diffusion Models for Image Synthesis + + +
+ Diffusion models have become popular in generative image and video synthesis. However, due to the diffusion process, they require a large number of steps to converge. To tackle this issue, in this paper, we propose to perform the diffusion process in the gradient domain, where convergence becomes faster. There are two reasons. First, thanks to the Poisson equation, the gradient domain is mathematically equivalent to the original image domain; therefore, each diffusion step in the image domain has a unique corresponding gradient domain representation. Second, the gradient domain is much sparser than the image domain. As a result, gradient domain diffusion models converge faster. Several numerical experiments confirm that gradient domain diffusion models are more efficient than the original diffusion models. The proposed method can be applied in a wide range of applications such as image processing, computer vision, and machine learning tasks.
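To make the stated equivalence concrete, here is a minimal round-trip sketch (our own, not code from the paper): an image is mapped to periodic forward differences, and recovered by solving the Poisson equation with FFTs, which is exact up to a constant offset.

```python
import numpy as np

def to_gradient_domain(img):
    # Periodic forward differences; the gradient image is much sparser than img.
    gx = np.roll(img, -1, axis=1) - img
    gy = np.roll(img, -1, axis=0) - img
    return gx, gy

def from_gradient_domain(gx, gy, mean_value=0.0):
    # Invert the gradient transform by solving the Poisson equation with FFTs
    # (periodic boundaries), restoring the lost constant from the stored mean.
    div = (gx - np.roll(gx, 1, axis=1)) + (gy - np.roll(gy, 1, axis=0))
    H, W = div.shape
    wy = 2.0 * np.cos(2.0 * np.pi * np.fft.fftfreq(H)) - 2.0
    wx = 2.0 * np.cos(2.0 * np.pi * np.fft.fftfreq(W)) - 2.0
    denom = wy[:, None] + wx[None, :]
    denom[0, 0] = 1.0  # avoid division by zero at the DC term
    u_hat = np.fft.fft2(div) / denom
    u_hat[0, 0] = 0.0
    u = np.real(np.fft.ifft2(u_hat))
    return u - u.mean() + mean_value

# Round trip: img -> (gx, gy) -> reconstruction (equal up to numerical error).
img = np.random.rand(64, 64)
gx, gy = to_gradient_domain(img)
rec = from_gradient_domain(gx, gy, mean_value=img.mean())
```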
+
+
+
+
+ + ☆ Compressing Vision Transformers for Low-Resource Visual Learning + + +
+ Vision transformer (ViT) and its variants have swept through visual learning +leaderboards and offer state-of-the-art accuracy in tasks such as image +classification, object detection, and semantic segmentation by attending to +different parts of the visual input and capturing long-range spatial +dependencies. However, these models are large and computation-heavy. For +instance, the recently proposed ViT-B model has 86M parameters making it +impractical for deployment on resource-constrained devices. As a result, their +deployment on mobile and edge scenarios is limited. In our work, we aim to take +a step toward bringing vision transformers to the edge by utilizing popular +model compression techniques such as distillation, pruning, and quantization. + Our chosen application environment is an unmanned aerial vehicle (UAV) that +is battery-powered and memory-constrained, carrying a single-board computer on +the scale of an NVIDIA Jetson Nano with 4GB of RAM. On the other hand, the UAV +requires high accuracy close to that of state-of-the-art ViTs to ensure safe +object avoidance in autonomous navigation, or correct localization of humans in +search-and-rescue. Inference latency should also be minimized given the +application requirements. Hence, our target is to enable rapid inference of a +vision transformer on an NVIDIA Jetson Nano (4GB) with minimal accuracy loss. +This allows us to deploy ViTs on resource-constrained devices, opening up new +possibilities in surveillance, environmental monitoring, etc. Our +implementation is made available at https://github.com/chensy7/efficient-vit. + +
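Of the three compression techniques mentioned (distillation, pruning, quantization), distillation is the easiest to show compactly. Below is a generic Hinton-style distillation loss sketch; the temperature and weighting are illustrative defaults and this is not tied to the authors' released implementation.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels,
                      temperature: float = 4.0, alpha: float = 0.7):
    # Soft-target KL term from the large teacher ViT plus the usual
    # cross-entropy term on ground-truth labels for the compact student.
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature * temperature)
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard
```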
+
+
+
+
+ + ☆ Self-Supervised Pretraining Improves Performance and Inference + Efficiency in Multiple Lung Ultrasound Interpretation Tasks + + +
+ In this study, we investigated whether self-supervised pretraining could +produce a neural network feature extractor applicable to multiple +classification tasks in B-mode lung ultrasound analysis. When fine-tuning on +three lung ultrasound tasks, pretrained models resulted in an improvement of +the average across-task area under the receiver operating curve (AUC) by 0.032 +and 0.061 on local and external test sets respectively. Compact nonlinear +classifiers trained on features outputted by a single pretrained model did not +improve performance across all tasks; however, they did reduce inference time +by 49% compared to serial execution of separate fine-tuned models. When +training using 1% of the available labels, pretrained models consistently +outperformed fully supervised models, with a maximum observed test AUC increase +of 0.396 for the task of view classification. Overall, the results indicate +that self-supervised pretraining is useful for producing initial weights for +lung ultrasound classifiers. + +
+
+ comment: 10 pages, 5 figures, submitted to IEEE Access +
+
+
+
+
+ + ☆ Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction + Tuning + + +
+ We present CM3Leon (pronounced "Chameleon"), a retrieval-augmented, +token-based, decoder-only multi-modal language model capable of generating and +infilling both text and images. CM3Leon uses the CM3 multi-modal architecture +but additionally shows the extreme benefits of scaling up and tuning on more +diverse instruction-style data. It is the first multi-modal model trained with +a recipe adapted from text-only language models, including a large-scale +retrieval-augmented pre-training stage and a second multi-task supervised +fine-tuning (SFT) stage. It is also a general-purpose model that can do both +text-to-image and image-to-text generation, allowing us to introduce +self-contained contrastive decoding methods that produce high-quality outputs. +Extensive experiments demonstrate that this recipe is highly effective for +multi-modal models. CM3Leon achieves state-of-the-art performance in +text-to-image generation with 5x less training compute than comparable methods +(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate +unprecedented levels of controllability in tasks ranging from language-guided +image editing to image-controlled generation and segmentation. + +
+
+
+
+
+ + ☆ Anatomy-Driven Pathology Detection on Chest X-rays MICCAI 2023 + + +
+ Pathology detection and delineation enables the automatic interpretation of +medical scans such as chest X-rays while providing a high level of +explainability to support radiologists in making informed decisions. However, +annotating pathology bounding boxes is a time-consuming task such that large +public datasets for this purpose are scarce. Current approaches thus use weakly +supervised object detection to learn the (rough) localization of pathologies +from image-level annotations, which is however limited in performance due to +the lack of bounding box supervision. We therefore propose anatomy-driven +pathology detection (ADPD), which uses easy-to-annotate bounding boxes of +anatomical regions as proxies for pathologies. We study two training +approaches: supervised training using anatomy-level pathology labels and +multiple instance learning (MIL) with image-level pathology labels. Our results +show that our anatomy-level training approach outperforms weakly supervised +methods and fully supervised detection with limited training samples, and our +MIL approach is competitive with both baseline approaches, therefore +demonstrating the potential of our approach. + +
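As a rough sketch of the MIL variant described above, one common choice is to max-pool region-level pathology logits into an image-level prediction and supervise it with image-level labels. The pooling choice and all names here are assumptions for illustration, not necessarily the exact formulation used in ADPD.

```python
import torch
import torch.nn.functional as F

def mil_image_level_loss(region_logits: torch.Tensor,
                         image_labels: torch.Tensor) -> torch.Tensor:
    """region_logits: (B, R, C) pathology logits for R anatomical regions.
    image_labels: (B, C) binary image-level pathology labels."""
    # Max-pooling over regions: an image is positive for a pathology if at
    # least one anatomical region is predicted positive.
    image_logits = region_logits.max(dim=1).values
    return F.binary_cross_entropy_with_logits(image_logits, image_labels.float())
```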
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ☆ Emphysema Subtyping on Thoracic Computed Tomography Scans using Deep + Neural Networks + + +
+ Accurate identification of emphysema subtypes and severity is crucial for effective management of COPD and the study of disease heterogeneity. Manual analysis of emphysema subtypes and severity is laborious and subjective. To address this challenge, we present a deep learning-based approach for automating the Fleischner Society's visual score system for emphysema subtyping and severity analysis. We trained and evaluated our algorithm using 9650 subjects from the COPDGene study. Our algorithm achieved a predictive accuracy of 52\%, outperforming a previously published method's accuracy of 45\%. In addition, the agreement between our method's predicted scores and the visual scores was good, whereas the previous method obtained only moderate agreement. Our approach employs a regression training strategy to generate categorical labels while simultaneously producing high-resolution localized activation maps for visualizing the network predictions. By leveraging these dense activation maps, our method can compute the percentage of emphysema involvement per lung in addition to categorical severity scores. Furthermore, the proposed method extends its predictive capabilities beyond centrilobular emphysema to include paraseptal emphysema subtypes.
+
+
+
+
+ + ☆ Evaluation Kidney Layer Segmentation on Whole Slide Imaging using + Convolutional Neural Networks and Transformers + + +
+ The segmentation of kidney layer structures, including cortex, outer stripe, inner stripe, and inner medulla, within human kidney whole slide images (WSI) plays an essential role in automated image analysis in renal pathology. However, the current manual segmentation process is labor-intensive and infeasible for handling the extensive digital pathology images encountered at a large scale. In response, the realm of digital renal pathology has seen the emergence of deep learning-based methodologies. However, very few, if any, deep learning based approaches have been applied to kidney layer structure segmentation. Addressing this gap, this paper assesses the feasibility of applying deep learning based approaches to kidney layer structure segmentation. This study employs representative convolutional neural network (CNN) and Transformer segmentation approaches, including Swin-Unet, Medical-Transformer, TransUNet, U-Net, PSPNet, and DeepLabv3+. We quantitatively evaluated these six prevalent deep learning models on renal cortex layer segmentation using mouse kidney WSIs. The empirical results are promising, as evidenced by a decent Mean Intersection over Union (mIoU) index. The results demonstrate that Transformer models generally outperform CNN-based models. By enabling a quantitative evaluation of renal cortical structures, deep learning approaches promise to help medical professionals produce more informed kidney layer segmentations.
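For reference, mIoU, the index the evaluation relies on, can be computed as in the following standard sketch (not code from the study):

```python
import numpy as np

def mean_iou(pred: np.ndarray, target: np.ndarray, num_classes: int) -> float:
    # Mean Intersection over Union across kidney layer classes (e.g., cortex,
    # outer stripe, inner stripe, inner medulla).
    ious = []
    for c in range(num_classes):
        p, t = pred == c, target == c
        union = np.logical_or(p, t).sum()
        if union == 0:
            continue  # class absent from both prediction and ground truth
        ious.append(np.logical_and(p, t).sum() / union)
    return float(np.mean(ious))
```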
+
+
+
+
+ + ☆ Recurrence-Free Survival Prediction for Anal Squamous Cell Carcinoma + Chemoradiotherapy using Planning CT-based Radiomics Model + + +
+ Objectives: Approximately 30% of non-metastatic anal squamous cell carcinoma +(ASCC) patients will experience recurrence after chemoradiotherapy (CRT), and +currently available clinical variables are poor predictors of treatment +response. We aimed to develop a model leveraging information extracted from +radiation pretreatment planning CT to predict recurrence-free survival (RFS) in +ASCC patients after CRT. Methods: Radiomics features were extracted from +planning CT images of 96 ASCC patients. Following pre-feature selection, the +optimal feature set was selected via step-forward feature selection with a +multivariate Cox proportional hazard model. The RFS prediction was generated +from a radiomics-clinical combined model based on an optimal feature set with +five repeats of five-fold cross validation. The risk stratification ability of +the proposed model was evaluated with Kaplan-Meier analysis. Results: Shape- +and texture-based radiomics features significantly predicted RFS. Compared to a +clinical-only model, radiomics-clinical combined model achieves better +performance in the testing cohort with higher C-index (0.80 vs 0.73) and AUC +(0.84 vs 0.79 for 1-year RFS, 0.84 vs 0.78 for 2-year RFS, and 0.86 vs 0.83 for +3-year RFS), leading to distinctive high- and low-risk of recurrence groups +(p<0.001). Conclusions: A treatment planning CT based radiomics and clinical +combined model had improved prognostic performance in predicting RFS for ASCC +patients treated with CRT as compared to a model using clinical features only. + +
+
+
+
+
+ + ☆ Physically Grounded Vision-Language Models for Robotic Manipulation + + +
+ Recent advances in vision-language models (VLMs) have led to improved +performance on tasks such as visual question answering and image captioning. +Consequently, these models are now well-positioned to reason about the physical +world, particularly within domains such as robotic manipulation. However, +current VLMs are limited in their understanding of the physical concepts (e.g., +material, fragility) of common objects, which restricts their usefulness for +robotic manipulation tasks that involve interaction and physical reasoning +about such objects. To address this limitation, we propose PhysObjects, an +object-centric dataset of 36.9K crowd-sourced and 417K automated physical +concept annotations of common household objects. We demonstrate that +fine-tuning a VLM on PhysObjects improves its understanding of physical object +concepts, by capturing human priors of these concepts from visual appearance. +We incorporate this physically-grounded VLM in an interactive framework with a +large language model-based robotic planner, and show improved planning +performance on tasks that require reasoning about physical object concepts, +compared to baselines that do not leverage physically-grounded VLMs. We +additionally illustrate the benefits of our physically-grounded VLM on a real +robot, where it improves task success rates. We release our dataset and provide +further details and visualizations of our results at +https://iliad.stanford.edu/pg-vlm/. + +
+
+
+
+
+ + ☆ Domain Adaptation for Efficiently Fine-tuning Vision Transformer with + Encrypted Images + + +
+ In recent years, deep neural networks (DNNs) trained with transformed data have been applied to various applications such as privacy-preserving learning, access control, and adversarial defenses. However, the use of transformed data decreases the performance of models. Accordingly, in this paper, we propose a novel method for fine-tuning models with transformed images when using the vision transformer (ViT). The proposed domain adaptation method does not cause accuracy degradation of models, and it is carried out on the basis of the embedding structure of ViT. In experiments, we confirmed that the proposed method prevents accuracy degradation even when using encrypted images with the CIFAR-10 and CIFAR-100 datasets.
+
+ comment: Accepted by APSIPA 2023 +
+
+
+
+
+ + ☆ A Survey of the Impact of Self-Supervised Pretraining for Diagnostic + Tasks with Radiological Images + + +
+ Self-supervised pretraining has been observed to be effective at improving +feature representations for transfer learning, leveraging large amounts of +unlabelled data. This review summarizes recent research into its usage in +X-ray, computed tomography, magnetic resonance, and ultrasound imaging, +concentrating on studies that compare self-supervised pretraining to fully +supervised learning for diagnostic tasks such as classification and +segmentation. The most pertinent finding is that self-supervised pretraining +generally improves downstream task performance compared to full supervision, +most prominently when unlabelled examples greatly outnumber labelled examples. +Based on the aggregate evidence, recommendations are provided for practitioners +considering using self-supervised learning. Motivated by limitations identified +in current research, directions and practices for future study are suggested, +such as integrating clinical knowledge with theoretically justified +self-supervised learning methods, evaluating on public datasets, growing the +modest body of evidence for ultrasound, and characterizing the impact of +self-supervised pretraining on generalization. + +
+
+ comment: 32 pages, 6 figures, a literature survey submitted to BMC Medical + Imaging +
+
+
+
+
+ + ☆ A skeletonization algorithm for gradient-based optimization ICCV 2023 + + +
+ The skeleton of a digital image is a compact representation of its topology, +geometry, and scale. It has utility in many computer vision applications, such +as image description, segmentation, and registration. However, skeletonization +has only seen limited use in contemporary deep learning solutions. Most +existing skeletonization algorithms are not differentiable, making it +impossible to integrate them with gradient-based optimization. Compatible +algorithms based on morphological operations and neural networks have been +proposed, but their results often deviate from the geometry and topology of the +true medial axis. This work introduces the first three-dimensional +skeletonization algorithm that is both compatible with gradient-based +optimization and preserves an object's topology. Our method is exclusively +based on matrix additions and multiplications, convolutional operations, basic +non-linear functions, and sampling from a uniform probability distribution, +allowing it to be easily implemented in any major deep learning library. In +benchmarking experiments, we prove the advantages of our skeletonization +algorithm compared to non-differentiable, morphological, and +neural-network-based baselines. Finally, we demonstrate the utility of our +algorithm by integrating it with two medical image processing applications that +use gradient-based optimization: deep-learning-based blood vessel segmentation, +and multimodal registration of the mandible in computed tomography and magnetic +resonance images. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ V1T: large-scale mouse V1 response prediction using a Vision Transformer + + +
+ Accurate predictive models of the visual cortex neural response to natural +visual stimuli remain a challenge in computational neuroscience. In this work, +we introduce V1T, a novel Vision Transformer based architecture that learns a +shared visual and behavioral representation across animals. We evaluate our +model on two large datasets recorded from mouse primary visual cortex and +outperform previous convolution-based models by more than 12.7% in prediction +performance. Moreover, we show that the self-attention weights learned by the +Transformer correlate with the population receptive fields. Our model thus sets +a new benchmark for neural response prediction and can be used jointly with +behavioral and neural recordings to reveal meaningful characteristic features +of the visual cortex. + +
+
+ comment: updated references and added link to code repository; add analysis on + generalization and visualize aRFs; updated with TMLR publication +
+
+
+
+
+ + ♻ ☆ Colonoscopy 3D Video Dataset with Paired Depth from 2D-3D Registration + + +
+ Screening colonoscopy is an important clinical application for several 3D +computer vision techniques, including depth estimation, surface reconstruction, +and missing region detection. However, the development, evaluation, and +comparison of these techniques in real colonoscopy videos remain largely +qualitative due to the difficulty of acquiring ground truth data. In this work, +we present a Colonoscopy 3D Video Dataset (C3VD) acquired with a high +definition clinical colonoscope and high-fidelity colon models for benchmarking +computer vision methods in colonoscopy. We introduce a novel multimodal 2D-3D +registration technique to register optical video sequences with ground truth +rendered views of a known 3D model. The different modalities are registered by +transforming optical images to depth maps with a Generative Adversarial Network +and aligning edge features with an evolutionary optimizer. This registration +method achieves an average translation error of 0.321 millimeters and an +average rotation error of 0.159 degrees in simulation experiments where +error-free ground truth is available. The method also leverages video +information, improving registration accuracy by 55.6% for translation and 60.4% +for rotation compared to single frame registration. 22 short video sequences +were registered to generate 10,015 total frames with paired ground truth depth, +surface normals, optical flow, occlusion, six degree-of-freedom pose, coverage +maps, and 3D models. The dataset also includes screening videos acquired by a +gastroenterologist with paired ground truth pose and 3D surface models. The +dataset and registration source code are available at durr.jhu.edu/C3VD. + +
+
+
+
+
+ + ♻ ☆ Point-SLAM: Dense Neural Point Cloud-based SLAM ICCV 2023 + + +
+ We propose a dense neural simultaneous localization and mapping (SLAM) +approach for monocular RGBD input which anchors the features of a neural scene +representation in a point cloud that is iteratively generated in an +input-dependent data-driven manner. We demonstrate that both tracking and +mapping can be performed with the same point-based neural scene representation +by minimizing an RGBD-based re-rendering loss. In contrast to recent dense +neural SLAM methods which anchor the scene features in a sparse grid, our +point-based approach allows dynamically adapting the anchor point density to +the information density of the input. This strategy reduces runtime and memory +usage in regions with fewer details and dedicates higher point density to +resolve fine details. Our approach performs either better or competitive to +existing dense neural RGBD SLAM methods in tracking, mapping and rendering +accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source code is +available at https://github.com/eriksandstroem/Point-SLAM. + +
+
+ comment: ICCV 2023. 18 Pages, 12 Figures +
+
+
+
+
+ + ♻ ☆ 2nd Place Winning Solution for the CVPR2023 Visual Anomaly and Novelty + Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection CVPR + + +
+ This technical report introduces the winning solution of the team Segment Any Anomaly for the CVPR2023 Visual Anomaly and Novelty Detection (VAND) challenge. Going beyond uni-modal prompts, e.g., language prompts, we present a novel framework, i.e., Segment Any Anomaly + (SAA$+$), for zero-shot anomaly segmentation with multi-modal prompts for the regularization of cascaded modern foundation models. Inspired by the great zero-shot generalization ability of foundation models like Segment Anything, we first explore their assembly (SAA) to leverage diverse multi-modal prior knowledge for anomaly localization. Subsequently, we further introduce multimodal prompts (SAA$+$) derived from domain expert knowledge and target image context to enable the non-parameter adaptation of foundation models to anomaly segmentation. The proposed SAA$+$ model achieves state-of-the-art performance on several anomaly segmentation benchmarks, including VisA and MVTec-AD, in the zero-shot setting. We will release the code of our winning solution for the CVPR2023 VAND challenge.
+
+ comment: The first two author contribute equally. CVPR workshop challenge + report. arXiv admin note: substantial text overlap with arXiv:2305.10724 +
+
+
+
+
+ + ♻ ☆ Event-based Stereo Visual Odometry with Native Temporal Resolution via + Continuous-time Gaussian Process Regression + + +
+ Event-based cameras asynchronously capture individual visual changes in a +scene. This makes them more robust than traditional frame-based cameras to +highly dynamic motions and poor illumination. It also means that every +measurement in a scene can occur at a unique time. + Handling these different measurement times is a major challenge of using +event-based cameras. It is often addressed in visual odometry (VO) pipelines by +approximating temporally close measurements as occurring at one common time. +This grouping simplifies the estimation problem but, absent additional sensors, +sacrifices the inherent temporal resolution of event-based cameras. + This paper instead presents a complete stereo VO pipeline that estimates +directly with individual event-measurement times without requiring any grouping +or approximation in the estimation state. It uses continuous-time trajectory +estimation to maintain the temporal fidelity and asynchronous nature of +event-based cameras through Gaussian process regression with a physically +motivated prior. Its performance is evaluated on the MVSEC dataset, where it +achieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences, +outperforming the existing publicly available event-based stereo VO pipeline by +two and four times, respectively. + +
+
+ comment: To appear in IEEE Robotics and Automation Letters (RA-L). 8 pages, 4 + figures. DOI: 10.1109/LRA.2023.3311374 +
+
+
+
+
+ + ♻ ☆ Dynamic Loss For Robust Learning + + +
+ Label noise and class imbalance commonly coexist in real-world data. Previous works on robust learning, however, usually address only one type of data bias and underperform when facing both. To mitigate this gap, this work presents a novel meta-learning based dynamic loss that automatically adjusts the objective functions with the training process to robustly learn a classifier from long-tailed noisy data. Concretely, our dynamic loss comprises a label corrector and a margin generator, which respectively correct noisy labels and generate additive per-class classification margins by perceiving the underlying data distribution as well as the learning state of the classifier. Equipped with a new hierarchical sampling strategy that enriches a small amount of unbiased metadata with diverse and hard samples, the two components in the dynamic loss are optimized jointly through meta-learning and cultivate the classifier to adapt well to clean and balanced test data. Extensive experiments show our method achieves state-of-the-art accuracy on multiple real-world and synthetic datasets with various types of data biases, including CIFAR-10/100, Animal-10N, ImageNet-LT, and Webvision. Code will soon be publicly available.
+
+
+
+
+ + ♻ ☆ When Measures are Unreliable: Imperceptible Adversarial Perturbations + toward Top-$k$ Multi-Label Learning ACM MM 2023 + + +
+ With the great success of deep neural networks, adversarial learning has received widespread attention in various studies, ranging from multi-class learning to multi-label learning. However, existing adversarial attacks on multi-label learning only pursue the traditional visual imperceptibility and ignore the new perceptibility problem arising from measures such as Precision@$k$ and mAP@$k$. Specifically, when a well-trained multi-label classifier performs far below expectation on some samples, the victim can easily realize that this performance degeneration stems from an attack rather than the model itself. Therefore, an ideal multi-label adversarial attack should manage not only to deceive visual perception but also to evade monitoring by such measures. To this end, this paper first proposes the concept of measure imperceptibility. Then, a novel loss function is devised to generate adversarial perturbations that achieve both visual and measure imperceptibility. Furthermore, an efficient algorithm, which enjoys a convex objective, is established to optimize this loss. Finally, extensive experiments on large-scale benchmark datasets, such as PASCAL VOC 2012, MS COCO, and NUS WIDE, demonstrate the superiority of our proposed method in attacking top-$k$ multi-label systems.
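For context, Precision@$k$, one of the measures whose monitoring the attack aims to evade, can be computed as in the following standard sketch (not code from the paper):

```python
import numpy as np

def precision_at_k(scores: np.ndarray, labels: np.ndarray, k: int = 5) -> np.ndarray:
    # scores: (N, C) model outputs, labels: (N, C) binary ground truth.
    topk = np.argsort(-scores, axis=1)[:, :k]          # indices of top-k labels
    hits = np.take_along_axis(labels, topk, axis=1)    # which of them are correct
    return hits.mean(axis=1)                           # per-sample Precision@k
```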
+
+ comment: 22 pages, 7 figures, accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Discriminative Deep Feature Visualization for Explainable Face + Recognition + + +
+ Despite the huge success of deep convolutional neural networks in face +recognition (FR) tasks, current methods lack explainability for their +predictions because of their "black-box" nature. In recent years, studies have +been carried out to give an interpretation of the decision of a deep FR system. +However, the affinity between the input facial image and the extracted deep +features has not been explored. This paper contributes to the problem of +explainable face recognition by first conceiving a face reconstruction-based +explanation module, which reveals the correspondence between the deep feature +and the facial regions. To further interpret the decision of an FR model, a +novel visual saliency explanation algorithm has been proposed. It provides +insightful explanation by producing visual saliency maps that represent similar +and dissimilar regions between input faces. A detailed analysis has been +presented for the generated visual explanation to show the effectiveness of the +proposed method. + +
+
+
+
+
+ + ♻ ☆ Cross-resolution Face Recognition via Identity-Preserving Network and + Knowledge Distillation + + +
+ Cross-resolution face recognition has become a challenging problem for modern +deep face recognition systems. It aims at matching a low-resolution probe image +with high-resolution gallery images registered in a database. Existing methods +mainly leverage prior information from high-resolution images by either +reconstructing facial details with super-resolution techniques or learning a +unified feature space. To address this challenge, this paper proposes a new +approach that enforces the network to focus on the discriminative information +stored in the low-frequency components of a low-resolution image. A +cross-resolution knowledge distillation paradigm is first employed as the +learning framework. Then, an identity-preserving network, WaveResNet, and a +wavelet similarity loss are designed to capture low-frequency details and boost +performance. Finally, an image degradation model is conceived to simulate more +realistic low-resolution training data. Consequently, extensive experimental +results show that the proposed method consistently outperforms the baseline +model and other state-of-the-art methods across a variety of image resolutions. + +
+
+
+
+
+ + ♻ ☆ Automated GI tract segmentation using deep learning + + +
+ The job of radiation oncologists is to deliver X-ray beams pointed toward the tumor while avoiding the stomach and intestines. With MR-Linacs (magnetic resonance imaging and linear accelerator systems), oncologists can visualize the position of the tumor and deliver a precise dose according to tumor cell presence, which can vary from day to day. Currently, outlining the position of the stomach and intestines to adjust the X-ray beam direction for dose delivery to the tumor while avoiding the organs is done manually. This is a time-consuming and labor-intensive process that can easily prolong treatments from 15 minutes to an hour a day unless deep learning methods can automate the segmentation process. This paper discusses an automated segmentation process using deep learning to make this process faster and allow more patients to get effective treatment.
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Introspective Deep Metric Learning for Image Retrieval + + +
+ This paper proposes an introspective deep metric learning (IDML) framework +for uncertainty-aware comparisons of images. Conventional deep metric learning +methods produce confident semantic distances between images regardless of the +uncertainty level. However, we argue that a good similarity model should +consider the semantic discrepancies with caution to better deal with ambiguous +images for more robust training. To achieve this, we propose to represent an +image using not only a semantic embedding but also an accompanying uncertainty +embedding, which describes the semantic characteristics and ambiguity of an +image, respectively. We further propose an introspective similarity metric to +make similarity judgments between images considering both their semantic +differences and ambiguities. The proposed IDML framework improves the +performance of deep metric learning through uncertainty modeling and attains +state-of-the-art results on the widely used CUB-200-2011, Cars196, and Stanford +Online Products datasets for image retrieval and clustering. We further provide +an in-depth analysis of our framework to demonstrate the effectiveness and +reliability of IDML. Code is available at: https://github.com/wzzheng/IDML. + +
+
+ comment: The extended version of this paper is accepted to T-PAMI. Source code + available at https://github.com/wzzheng/IDML +
+
+
+
+
+ + ♻ ☆ SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera + Videos ICCV 2023 + + +
+ Camera-based 3D object detection in BEV (Bird's Eye View) space has drawn +great attention over the past few years. Dense detectors typically follow a +two-stage pipeline by first constructing a dense BEV feature and then +performing object detection in BEV space, which suffers from complex view +transformations and high computation cost. On the other side, sparse detectors +follow a query-based paradigm without explicit dense BEV feature construction, +but achieve worse performance than the dense counterparts. In this paper, we +find that the key to mitigate this performance gap is the adaptability of the +detector in both BEV and image space. To achieve this goal, we propose +SparseBEV, a fully sparse 3D object detector that outperforms the dense +counterparts. SparseBEV contains three key designs, which are (1) +scale-adaptive self attention to aggregate features with adaptive receptive +field in BEV space, (2) adaptive spatio-temporal sampling to generate sampling +locations under the guidance of queries, and (3) adaptive mixing to decode the +sampled features with dynamic weights from the queries. On the test split of +nuScenes, SparseBEV achieves the state-of-the-art performance of 67.5 NDS. On +the val split, SparseBEV achieves 55.8 NDS while maintaining a real-time +inference speed of 23.5 FPS. Code is available at +https://github.com/MCG-NJU/SparseBEV. + +
+
+ comment: Accepted to ICCV 2023. This version fixes some typos +
+
+
+
+
+ + ♻ ☆ Calibrated Out-of-Distribution Detection with a Generic Representation ICCV 2023 + + +
+ Out-of-distribution (OOD) detection is a common issue in deploying vision models in practice, and solving it is an essential building block in safety-critical applications. Most existing OOD detection solutions focus on improving the OOD robustness of a classification model trained exclusively on in-distribution (ID) data. In this work, we take a different approach and propose to leverage a generic pre-trained representation. We propose a novel OOD method, called GROOD, that formulates OOD detection as a Neyman-Pearson task with well-calibrated scores and achieves excellent performance, predicated on the use of a good generic representation. Only a trivial training process is required to adapt GROOD to a particular problem. The method is simple, general, efficient, calibrated, and has only a few hyper-parameters. It achieves state-of-the-art performance on a number of OOD benchmarks, reaching near-perfect performance on several of them. The source code is available at https://github.com/vojirt/GROOD.
+
+ comment: 10 pages, accepted to Workshop on Uncertainty Quantification for + Computer Vision, ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LFS-GAN: Lifelong Few-Shot Image Generation ICCV 2023 + + +
+ We address a challenging lifelong few-shot image generation task for the first time. In this situation, a generative model learns a sequence of tasks using only a few samples per task. Consequently, the learned model encounters both catastrophic forgetting and overfitting problems at the same time. Existing studies on lifelong GANs have proposed modulation-based methods to prevent catastrophic forgetting. However, they require considerable additional parameters and cannot generate high-fidelity and diverse images from limited data. On the other hand, existing few-shot GANs suffer from severe catastrophic forgetting when learning multiple tasks. To alleviate these issues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can generate high-quality and diverse images in the lifelong few-shot image generation task. Our proposed framework learns each task using an efficient task-specific modulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and has a rich representation ability due to its unique reconstruction technique. Furthermore, we propose a novel mode seeking loss to improve the diversity of our model in low-data circumstances. Extensive experiments demonstrate that the proposed LFS-GAN can generate high-fidelity and diverse images without any forgetting and mode collapse in various domains, achieving state-of-the-art results in the lifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN even outperforms the existing few-shot GANs in the few-shot image generation task. The code is available at GitHub.
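The abstract does not define LeFT beyond being a rank-constrained, task-specific modulator, so the sketch below only conveys the generic idea: a frozen base convolution whose weight is modulated by a low-rank learnable factor per task. The rank-1 outer-product form, the multiplicative modulation, and all names are our assumptions, not the paper's construction.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class LowRankModulatedConv(nn.Module):
    # Frozen base conv whose weight is modulated per task by a rank-constrained
    # (outer-product) learnable factor; a rough stand-in for a LeFT-style module.
    def __init__(self, base_conv: nn.Conv2d, rank: int = 1):
        super().__init__()
        self.base = base_conv
        for p in self.base.parameters():
            p.requires_grad_(False)  # only the modulator is trained per task
        out_c, in_c, kh, kw = base_conv.weight.shape
        self.u = nn.Parameter(torch.randn(rank, out_c) * 0.01)
        self.v = nn.Parameter(torch.randn(rank, in_c * kh * kw) * 0.01)

    def forward(self, x):
        out_c, in_c, kh, kw = self.base.weight.shape
        delta = (self.u.t() @ self.v).view(out_c, in_c, kh, kw)
        weight = self.base.weight * (1.0 + delta)  # multiplicative modulation
        return F.conv2d(x, weight, self.base.bias,
                        stride=self.base.stride, padding=self.base.padding)

# Example: wrap a 3x3 convolution and run a dummy forward pass.
layer = LowRankModulatedConv(nn.Conv2d(16, 32, 3, padding=1), rank=1)
out = layer(torch.randn(2, 16, 64, 64))
```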
+
+ comment: 20 pages, 19 figures, 14 tables, ICCV 2023 Poster +
+
+
+
+
+ + ♻ ☆ BAGM: A Backdoor Attack for Manipulating Text-to-Image Generative Models + + +
+ The rise in popularity of text-to-image generative artificial intelligence +(AI) has attracted widespread public interest. We demonstrate that this +technology can be attacked to generate content that subtly manipulates its +users. We propose a Backdoor Attack on text-to-image Generative Models (BAGM), +which upon triggering, infuses the generated images with manipulative details +that are naturally blended in the content. Our attack is the first to target +three popular text-to-image generative models across three stages of the +generative process by modifying the behaviour of the embedded tokenizer, the +language model or the image generative model. Based on the penetration level, +BAGM takes the form of a suite of attacks that are referred to as surface, +shallow and deep attacks in this article. Given the existing gap within this +domain, we also contribute a comprehensive set of quantitative metrics designed +specifically for assessing the effectiveness of backdoor attacks on +text-to-image models. The efficacy of BAGM is established by attacking +state-of-the-art generative models, using a marketing scenario as the target +domain. To that end, we contribute a dataset of branded product images. Our +embedded backdoors increase the bias towards the target outputs by more than +five times the usual, without compromising the model robustness or the +generated content utility. By exposing generative AI's vulnerabilities, we +encourage researchers to tackle these challenges and practitioners to exercise +caution when using pre-trained models. Relevant code, input prompts and +supplementary material can be found at https://github.com/JJ-Vice/BAGM, and the +dataset is available at: +https://ieee-dataport.org/documents/marketable-foods-mf-dataset. + Keywords: Generative Artificial Intelligence, Generative Models, +Text-to-Image generation, Backdoor Attacks, Trojan, Stable Diffusion. + +
+
+ comment: This research was supported by National Intelligence and Security + Discovery Research Grants (project# NS220100007), funded by the Department of + Defence Australia +
+
+
+
+
+ + ♻ ☆ Multi-level Multiple Instance Learning with Transformer for Whole Slide + Image Classification + + +
+ Whole slide image (WSI) refers to a type of high-resolution scanned tissue image, which is extensively employed in computer-assisted diagnosis (CAD). The extremely high resolution and limited availability of region-level annotations make it challenging to employ deep learning methods for WSI-based digital diagnosis. Recently, integrating multiple instance learning (MIL) and Transformers for WSI analysis has shown very promising results. However, designing effective Transformers for this weakly-supervised high-resolution image analysis is an underexplored yet important problem. In this paper, we propose a Multi-level MIL (MMIL) scheme by introducing a hierarchical structure to MIL, which enables efficient handling of MIL tasks involving a large number of instances. Based on MMIL, we instantiate MMIL-Transformer, an efficient Transformer model with windowed exact self-attention for large-scale MIL tasks. To validate its effectiveness, we conducted a set of experiments on WSI classification tasks, where MMIL-Transformer demonstrates superior performance compared to existing state-of-the-art methods, i.e., 96.80% test AUC and 97.67% test accuracy on the CAMELYON16 dataset, and 99.04% test AUC and 94.37% test accuracy on the TCGA-NSCLC dataset, respectively. All code and pre-trained models are available at: https://github.com/hustvl/MMIL-Transformer
+
+
+
+
+ + ♻ ☆ Shape of my heart: Cardiac models through learned signed distance + functions + + +
+ The efficient construction of an anatomical model is one of the major +challenges of patient-specific in-silico models of the human heart. Current +methods frequently rely on linear statistical models, allowing no advanced +topological changes, or requiring medical image segmentation followed by a +meshing pipeline, which strongly depends on image resolution, quality, and +modality. These approaches are therefore limited in their transferability to +other imaging domains. In this work, the cardiac shape is reconstructed by +means of three-dimensional deep signed distance functions with Lipschitz +regularity. For this purpose, the shapes of cardiac MRI reconstructions are +learned from public databases to model the spatial relation of multiple +chambers in Cartesian space. We demonstrate that this approach is also capable +of reconstructing anatomical models from partial data, such as point clouds +from a single ventricle, or modalities different from the trained MRI, such as +electroanatomical mapping, and in addition, allows us to generate new +anatomical shapes by randomly sampling latent vectors. + +
+
+
+
+
+ + ♻ ☆ Multi-manifold Attention for Vision Transformers + + +
+ Vision Transformers are very popular nowadays due to their state-of-the-art performance in several computer vision tasks, such as image classification and action recognition. Although their performance has been greatly enhanced through highly descriptive patch embeddings and hierarchical structures, there is still limited research on utilizing additional data representations to refine the self-attention map of a Transformer. To address this problem, a novel attention mechanism, called multi-manifold multihead attention, is proposed in this work to substitute for the vanilla self-attention of a Transformer. The proposed mechanism models the input space in three distinct manifolds, namely Euclidean, Symmetric Positive Definite, and Grassmann, thus leveraging different statistical and geometrical properties of the input for the computation of a highly descriptive attention map. In this way, the proposed attention mechanism can guide a Vision Transformer to become more attentive towards important appearance, color, and texture features of an image, leading to improved classification and segmentation results, as shown by the experimental results on well-known datasets.
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Learning Multiscale Consistency for Self-supervised Electron Microscopy + Instance Segmentation + + +
+ Instance segmentation in electron microscopy (EM) volumes is tough due to +complex shapes and sparse annotations. Self-supervised learning helps but still +struggles with intricate visual patterns in EM. To address this, we propose a +pretraining framework that enhances multiscale consistency in EM volumes. Our +approach leverages a Siamese network architecture, integrating both strong and +weak data augmentations to effectively extract multiscale features. We uphold +voxel-level coherence by reconstructing the original input data from these +augmented instances. Furthermore, we incorporate cross-attention mechanisms to +facilitate fine-grained feature alignment between these augmentations. Finally, +we apply contrastive learning techniques across a feature pyramid, allowing us +to distill distinctive representations spanning various scales. After +pretraining on four large-scale EM datasets, our framework significantly +improves downstream tasks like neuron and mitochondria segmentation, especially +with limited finetuning data. It effectively captures voxel and feature +consistency, showing promise for learning transferable representations for EM +analysis. + +
+
+
+
+
+ + ♻ ☆ MetaWeather: Few-Shot Weather-Degraded Image Restoration via Degradation + Pattern Matching + + +
+ Real-world vision tasks frequently suffer from the appearance of adverse +weather conditions including rain, fog, snow, and raindrops in captured images. +Recently, several generic methods for restoring weather-degraded images have +been proposed, aiming to remove multiple types of adverse weather effects +present in the images. However, these methods have considered weather as +discrete and mutually exclusive variables, leading to failure in generalizing +to unforeseen weather conditions beyond the scope of the training data, such as +the co-occurrence of rain, fog, and raindrops. To this end, weather-degraded +image restoration models should have flexible adaptability to the current +unknown weather condition to ensure reliable and optimal performance. The +adaptation method should also be able to cope with data scarcity for real-world +adaptation. This paper proposes MetaWeather, a few-shot weather-degraded image +restoration method for arbitrary weather conditions. For this, we devise the +core piece of MetaWeather, coined Degradation Pattern Matching Module (DPMM), +which leverages representations from a few-shot support set by matching +features between input and sample images under new weather conditions. In +addition, we build meta-knowledge with episodic meta-learning on top of our +MetaWeather architecture to provide flexible adaptability. In the meta-testing +phase, we adopt a parameter-efficient fine-tuning method to preserve the +prebuilt knowledge and avoid the overfitting problem. Experiments on the BID +Task II.A dataset show our method achieves the best performance on PSNR and +SSIM compared to state-of-the-art image restoration methods. Code is available +at (TBA). + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Object-Centric Multiple Object Tracking ICCV 2023 + + +
+ Unsupervised object-centric learning methods allow the partitioning of scenes
+into entities without additional localization information and are excellent
+candidates for reducing the annotation burden of multiple-object tracking (MOT)
+pipelines. Unfortunately, they lack two key properties: objects are often split
+into parts and are not consistently tracked over time. In fact,
+state-of-the-art models achieve pixel-level accuracy and temporal consistency
+by relying on supervised object detection with additional ID labels for the
+association through time. This paper proposes a video object-centric model for
+MOT. It consists of an index-merge module that adapts the object-centric slots
+into detection outputs and an object memory module that builds complete object
+prototypes to handle occlusions. Benefiting from object-centric learning, we
+only require sparse detection labels (0%-6.25%) for object localization and
+feature binding. Relying on our self-supervised
+Expectation-Maximization-inspired loss for object association, our approach
+requires no ID labels. Our experiments significantly narrow the gap between the
+existing object-centric model and the fully supervised state-of-the-art and
+outperform several unsupervised trackers.
+
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ♻ ☆ DDColor: Towards Photo-Realistic Image Colorization via Dual Decoders ICCV 2023 + + +
+ Image colorization is a challenging problem due to multi-modal uncertainty +and high ill-posedness. Directly training a deep neural network usually leads +to incorrect semantic colors and low color richness. While transformer-based +methods can deliver better results, they often rely on manually designed +priors, suffer from poor generalization ability, and introduce color bleeding +effects. To address these issues, we propose DDColor, an end-to-end method with +dual decoders for image colorization. Our approach includes a pixel decoder and +a query-based color decoder. The former restores the spatial resolution of the +image, while the latter utilizes rich visual features to refine color queries, +thus avoiding hand-crafted priors. Our two decoders work together to establish +correlations between color and multi-scale semantic representations via +cross-attention, significantly alleviating the color bleeding effect. +Additionally, a simple yet effective colorfulness loss is introduced to enhance +the color richness. Extensive experiments demonstrate that DDColor achieves +superior performance to existing state-of-the-art works both quantitatively and +qualitatively. The codes and models are publicly available at +https://github.com/piddnad/DDColor. + +
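+ The colorfulness loss mentioned above can be illustrated with the classic
+Hasler-Süsstrunk colorfulness statistic; treat this PyTorch snippet as a hedged
+sketch, since DDColor's exact formulation may differ:
+
+```python
+import torch
+
+def colorfulness(img):
+    """Hasler-Süsstrunk colorfulness for an RGB batch (B, 3, H, W) in [0, 1]."""
+    r, g, b = img[:, 0], img[:, 1], img[:, 2]
+    rg = r - g
+    yb = 0.5 * (r + g) - b
+    std = torch.sqrt(rg.var(dim=(1, 2)) + yb.var(dim=(1, 2)) + 1e-8)
+    mean = torch.sqrt(rg.mean(dim=(1, 2)) ** 2 + yb.mean(dim=(1, 2)) ** 2 + 1e-8)
+    return std + 0.3 * mean                     # higher value = more colorful
+
+pred = torch.rand(4, 3, 64, 64)                 # colorized output of the network
+loss_color = -colorfulness(pred).mean()         # encourages richer colors; added
+                                                # to the main training loss
+```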
+
+ comment: ICCV 2023; Code: https://github.com/piddnad/DDColor +
+
+
+
+
+ + ♻ ☆ Predict to Detect: Prediction-guided 3D Object Detection using + Sequential Images ICCV 2023 + + +
+ Recent camera-based 3D object detection methods have introduced sequential +frames to improve the detection performance hoping that multiple frames would +mitigate the large depth estimation error. Despite improved detection +performance, prior works rely on naive fusion methods (e.g., concatenation) or +are limited to static scenes (e.g., temporal stereo), neglecting the importance +of the motion cue of objects. These approaches do not fully exploit the +potential of sequential images and show limited performance improvements. To +address this limitation, we propose a novel 3D object detection model, P2D +(Predict to Detect), that integrates a prediction scheme into a detection +framework to explicitly extract and leverage motion features. P2D predicts +object information in the current frame using solely past frames to learn +temporal motion features. We then introduce a novel temporal feature +aggregation method that attentively exploits Bird's-Eye-View (BEV) features +based on predicted object information, resulting in accurate 3D object +detection. Experimental results demonstrate that P2D improves mAP and NDS by +3.0% and 3.7% compared to the sequential image-based baseline, illustrating +that incorporating a prediction scheme can significantly improve detection +accuracy. + +
+
+ comment: ICCV 2023, Code: https://github.com/sanmin0312/P2D +
+
+
+
+
+ + ♻ ☆ Boosting the Adversarial Transferability of Surrogate Models with Dark + Knowledge ICTAI + + +
+ Deep neural networks (DNNs) are vulnerable to adversarial examples. Moreover,
+adversarial examples exhibit transferability, which means that an adversarial
+example for a DNN model can fool another model with a non-trivial probability.
+This gave birth to the transfer-based attack, where the adversarial examples
+generated by a surrogate model are used to conduct black-box attacks. There is
+some work on generating adversarial examples from a given surrogate model with
+better transferability. However, training a special surrogate model to
+generate adversarial examples with better transferability is relatively
+under-explored. This paper proposes a method for training a surrogate model
+with dark knowledge to boost the transferability of the adversarial examples
+generated by the surrogate model. This trained surrogate model is named the
+dark surrogate model (DSM). The proposed method for training a DSM consists of
+two key components: a teacher model extracting dark knowledge, and a mixing
+augmentation technique enhancing the dark knowledge of training data. We
+conducted extensive experiments to show that the proposed method can
+substantially improve the adversarial transferability of surrogate models
+across different surrogate-model architectures and optimizers for generating
+adversarial examples, and that it can be applied to other transfer-based attack
+scenarios that contain dark knowledge, like face verification. Our code is
+publicly available at \url{https://github.com/ydc123/Dark_Surrogate_Model}.
+
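+ The "dark knowledge" component is essentially knowledge distillation from a
+teacher's softened outputs; a minimal PyTorch sketch is given below (the
+temperature is an assumption, and the paper's mixing augmentation is only noted
+in a comment):
+
+```python
+import torch
+import torch.nn.functional as F
+
+def dark_surrogate_loss(student_logits, teacher_logits, T=4.0):
+    """Train the surrogate on the teacher's softened probabilities ("dark
+    knowledge") instead of hard labels. The paper additionally mixes augmented
+    images (e.g. CutMix/Mixup) to enrich this soft supervision."""
+    p_teacher = F.softmax(teacher_logits / T, dim=1)
+    log_p_student = F.log_softmax(student_logits / T, dim=1)
+    return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * T * T
+
+# usage sketch: the teacher is any pretrained classifier, the student becomes the DSM
+teacher_logits = torch.randn(16, 1000)
+student_logits = torch.randn(16, 1000, requires_grad=True)
+dark_surrogate_loss(student_logits, teacher_logits).backward()
+```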
+
+ comment: Accepted at 2023 International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ Neural Video Compression with Temporal Layer-Adaptive Hierarchical + B-frame Coding + + +
+ Neural video compression (NVC) is a rapidly evolving video coding research +area, with some models achieving superior coding efficiency compared to the +latest video coding standard Versatile Video Coding (VVC). In conventional +video coding standards, the hierarchical B-frame coding, which utilizes a +bidirectional prediction structure for higher compression, had been +well-studied and exploited. In NVC, however, limited research has investigated +the hierarchical B scheme. In this paper, we propose an NVC model exploiting +hierarchical B-frame coding with temporal layer-adaptive optimization. We first +extend an existing unidirectional NVC model to a bidirectional model, which +achieves -21.13% BD-rate gain over the unidirectional baseline model. However, +this model faces challenges when applied to sequences with complex or large +motions, leading to performance degradation. To address this, we introduce +temporal layer-adaptive optimization, incorporating methods such as temporal +layer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent +scaling (TALS). The final model with the proposed methods achieves an +impressive BD-rate gain of -39.86% against the baseline. It also resolves the +challenges in sequences with large or complex motions with up to -49.13% more +BD-rate gains than the simple bidirectional extension. This improvement is +attributed to the allocation of more bits to lower temporal layers, thereby +enhancing overall reconstruction quality with smaller bits. Since our method +has little dependency on a specific NVC model architecture, it can serve as a +general tool for extending unidirectional NVC models to the ones with +hierarchical B-frame coding. + +
+
+
+
+
+ + ♻ ☆ Application of Machine Learning in Melanoma Detection and the + Identification of 'Ugly Duckling' and Suspicious Naevi: A Review + + +
+ Skin lesions known as naevi exhibit diverse characteristics such as size, +shape, and colouration. The concept of an "Ugly Duckling Naevus" comes into +play when monitoring for melanoma, referring to a lesion with distinctive +features that sets it apart from other lesions in the vicinity. As lesions +within the same individual typically share similarities and follow a +predictable pattern, an ugly duckling naevus stands out as unusual and may +indicate the presence of a cancerous melanoma. Computer-aided diagnosis (CAD) +has become a significant player in the research and development field, as it +combines machine learning techniques with a variety of patient analysis +methods. Its aim is to increase accuracy and simplify decision-making, all +while responding to the shortage of specialized professionals. These automated +systems are especially important in skin cancer diagnosis where specialist +availability is limited. As a result, their use could lead to life-saving +benefits and cost reductions within healthcare. Given the drastic change in +survival when comparing early stage to late-stage melanoma, early detection is +vital for effective treatment and patient outcomes. Machine learning (ML) and +deep learning (DL) techniques have gained popularity in skin cancer +classification, effectively addressing challenges, and providing results +equivalent to that of specialists. This article extensively covers modern +Machine Learning and Deep Learning algorithms for detecting melanoma and +suspicious naevi. It begins with general information on skin cancer and +different types of naevi, then introduces AI, ML, DL, and CAD. The article then +discusses the successful applications of various ML techniques like +convolutional neural networks (CNN) for melanoma detection compared to +dermatologists' performance. Lastly, it examines ML methods for UD naevus +detection and identifying suspicious naevi. + +
+
+
+
+
+ + ♻ ☆ RecRecNet: Rectangling Rectified Wide-Angle Images by Thin-Plate Spline + Model and DoF-based Curriculum Learning ICCV 2023 + + +
+ The wide-angle lens shows appealing applications in VR technologies, but it
+introduces severe radial distortion into its captured image. To recover the
+realistic scene, previous works are devoted to rectifying the content of the
+wide-angle image. However, such a rectification solution inevitably distorts
+the image boundary, which changes related geometric distributions and misleads
+the current vision perception models. In this work, we explore constructing a
+win-win representation on both content and boundary by contributing a new
+learning model, i.e., the Rectangling Rectification Network (RecRecNet). In
+particular, we propose a thin-plate spline (TPS) module to formulate the
+non-linear and non-rigid transformation for rectangling images. By learning the
+control points on the rectified image, our model can flexibly warp the source
+structure to the target domain and achieve end-to-end unsupervised
+deformation. To relieve the complexity of structure approximation, we then
+inspire our RecRecNet to learn the gradual deformation rules with DoF (Degree
+of Freedom)-based curriculum learning. By increasing the DoF in each curriculum
+stage, namely, from similarity transformation (4-DoF) to homography
+transformation (8-DoF), the network is capable of investigating more detailed
+deformations, offering fast convergence on the final rectangling task.
+Experiments show the superiority of our solution over the compared methods on
+both quantitative and qualitative evaluations. The code and dataset are
+available at https://github.com/KangLiao929/RecRecNet.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Benchmarking Jetson Edge Devices with an End-to-end Video-based Anomaly + Detection System + + +
+ Innovative enhancements in embedded system platforms, specifically hardware
+accelerations, significantly influence the application of deep learning in
+real-world scenarios. These innovations translate human labor efforts into
+automated intelligent systems employed in various areas such as autonomous
+driving, robotics, Internet-of-Things (IoT), and numerous other impactful
+applications. NVIDIA's Jetson platform is one of the pioneers in offering
+optimal performance regarding energy efficiency and throughput in the execution
+of deep learning algorithms. Previously, most benchmarking analysis was based
+on 2D images with a single deep learning model for each comparison result. In
+this paper, we implement an end-to-end video-based crime-scene anomaly
+detection system that takes surveillance videos as input; the system is
+deployed and operates entirely on multiple Jetson edge devices (Nano, AGX
+Xavier, Orin Nano). The comparison analysis includes the integration of
+Torch-TensorRT as a software development kit from NVIDIA for model performance
+optimisation. The system is built on the PySlowfast open-source project from
+Facebook as the coding template. The end-to-end pipeline comprises video
+capture from the camera, data preprocessing, feature extraction, and anomaly
+detection. We describe the experience of deploying an AI-based system on
+various Jetson Edge devices with Docker technology. Regarding anomaly
+detectors, a weakly supervised video-based deep learning model called Robust
+Temporal Feature Magnitude Learning (RTFM) is applied in the system. The system
+reaches an inference speed of 47.56 frames per second (FPS) on a Jetson edge
+device with a total RAM usage of only 3.11 GB. We also identify a promising
+Jetson device on which the AI system achieves 15% better performance than the
+previous generation of Jetson devices while consuming 50% less energy.
+
+
+ comment: 18 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Zero-guidance Segmentation Using Zero Segment Labels + + +
+ CLIP has enabled new and exciting joint vision-language applications, one of
+which is open-vocabulary segmentation, which can locate any segment given an
+arbitrary text query. In our research, we ask whether it is possible to
+discover semantic segments without any user guidance in the form of text
+queries or predefined classes, and to label them automatically using natural
+language. We propose a novel problem, zero-guidance segmentation, and the
+first baseline that leverages two pre-trained generalist models, DINO and CLIP,
+to solve this problem without any fine-tuning or segmentation dataset. The
+general idea is to first segment an image into small over-segments, encode them
+into CLIP's visual-language space, translate them into text labels, and merge
+semantically similar segments together. The key challenge, however, is how to
+encode a visual segment into a segment-specific embedding that balances global
+and local context information, both useful for recognition. Our main
+contribution is a novel attention-masking technique that balances the two
+contexts by analyzing the attention layers inside CLIP. We also introduce
+several metrics for the evaluation of this new task. With CLIP's innate
+knowledge, our method can precisely locate the Mona Lisa painting among a
+museum crowd. Project page: https://zero-guide-seg.github.io/.
+
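+ A rough sketch of the segment-encode-merge pipeline described above, assuming
+the openai CLIP package and scikit-image; it uses plain crops rather than the
+paper's attention-masking inside CLIP, and the threshold and file name are
+placeholders:
+
+```python
+import numpy as np
+import torch
+import clip                                      # https://github.com/openai/CLIP
+from PIL import Image
+from skimage.segmentation import slic
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+image = Image.open("scene.jpg").convert("RGB")
+segments = slic(np.array(image), n_segments=50, compactness=10)   # over-segments
+
+embs = []
+for sid in np.unique(segments):
+    ys, xs = np.where(segments == sid)
+    crop = image.crop((int(xs.min()), int(ys.min()),
+                       int(xs.max()) + 1, int(ys.max()) + 1))
+    with torch.no_grad():
+        e = model.encode_image(preprocess(crop).unsqueeze(0).to(device))
+    embs.append(torch.nn.functional.normalize(e.float(), dim=-1))
+embs = torch.cat(embs)
+
+sim = embs @ embs.T                  # cosine similarity between segment embeddings
+merge = sim > 0.9                    # merge semantically similar over-segments
+```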
+
+
+
+
+ + ♻ ☆ YOLOrtho -- A Unified Framework for Teeth Enumeration and Dental Disease + Detection + + +
+ Detecting dental diseases through panoramic X-ray images is a standard
+procedure for dentists. Normally, a dentist needs to identify diseases and find
+the infected teeth. While numerous machine learning models adopting this
+two-step procedure have been developed, there has not been an end-to-end model
+that can identify teeth and their associated diseases at the same time. To fill
+the gap, we develop YOLOrtho, a unified framework for teeth enumeration and
+dental disease detection. We develop our model on Dentex Challenge 2023 data,
+which consists of three distinct types of annotated data: the first part is
+labeled with quadrant only, the second with quadrant and enumeration, and the
+third with quadrant, enumeration, and disease. To further improve detection, we
+make use of the Tufts Dental public dataset. To fully utilize the data and
+learn both teeth detection and disease identification simultaneously, we
+formulate diseases as attributes attached to their corresponding teeth. Because
+tooth enumeration depends strongly on positional relations, we replace the
+convolution layer with CoordConv in our model to provide more position
+information for the model. We also adjust the model architecture and insert one
+more upsampling layer in the FPN in favor of large object detection. Finally,
+we propose a post-processing strategy for the teeth layout that corrects teeth
+enumeration based on linear sum assignment. Results from experiments show that
+our model exceeds a large diffusion-based model.
+
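+ CoordConv, the substitution mentioned above, simply appends normalized x/y
+coordinate channels before a standard convolution so the layer can reason about
+absolute tooth position; a generic PyTorch sketch with illustrative sizes:
+
+```python
+import torch
+import torch.nn as nn
+
+class CoordConv2d(nn.Module):
+    """Conv2d over the input concatenated with two coordinate channels."""
+    def __init__(self, in_ch, out_ch, **kwargs):
+        super().__init__()
+        self.conv = nn.Conv2d(in_ch + 2, out_ch, **kwargs)
+
+    def forward(self, x):
+        b, _, h, w = x.shape
+        ys = torch.linspace(-1, 1, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
+        xs = torch.linspace(-1, 1, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
+        return self.conv(torch.cat([x, xs, ys], dim=1))   # position-aware features
+
+layer = CoordConv2d(64, 128, kernel_size=3, padding=1)
+out = layer(torch.randn(2, 64, 80, 80))                   # -> (2, 128, 80, 80)
+```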
+
+
+
+
+ + ♻ ☆ BinaryViT: Towards Efficient and Accurate Binary Vision Transformers + + +
+ Vision Transformers (ViTs) have emerged as the fundamental architecture for
+most computer vision fields, but the considerable memory and computation costs
+hinder their application on resource-limited devices. As one of the most
+powerful compression methods, binarization reduces the computation of the
+neural network by quantizing the weights and activation values as $\pm$1.
+Although existing binarization methods have demonstrated excellent performance
+on Convolutional Neural Networks (CNNs), the full binarization of ViTs is still
+under-studied and suffers a significant performance drop. In this paper, we
+first argue empirically that the severe performance degradation is mainly
+caused by the weight oscillation in the binarization training and the
+information distortion in the activation of ViTs. Based on these analyses, we
+propose $\textbf{BinaryViT}$, an accurate full binarization scheme for ViTs,
+which pushes the quantization of ViTs to the limit. Specifically, we propose a
+novel gradient regularization scheme (GRS) for driving a bimodal distribution
+of the weights to reduce oscillation in binarization training. Moreover, we
+design an activation shift module (ASM) to adaptively tune the activation
+distribution to reduce the information distortion caused by binarization.
+Extensive experiments on the ImageNet dataset show that our BinaryViT
+consistently surpasses the strong baseline by 2.05% and improves the accuracy
+of fully binarized ViTs to a usable level. Furthermore, our method achieves
+impressive savings of 16.2$\times$ and 17.7$\times$ in model size and OPs
+compared to the full-precision DeiT-S.
+
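+ Full binarization schemes typically build on sign quantization trained with a
+straight-through estimator; the PyTorch sketch below shows that generic
+building block, not BinaryViT's specific GRS or ASM modules:
+
+```python
+import torch
+
+class BinarizeSTE(torch.autograd.Function):
+    """Forward: quantize to +/-1 with sign(). Backward: pass gradients through
+    for inputs with |x| <= 1 (clipped straight-through estimator)."""
+    @staticmethod
+    def forward(ctx, x):
+        ctx.save_for_backward(x)
+        return torch.sign(x)
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        (x,) = ctx.saved_tensors
+        return grad_out * (x.abs() <= 1).to(grad_out.dtype)
+
+w = torch.randn(8, 8, requires_grad=True)      # latent full-precision weights
+w_bin = BinarizeSTE.apply(w)                   # +/-1 weights used in the binary layer
+w_bin.sum().backward()                         # gradients still reach the latent weights
+```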
+
+ comment: We will be making some significant changes to the paper, including + the title and methodology. We therefore wish to withdraw the paper for now +
+
+
+
+
+ + ♻ ☆ Spatially and Spectrally Consistent Deep Functional Maps ICCV2023 + + +
+ Cycle consistency has long been exploited as a powerful prior for jointly +optimizing maps within a collection of shapes. In this paper, we investigate +its utility in the approaches of Deep Functional Maps, which are considered +state-of-the-art in non-rigid shape matching. We first justify that under +certain conditions, the learned maps, when represented in the spectral domain, +are already cycle consistent. Furthermore, we identify the discrepancy that +spectrally consistent maps are not necessarily spatially, or point-wise, +consistent. In light of this, we present a novel design of unsupervised Deep +Functional Maps, which effectively enforces the harmony of learned maps under +the spectral and the point-wise representation. By taking advantage of cycle +consistency, our framework produces state-of-the-art results in mapping shapes +even under significant distortions. Beyond that, by independently estimating +maps in both spectral and spatial domains, our method naturally alleviates +over-fitting in network training, yielding superior generalization performance +and accuracy within an array of challenging tests for both near-isometric and +non-isometric datasets. Codes are available at +https://github.com/rqhuang88/Spatiallyand-Spectrally-Consistent-Deep-Functional-Maps. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Towards Generalist Foundation Model for Radiology + + +
+ In this study, we aim to initiate the development of a Radiology Foundation
+Model, termed RadFM. We consider the construction of foundational models from
+the perspectives of dataset construction, model design, and thorough
+evaluation. Our contributions can be summarized as follows: (i) we construct a
+large-scale Medical Multi-modal Dataset, MedMD, which consists of 16M 2D and 3D
+medical scans with high-quality text descriptions or reports across various
+data formats, modalities, and tasks, covering over 5000 distinct diseases. To
+the best of our knowledge, this is the first large-scale, high-quality, medical
+visual-language dataset, with both 2D and 3D scans; (ii) we propose an
+architecture that enables visually conditioned generative pre-training, i.e.,
+allowing for integration of text input with 2D or 3D medical scans, and
+generating responses for diverse radiologic tasks. The model was initially
+pre-trained on MedMD and subsequently fine-tuned on the domain-specific
+dataset, a cleaned radiologic version of MedMD containing 3M radiologic
+visual-language pairs, termed RadMD; (iii) we propose a new evaluation
+benchmark, RadBench, that comprises five tasks, including modality recognition,
+disease diagnosis, visual question answering, report generation and rationale
+diagnosis, aiming to comprehensively assess the capability of foundation models
+in handling practical clinical problems. We conduct both automatic and human
+evaluation on RadBench; in both cases, RadFM significantly outperforms existing
+multi-modal foundation models. The codes, data, and model checkpoint will all
+be made publicly available to promote further research and development in the
+field.
+
+
+
+
+
+ + ♻ ☆ An Empirical Study on the Language Modal in Visual Question Answering IJCAI2023 + + +
+ Generalization beyond in-domain experience to out-of-distribution data is of +paramount significance in the AI domain. Of late, state-of-the-art Visual +Question Answering (VQA) models have shown impressive performance on in-domain +data, partially due to the language priors bias which, however, hinders the +generalization ability in practice. This paper attempts to provide new insights +into the influence of language modality on VQA performance from an empirical +study perspective. To achieve this, we conducted a series of experiments on six +models. The results of these experiments revealed that, 1) apart from prior +bias caused by question types, there is a notable influence of postfix-related +bias in inducing biases, and 2) training VQA models with word-sequence-related +variant questions demonstrated improved performance on the out-of-distribution +benchmark, and the LXMERT even achieved a 10-point gain without adopting any +debiasing methods. We delved into the underlying reasons behind these +experimental results and put forward some simple proposals to reduce the +models' dependency on language priors. The experimental results demonstrated +the effectiveness of our proposed method in improving performance on the +out-of-distribution benchmark, VQA-CPv2. We hope this study can inspire novel +insights for future research on designing bias-reduction approaches. + +
+
+ comment: Accepted by IJCAI2023 +
+
+
+
+
+ + ♻ ☆ Moby: Empowering 2D Models for Efficient Point Cloud Analytics on the + Edge + + +
+ 3D object detection plays a pivotal role in many applications, most notably +autonomous driving and robotics. These applications are commonly deployed on +edge devices to promptly interact with the environment, and often require near +real-time response. With limited computation power, it is challenging to +execute 3D detection on the edge using highly complex neural networks. Common +approaches such as offloading to the cloud induce significant latency overheads +due to the large amount of point cloud data during transmission. To resolve the +tension between wimpy edge devices and compute-intensive inference workloads, +we explore the possibility of empowering fast 2D detection to extrapolate 3D +bounding boxes. To this end, we present Moby, a novel system that demonstrates +the feasibility and potential of our approach. We design a transformation +pipeline for Moby that generates 3D bounding boxes efficiently and accurately +based on 2D detection results without running 3D detectors. Further, we devise +a frame offloading scheduler that decides when to launch the 3D detector +judiciously in the cloud to avoid the errors from accumulating. Extensive +evaluations on NVIDIA Jetson TX2 with real-world autonomous driving datasets +demonstrate that Moby offers up to 91.9% latency improvement with modest +accuracy loss over state of the art. + +
+
+ comment: Accepted to ACM International Conference on Multimedia (MM) 2023 +
+
+
+
+
+ + ♻ ☆ Cross-modal Orthogonal High-rank Augmentation for RGB-Event + Transformer-trackers ICCV + + +
+ This paper addresses the problem of cross-modal object tracking from RGB
+videos and event data. Rather than constructing a complex cross-modal fusion
+network, we explore the great potential of a pre-trained vision Transformer
+(ViT). Particularly, we delicately investigate plug-and-play training
+augmentations that encourage the ViT to bridge the vast distribution gap
+between the two modalities, enabling comprehensive cross-modal information
+interaction and thus enhancing its ability. Specifically, we propose a mask
+modeling strategy that randomly masks a specific modality of some tokens to
+enforce proactive interaction between tokens from different modalities. To
+mitigate network oscillations resulting from the masking strategy and further
+amplify its positive effect, we then theoretically propose an orthogonal
+high-rank loss to regularize the attention matrix. Extensive experiments
+demonstrate that our plug-and-play training augmentation techniques can
+significantly boost state-of-the-art one-stream and two-stream trackers in
+terms of both tracking precision and success rate. Our new perspective and
+findings will potentially bring insights to the field of leveraging powerful
+pre-trained ViTs to model cross-modal data. The code will be publicly
+available.
+
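+ One generic way to encourage an orthogonal, high-rank attention matrix is to
+penalize the distance of its Gram matrix from the identity; the PyTorch sketch
+below illustrates that spirit only and is not the paper's exact loss:
+
+```python
+import torch
+
+def orthogonal_highrank_penalty(attn):
+    """attn: (B, heads, N, N) attention maps. Pushing A @ A^T towards the
+    identity encourages near-orthogonal rows and hence a high-rank matrix."""
+    b, h, n, _ = attn.shape
+    eye = torch.eye(n, device=attn.device).expand(b, h, n, n)
+    gram = attn @ attn.transpose(-1, -2)
+    return ((gram - eye) ** 2).mean()
+
+attn = torch.softmax(torch.randn(2, 4, 16, 16), dim=-1)
+reg = orthogonal_highrank_penalty(attn)   # added to the tracking loss with a small weight
+```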
+
+ comment: accepted by ICCV +
+
+
+
+
+ + ♻ ☆ Multi-Prompt with Depth Partitioned Cross-Modal Learning + + +
+ In recent years, soft prompt learning methods have been proposed to fine-tune +large-scale vision-language pre-trained models for various downstream tasks. +These methods typically combine learnable textual tokens with class tokens as +input for models with frozen parameters. However, they often employ a single +prompt to describe class contexts, failing to capture categories' diverse +attributes adequately. This study introduces the Partitioned Multi-modal Prompt +(PMPO), a multi-modal prompting technique that extends the soft prompt from a +single learnable prompt to multiple prompts. Our method divides the visual +encoder depths and connects learnable prompts to the separated visual depths, +enabling different prompts to capture the hierarchical contextual depths of +visual representations. Furthermore, to maximize the advantages of multi-prompt +learning, we incorporate prior information from manually designed templates and +learnable multi-prompts, thus improving the generalization capabilities of our +approach. We evaluate the effectiveness of our approach on three challenging +tasks: new class generalization, cross-dataset evaluation, and domain +generalization. For instance, our method achieves a $79.28$ harmonic mean, +averaged over 11 diverse image recognition datasets ($+7.62$ compared to CoOp), +demonstrating significant competitiveness compared to state-of-the-art +prompting methods. + +
+
+
+
+
+ + ♻ ☆ Learning to Sample Tasks for Meta Learning + + +
+ Through experiments on various meta-learning methods, task samplers, and
+few-shot learning tasks, this paper arrives at three conclusions. Firstly,
+there are no universal task sampling strategies that guarantee the performance
+of meta-learning models. Secondly, task diversity can cause the models to
+either underfit or overfit during training. Lastly, the generalization
+performance of the models is influenced by task divergence, task entropy, and
+task difficulty. In response to these findings, we propose a novel task sampler
+called Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes
+task divergence, task entropy, and task difficulty into account to sample
+tasks. To optimize ASr, we rethink and propose a simple and general
+meta-learning algorithm. Finally, a large number of empirical experiments
+demonstrate the effectiveness of the proposed ASr.
+
+
+ comment: 10 pages, 7 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ Tuning Pre-trained Model via Moment Probing ICCV 2023 + + +
+ Recently, efficient fine-tuning of large-scale pre-trained models has
+attracted increasing research interest, where linear probing (LP) as a
+fundamental module is involved in exploiting the final representations for
+task-dependent classification. However, most of the existing methods focus on
+how to effectively introduce a few learnable parameters, and little work pays
+attention to the commonly used LP module. In this paper, we propose a novel
+Moment Probing (MP) method to further explore the potential of LP.
+Distinguished from LP, which builds a linear classification head based on the
+mean of final features (e.g., word tokens for ViT) or classification tokens,
+our MP performs a linear classifier on the feature distribution, which provides
+stronger representation ability by exploiting richer statistical information
+inherent in features. Specifically, we represent the feature distribution by
+its characteristic function, which is efficiently approximated by using first-
+and second-order moments of features. Furthermore, we propose a multi-head
+convolutional cross-covariance (MHC$^3$) to compute second-order moments in an
+efficient and effective manner. By considering that MP could affect feature
+learning, we introduce a partially shared module to learn two recalibrating
+parameters (PSRP) for backbones based on MP, namely MP$_{+}$. Extensive
+experiments on ten benchmarks using various models show that our MP
+significantly outperforms LP and is competitive with counterparts at lower
+training cost, while our MP$_{+}$ achieves state-of-the-art performance.
+
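+ A simplified stand-in for the moment-probing idea: feed the first- and
+second-order moments of the frozen token features to a linear head. The
+characteristic-function view and the MHC^3 module from the paper are not
+reproduced here:
+
+```python
+import torch
+import torch.nn as nn
+
+class MomentProbe(nn.Module):
+    """Linear head on [mean, covariance] of the token features instead of the
+    mean token alone."""
+    def __init__(self, dim, num_classes):
+        super().__init__()
+        self.fc = nn.Linear(dim + dim * dim, num_classes)
+
+    def forward(self, tokens):               # tokens: (B, N, dim) from a frozen backbone
+        mu = tokens.mean(dim=1)              # first-order moment
+        centered = tokens - mu.unsqueeze(1)
+        cov = centered.transpose(1, 2) @ centered / tokens.shape[1]   # second-order moment
+        return self.fc(torch.cat([mu, cov.flatten(1)], dim=1))
+
+head = MomentProbe(dim=192, num_classes=100)
+logits = head(torch.randn(8, 197, 192))      # e.g. a ViT-Tiny token sequence
+```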
+
+ comment: Accepted to ICCV 2023; Project Page: + https://github.com/mingzeG/Moment-Probing +
+
+
+
+
+ + ♻ ☆ RS2G: Data-Driven Scene-Graph Extraction and Embedding for Robust + Autonomous Perception and Scenario Understanding + + +
+ Effectively capturing intricate interactions among road users is of critical +importance to achieving safe navigation for autonomous vehicles. While graph +learning (GL) has emerged as a promising approach to tackle this challenge, +existing GL models rely on predefined domain-specific graph extraction rules +that often fail in real-world drastically changing scenarios. Additionally, +these graph extraction rules severely impede the capability of existing GL +methods to generalize knowledge across domains. To address this issue, we +propose RoadScene2Graph (RS2G), an innovative autonomous scenario understanding +framework with a novel data-driven graph extraction and modeling approach that +dynamically captures the diverse relations among road users. Our evaluations +demonstrate that on average RS2G outperforms the state-of-the-art (SOTA) +rule-based graph extraction method by 4.47% and the SOTA deep learning model by +22.19% in subjective risk assessment. More importantly, RS2G delivers notably +better performance in transferring knowledge gained from simulation +environments to unseen real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ Fed-CPrompt: Contrastive Prompt for Rehearsal-Free Federated Continual + Learning ICML 2023 + + +
+ Federated continual learning (FCL) learns incremental tasks over time from
+confidential datasets distributed across clients. This paper focuses on
+rehearsal-free FCL, which has severe forgetting issues when learning new tasks
+due to the lack of access to historical task data. To address this issue, we
+propose Fed-CPrompt, based on prompt learning techniques, to obtain
+task-specific prompts in a communication-efficient way. Fed-CPrompt introduces
+two key components, asynchronous prompt learning and a contrastive continual
+loss, to handle asynchronous task arrival and heterogeneous data distributions
+in FCL, respectively. Extensive experiments demonstrate the effectiveness of
+Fed-CPrompt in achieving SOTA rehearsal-free FCL performance.
+
+
+ comment: Accepted by FL-ICML 2023 +
+
+
+
+
+ + ♻ ☆ Probabilistic and Semantic Descriptions of Image Manifolds and Their + Applications + + +
+ This paper begins with a description of methods for estimating probability +density functions for images that reflects the observation that such data is +usually constrained to lie in restricted regions of the high-dimensional image +space - not every pattern of pixels is an image. It is common to say that +images lie on a lower-dimensional manifold in the high-dimensional space. +However, although images may lie on such lower-dimensional manifolds, it is not +the case that all points on the manifold have an equal probability of being +images. Images are unevenly distributed on the manifold, and our task is to +devise ways to model this distribution as a probability distribution. In +pursuing this goal, we consider generative models that are popular in AI and +computer vision community. For our purposes, generative/probabilistic models +should have the properties of 1) sample generation: it should be possible to +sample from this distribution according to the modelled density function, and +2) probability computation: given a previously unseen sample from the dataset +of interest, one should be able to compute the probability of the sample, at +least up to a normalising constant. To this end, we investigate the use of +methods such as normalising flow and diffusion models. We then show that such +probabilistic descriptions can be used to construct defences against +adversarial attacks. In addition to describing the manifold in terms of +density, we also consider how semantic interpretations can be used to describe +points on the manifold. To this end, we consider an emergent language framework +which makes use of variational encoders to produce a disentangled +representation of points that reside on a given manifold. Trajectories between +points on a manifold can then be described in terms of evolving semantic +descriptions. + +
+
+ comment: 24 pages, 17 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Spherical and Hyperbolic Toric Topology-Based Codes On Graph Embedding + for Ising MRF Models: Classical and Quantum Topology Machine Learning + + +
+ The paper introduces the application of information geometry to describe the
+ground states of Ising models by utilizing parity-check matrices of cyclic and
+quasi-cyclic codes on toric and spherical topologies. The approach establishes
+a connection between machine learning and error-correcting coding. This
+proposed approach has implications for the development of new embedding methods
+based on trapping sets. Statistical physics and number geometry are applied to
+optimize error-correcting codes, leading to these embedding and sparse
+factorization methods. The paper establishes a direct connection between DNN
+architecture and error-correcting coding by demonstrating how state-of-the-art
+architectures (ChordMixer, Mega, Mega-chunk, CDIL, ...) from the long-range
+arena can be equivalent to block and convolutional LDPC codes (Cage-graph,
+Repeat Accumulate). QC codes correspond to certain types of chemical elements,
+with the carbon element being represented by the mixed automorphism
+Shu-Lin-Fossorier QC-LDPC code. The connections between Belief Propagation and
+the Permanent, Bethe-Permanent, Nishimori Temperature, and Bethe-Hessian Matrix
+are elaborated upon in detail. The Quantum Approximate Optimization Algorithm
+(QAOA) used in the Sherrington-Kirkpatrick Ising model can be seen as analogous
+to the back-propagation loss function landscape in training DNNs. This
+similarity creates a comparable problem with TS pseudo-codewords, resembling
+the belief propagation method. Additionally, the layer depth in QAOA correlates
+to the number of decoding belief propagation iterations in the Wiberg decoding
+tree. Overall, this work has the potential to advance multiple fields, from
+Information Theory, DNN architecture design (sparse and structured prior graph
+topology), efficient hardware design for Quantum and Classical DPU/TPU (graph,
+quantize and shift register architect.) to Materials Science and beyond.
+
+
+ comment: 71 pages, 42 Figures, 1 Table, 1 Appendix. arXiv admin note: text + overlap with arXiv:2109.08184 by other authors +
+
+
+
+
+ + ♻ ☆ Visualizing chest X-ray dataset biases using GANs + + +
+ Recent work demonstrates that images from various chest X-ray datasets +contain visual features that are strongly correlated with protected demographic +attributes like race and gender. This finding raises issues of fairness, since +some of these factors may be used by downstream algorithms for clinical +predictions. In this work, we propose a framework, using generative adversarial +networks (GANs), to visualize what features are most different between X-rays +belonging to two demographic subgroups. + +
+
+ comment: Medical Imaging with Deep Learning(MIDL) 2023 +
+
+
+
+
+ + ♻ ☆ Socratis: Are large multimodal models emotionally aware? ICCV 2023 + + +
+ Existing emotion prediction benchmarks contain coarse emotion labels which do +not consider the diversity of emotions that an image and text can elicit in +humans due to various reasons. Learning diverse reactions to multimodal content +is important as intelligent machines take a central role in generating and +delivering content to society. To address this gap, we propose Socratis, a +societal reactions benchmark, where each image-caption (IC) pair is annotated +with multiple emotions and the reasons for feeling them. Socratis contains 18K +free-form reactions for 980 emotions on 2075 image-caption pairs from 5 +widely-read news and image-caption (IC) datasets. We benchmark the capability +of state-of-the-art multimodal large language models to generate the reasons +for feeling an emotion given an IC pair. Based on a preliminary human study, we +observe that humans prefer human-written reasons over 2 times more often than +machine-generated ones. This shows our task is harder than standard generation +tasks because it starkly contrasts recent findings where humans cannot tell +apart machine vs human-written news articles, for instance. We further see that +current captioning metrics based on large vision-language models also fail to +correlate with human preferences. We hope that these findings and our benchmark +will inspire further research on training emotionally aware models. + +
+
+ comment: ICCV 2023 WECIA +
+
+
+
+
+ + ♻ ☆ Text-to-Image Diffusion Models are Zero-Shot Classifiers + + +
+ The excellent generative capabilities of text-to-image diffusion models +suggest they learn informative representations of image-text data. However, +what knowledge their representations capture is not fully understood, and they +have not been thoroughly explored on downstream tasks. We investigate diffusion +models by proposing a method for evaluating them as zero-shot classifiers. The +key idea is using a diffusion model's ability to denoise a noised image given a +text description of a label as a proxy for that label's likelihood. We apply +our method to Stable Diffusion and Imagen, using it to probe fine-grained +aspects of the models' knowledge and comparing them with CLIP's zero-shot +abilities. They perform competitively with CLIP on a wide range of zero-shot +image classification datasets. Additionally, they achieve state-of-the-art +results on shape/texture bias tests and can successfully perform attribute +binding while CLIP cannot. Although generative pre-training is prevalent in +NLP, visual foundation models often use other methods such as contrastive +learning. Based on our findings, we argue that generative pre-training should +be explored as a compelling alternative for vision-language tasks. + +
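+ The denoising-as-likelihood idea can be sketched generically: score each class
+by the model's denoising error under that class's text prompt and pick the
+lowest. The `encode_text` and `denoise` callables below are toy placeholders
+standing in for a real text-to-image diffusion model such as Stable Diffusion:
+
+```python
+import torch
+
+def diffusion_zero_shot_classify(x, class_prompts, encode_text, denoise,
+                                 timesteps=(100, 300, 500)):
+    """Lower average denoising error under a class prompt ~ higher likelihood."""
+    errors = []
+    for prompt in class_prompts:
+        cond = encode_text(prompt)
+        err = 0.0
+        for t in timesteps:
+            noise = torch.randn_like(x)
+            noisy = x + 0.01 * t * noise                  # toy noising schedule
+            err += ((denoise(noisy, t, cond) - noise) ** 2).mean()
+        errors.append(err / len(timesteps))
+    return int(torch.stack(errors).argmin())              # predicted class index
+
+# toy stand-ins so the sketch runs end to end
+encode_text = lambda p: torch.randn(8)
+denoise = lambda noisy, t, cond: torch.zeros_like(noisy)
+pred = diffusion_zero_shot_classify(torch.randn(3, 64, 64),
+                                    ["a photo of a dog", "a photo of a cat"],
+                                    encode_text, denoise)
+```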
+
+
+
+
+ + ♻ ☆ Salient Object Detection for Images Taken by People With Vision + Impairments + + +
+ Salient object detection is the task of producing a binary mask for an image +that deciphers which pixels belong to the foreground object versus background. +We introduce a new salient object detection dataset using images taken by +people who are visually impaired who were seeking to better understand their +surroundings, which we call VizWiz-SalientObject. Compared to seven existing +datasets, VizWiz-SalientObject is the largest (i.e., 32,000 human-annotated +images) and contains unique characteristics including a higher prevalence of +text in the salient objects (i.e., in 68\% of images) and salient objects that +occupy a larger ratio of the images (i.e., on average, $\sim$50\% coverage). We +benchmarked seven modern salient object detection methods on our dataset and +found they struggle most with images featuring salient objects that are large, +have less complex boundaries, and lack text as well as for lower quality +images. We invite the broader community to work on our new dataset challenge by +publicly sharing the dataset at +https://vizwiz.org/tasks-and-datasets/salient-object . + +
+
+ comment: Computer Vision and Pattern Recognition +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Fairness of Exposure in Dynamic Recommendation + + +
+ Exposure bias is a well-known issue in recommender systems where the exposure
+is not fairly distributed among items in the recommendation results. This is
+especially problematic when bias is amplified over time, as a few items (e.g.,
+popular ones) are repeatedly over-represented in recommendation lists and
+users' interactions with those items amplify bias towards those items over
+time, resulting in a feedback loop. This issue has been extensively studied in
+the literature in the static recommendation environment, where a single round
+of recommendation results is processed to improve exposure fairness. However,
+less work has been done on addressing exposure bias in a dynamic recommendation
+setting, where the system is operating over time and the recommendation model
+and the input data are dynamically updated with ongoing user feedback on
+recommended items at each round. In this paper, we study exposure bias in a
+dynamic recommendation setting. Our goal is to show that existing bias
+mitigation methods that are designed to operate in a static recommendation
+setting are unable to satisfy fairness of exposure for items in the long run.
+In particular, we empirically study one of these methods and show that
+repeatedly applying this method fails to fairly distribute exposure among items
+in the long run. To address this limitation, we show how this method can be
+adapted to effectively operate in a dynamic recommendation setting and achieve
+exposure fairness for items in the long run. Experiments on a real-world
+dataset confirm that our solution is superior in achieving long-term exposure
+fairness for the items while maintaining the recommendation accuracy.
+
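+ The feedback loop and a simple exposure-aware re-ranking heuristic can be
+simulated in a few lines; this is a generic illustration of tracking cumulative
+exposure across rounds, not the paper's adapted method:
+
+```python
+import numpy as np
+
+def rerank_with_exposure(scores, exposure, k=10, lam=0.1):
+    """Discount each item's score by its accumulated exposure so that
+    over-exposed items are gradually demoted across rounds."""
+    return np.argsort(-(scores - lam * exposure))[:k]
+
+n_items, rounds = 1000, 50
+exposure = np.zeros(n_items)
+rng = np.random.default_rng(0)
+for _ in range(rounds):                                      # one round at a time
+    scores = rng.normal(size=n_items) + np.log1p(exposure)   # popularity feedback loop
+    shown = rerank_with_exposure(scores, exposure)
+    exposure[shown] += 1.0                                   # shown items gain exposure
+```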
+
+
+
+
+ + ☆ STGIN: Spatial-Temporal Graph Interaction Network for Large-scale POI + Recommendation CIKM 2023 + + +
+ In Location-Based Services, Point-Of-Interest(POI) recommendation plays a +crucial role in both user experience and business opportunities. Graph neural +networks have been proven effective in providing personalized POI +recommendation services. However, there are still two critical challenges. +First, existing graph models attempt to capture users' diversified interests +through a unified graph, which limits their ability to express interests in +various spatial-temporal contexts. Second, the efficiency limitations of graph +construction and graph sampling in large-scale systems make it difficult to +adapt quickly to new real-time interests. To tackle the above challenges, we +propose a novel Spatial-Temporal Graph Interaction Network. Specifically, we +construct subgraphs of spatial, temporal, spatial-temporal, and global views +respectively to precisely characterize the user's interests in various +contexts. In addition, we design an industry-friendly framework to track the +user's latest interests. Extensive experiments on the real-world dataset show +that our method outperforms state-of-the-art models. This work has been +successfully deployed in a large e-commerce platform, delivering a 1.1% CTR and +6.3% RPM improvement. + +
+
+ comment: accepted by CIKM 2023 +
+
+
+
+
+ + ☆ TensorBank:Tensor Lakehouse for Foundation Model Training + + +
+ Storing and streaming high dimensional data for foundation model training has
+become a critical requirement with the rise of foundation models beyond natural
+language. In this paper we introduce TensorBank, a petabyte scale tensor
+lakehouse capable of streaming tensors from Cloud Object Store (COS) to GPU
+memory at wire speed based on complex relational queries. We use Hierarchical
+Statistical Indices (HSI) for query acceleration. Our architecture makes it
+possible to directly address tensors at the block level using HTTP range reads.
+Once in GPU memory, data can be transformed using PyTorch transforms. We
+provide a generic PyTorch dataset type with a corresponding dataset factory
+that translates relational queries and requested transformations into dataset
+instances. By making use of the HSI, irrelevant blocks can be skipped without
+reading them, as those indices contain statistics on their content at different
+hierarchical resolution levels. This is an opinionated architecture powered by
+open standards and making heavy use of open-source technology. Although
+hardened for production use with geospatial-temporal data, this architecture
+generalizes to other use cases such as computer vision, computational
+neuroscience, biological sequence analysis and more.
+
+
+
+
+
+ + ☆ MvFS: Multi-view Feature Selection for Recommender System CIKM 2023 + + +
+ Feature selection, which is a technique to select key features in recommender +systems, has received increasing research attention. Recently, Adaptive Feature +Selection (AdaFS) has shown remarkable performance by adaptively selecting +features for each data instance, considering that the importance of a given +feature field can vary significantly across data. However, this method still +has limitations in that its selection process could be easily biased to major +features that frequently occur. To address these problems, we propose +Multi-view Feature Selection (MvFS), which selects informative features for +each instance more effectively. Most importantly, MvFS employs a multi-view +network consisting of multiple sub-networks, each of which learns to measure +the feature importance of a part of data with different feature patterns. By +doing so, MvFS promotes a more balanced feature selection process mitigating +the bias problem towards dominant patterns. Moreover, MvFS adopts an effective +importance score modeling strategy which is applied independently to each field +without incurring dependency among features. Experimental results on real-world +datasets demonstrate the effectiveness of MvFS compared to state-of-the-art +baselines. + +
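+ A simplified PyTorch sketch of the multi-view selection idea above: several
+small sub-networks each score the feature fields of an instance, and their
+averaged scores gate the field embeddings (the dimensions and the plain
+averaging are assumptions):
+
+```python
+import torch
+import torch.nn as nn
+
+class MultiViewFeatureSelector(nn.Module):
+    """Average the per-field importance scores of several sub-networks and use
+    them as soft gates on the field embeddings."""
+    def __init__(self, num_fields, emb_dim, num_views=3):
+        super().__init__()
+        self.views = nn.ModuleList(
+            nn.Sequential(nn.Linear(num_fields * emb_dim, 64), nn.ReLU(),
+                          nn.Linear(64, num_fields))
+            for _ in range(num_views)
+        )
+
+    def forward(self, field_emb):                         # (B, num_fields, emb_dim)
+        flat = field_emb.flatten(1)
+        gates = torch.stack([torch.sigmoid(v(flat)) for v in self.views]).mean(0)
+        return field_emb * gates.unsqueeze(-1)            # softly selected fields
+
+selector = MultiViewFeatureSelector(num_fields=20, emb_dim=16)
+out = selector(torch.randn(32, 20, 16))                   # -> (32, 20, 16)
+```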
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ Scenario-Aware Hierarchical Dynamic Network for Multi-Scenario + Recommendation + + +
+ Click-Through Rate (CTR) prediction is a fundamental technique in +recommendation and advertising systems. Recent studies have shown that +implementing multi-scenario recommendations contributes to strengthening +information sharing and improving overall performance. However, existing +multi-scenario models only consider coarse-grained explicit scenario modeling +that depends on pre-defined scenario identification from manual prior rules, +which is biased and sub-optimal. To address these limitations, we propose a +Scenario-Aware Hierarchical Dynamic Network for Multi-Scenario Recommendations +(HierRec), which perceives implicit patterns adaptively and conducts explicit +and implicit scenario modeling jointly. In particular, HierRec designs a basic +scenario-oriented module based on the dynamic weight to capture +scenario-specific information. Then the hierarchical explicit and implicit +scenario-aware modules are proposed to model hybrid-grained scenario +information. The multi-head implicit modeling design contributes to perceiving +distinctive patterns from different perspectives. Our experiments on two public +datasets and real-world industrial applications on a mainstream online +advertising platform demonstrate that our HierRec outperforms existing models +significantly. + +
+
+
+
+
+ + ☆ Robust Recommender System: A Survey and Future Directions + + +
+ With the rapid growth of information, recommender systems have become +integral for providing personalized suggestions and overcoming information +overload. However, their practical deployment often encounters "dirty" data, +where noise or malicious information can lead to abnormal recommendations. +Research on improving recommender systems' robustness against such dirty data +has thus gained significant attention. This survey provides a comprehensive +review of recent work on recommender systems' robustness. We first present a +taxonomy to organize current techniques for withstanding malicious attacks and +natural noise. We then explore state-of-the-art methods in each category, +including fraudster detection, adversarial training, certifiable robust +training against malicious attacks, and regularization, purification, +self-supervised learning against natural noise. Additionally, we summarize +evaluation metrics and common datasets used to assess robustness. We discuss +robustness across varying recommendation scenarios and its interplay with other +properties like accuracy, interpretability, privacy, and fairness. Finally, we +delve into open issues and future research directions in this emerging field. +Our goal is to equip readers with a holistic understanding of robust +recommender systems and spotlight pathways for future research and development. + +
+
+
+
+
+ + ☆ Towards Individual and Multistakeholder Fairness in Tourism Recommender + Systems RecSys 2023 + + +
+ This position paper summarizes our published review on individual and +multistakeholder fairness in Tourism Recommender Systems (TRS). Recently, there +has been growing attention to fairness considerations in recommender systems +(RS). It has been acknowledged in research that fairness in RS is often closely +tied to the presence of multiple stakeholders, such as end users, item +providers, and platforms, as it raises concerns for the fair treatment of all +parties involved. Hence, fairness in RS is a multi-faceted concept that +requires consideration of the perspectives and needs of the different +stakeholders to ensure fair outcomes for them. However, there may often be +instances where achieving the goals of one stakeholder could conflict with +those of another, resulting in trade-offs. + In this paper, we emphasized addressing the unique challenges of ensuring +fairness in RS within the tourism domain. We aimed to discuss potential +strategies for mitigating the aforementioned challenges and examine the +applicability of solutions from other domains to tackle fairness issues in +tourism. By exploring cross-domain approaches and strategies for incorporating +S-Fairness, we can uncover valuable insights and determine how these solutions +can be adapted and implemented effectively in the context of tourism to enhance +fairness in RS. + +
+
+ comment: Position Paper for FAcctRec 2023 at RecSys 2023 +
+
+
+
+
+ + ☆ Tidying Up the Conversational Recommender Systems' Biases + + +
+ The growing popularity of language models has sparked interest in +conversational recommender systems (CRS) within both industry and research +circles. However, concerns regarding biases in these systems have emerged. +While individual components of CRS have been subject to bias studies, a +literature gap remains in understanding specific biases unique to CRS and how +these biases may be amplified or reduced when integrated into complex CRS +models. In this paper, we provide a concise review of biases in CRS by +surveying recent literature. We examine the presence of biases throughout the +system's pipeline and consider the challenges that arise from combining +multiple models. Our study investigates biases in classic recommender systems +and their relevance to CRS. Moreover, we address specific biases in CRS, +considering variations with and without natural language understanding +capabilities, along with biases related to dialogue systems and language +models. Through our findings, we highlight the necessity of adopting a holistic +perspective when dealing with biases in complex CRS models. + +
+
+
+
+
+ + ♻ ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims to learn graph-level representations for effective class assignment, has achieved outstanding results, but these results rely heavily on high-quality datasets with balanced class distributions. In fact, most real-world graph data naturally follows a long-tailed form in which the head classes contain far more samples than the tail classes; it is thus essential to study graph-level classification over long-tailed data, yet this setting remains largely unexplored. Moreover, most existing long-tailed learning methods from vision fail to jointly optimize representation learning and classifier training, and they neglect the mining of hard-to-classify classes. Directly applying existing methods to graphs may lead to sub-optimal performance, since a model trained on graphs is more sensitive to the long-tailed distribution due to complex topological characteristics. Hence, in this paper, we propose a novel long-tailed graph-level classification framework via Collaborative Multi-expert Learning (CoMe) to tackle the problem. To equilibrate the contributions of head and tail classes, we first develop balanced contrastive learning from the view of representation learning, and then design individual-expert classifier training based on hard class mining. In addition, we apply gated fusion and disentangled knowledge distillation among the multiple experts to promote collaboration in the multi-expert framework. Comprehensive experiments on seven widely used benchmark datasets demonstrate the superiority of our method CoMe over state-of-the-art baselines.
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ♻ ☆ AMR4NLI: Interpretable and robust NLI measures from semantic graphs + + +
+ The task of natural language inference (NLI) asks whether a given premise +(expressed in NL) entails a given NL hypothesis. NLI benchmarks contain human +ratings of entailment, but the meaning relationships driving these ratings are +not formalized. Can the underlying sentence pair relationships be made more +explicit in an interpretable yet robust fashion? We compare semantic structures +to represent premise and hypothesis, including sets of contextualized +embeddings and semantic graphs (Abstract Meaning Representations), and measure +whether the hypothesis is a semantic substructure of the premise, utilizing +interpretable metrics. Our evaluation on three English benchmarks finds value +in both contextualized embeddings and semantic graphs; moreover, they provide +complementary signals, and can be leveraged together in a hybrid model. + +
+
+ comment: International Conference on Computational Semantics (IWCS 2023); v2 + fixes an imprecise sentence below Eq. 5 +
+
+
+
+
+ + ♻ ☆ How Expressive are Graph Neural Networks in Recommendation? CIKM + + +
+ Graph Neural Networks (GNNs) have demonstrated superior performance on +various graph learning tasks, including recommendation, where they leverage +user-item collaborative filtering signals in graphs. However, theoretical +formulations of their capability are scarce, despite their empirical +effectiveness in state-of-the-art recommender models. Recently, research has +explored the expressiveness of GNNs in general, demonstrating that message +passing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that +GNNs combined with random node initialization are universal. Nevertheless, the +concept of "expressiveness" for GNNs remains vaguely defined. Most existing +works adopt the graph isomorphism test as the metric of expressiveness, but +this graph-level task may not effectively assess a model's ability in +recommendation, where the objective is to distinguish nodes of different +closeness. In this paper, we provide a comprehensive theoretical analysis of +the expressiveness of GNNs in recommendation, considering three levels of +expressiveness metrics: graph isomorphism (graph-level), node automorphism +(node-level), and topological closeness (link-level). We propose the +topological closeness metric to evaluate GNNs' ability to capture the +structural distance between nodes, which aligns closely with the objective of +recommendation. To validate the effectiveness of this new metric in evaluating +recommendation performance, we introduce a learning-less GNN algorithm that is +optimal on the new metric and can be optimal on the node-level metric with +suitable modification. We conduct extensive experiments comparing the proposed +algorithm against various types of state-of-the-art GNN models to explore the +explainability of the new metric in the recommendation task. For +reproducibility, implementation codes are available at +https://github.com/HKUDS/GTE. + +
+
+ comment: 32nd ACM International Conference on Information and Knowledge + Management (CIKM) 2023 +
+
+
+
+
+ + ♻ ☆ STUDY: Socially Aware Temporally Causal Decoder Recommender Systems + + +
+ Recommender systems are widely used to help people find items that are +tailored to their interests. These interests are often influenced by social +networks, making it important to use social network information effectively in +recommender systems. This is especially true for demographic groups with +interests that differ from the majority. This paper introduces STUDY, a +Socially-aware Temporally caUsal Decoder recommender sYstem. STUDY introduces a +new socially-aware recommender system architecture that is significantly more +efficient to learn and train than existing methods. STUDY performs joint +inference over socially connected groups in a single forward pass of a modified +transformer decoder network. We demonstrate the benefits of STUDY in the +recommendation of books for students who are dyslexic, or struggling readers. +Dyslexic students often have difficulty engaging with reading material, making +it critical to recommend books that are tailored to their interests. We worked +with our non-profit partner Learning Ally to evaluate STUDY on a dataset of +struggling readers. STUDY was able to generate recommendations that more +accurately predicted student engagement, when compared with existing methods. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Dual Correction Strategy for Ranking Distillation in Top-N Recommender + System CIKM 2021 + + +
+ Knowledge Distillation (KD), which transfers the knowledge of a well-trained large model (teacher) to a small model (student), has become an important area of research for the practical deployment of recommender systems. Recently, Relaxed Ranking Distillation (RRD) has shown that distilling the ranking information in the recommendation list significantly improves performance. However, the method still has limitations in that 1) it does not fully utilize the prediction errors of the student model, which makes training less efficient, and 2) it only distills the user-side ranking information, which provides an insufficient view under sparse implicit feedback. This paper presents the Dual Correction strategy for Distillation (DCD), which transfers the ranking information from the teacher model to the student model in a more efficient manner. Most importantly, DCD uses the discrepancy between the teacher model's and the student model's predictions to decide which knowledge should be distilled. By doing so, DCD essentially provides learning guidance tailored to "correcting" what the student model has failed to accurately predict. This process is applied to transferring the ranking information from the user side as well as the item side to address sparse implicit user feedback. Our experiments show that the proposed method outperforms state-of-the-art baselines, and ablation studies validate the effectiveness of each component.
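As a rough illustration of the discrepancy-driven selection described above (not the authors' implementation; the item counts, the listwise form of the correction term, and all shapes are assumptions for illustration), the following sketch ranks items under both models and concentrates a correction penalty on the items the student under-ranks most relative to the teacher:

```python
import numpy as np

rng = np.random.default_rng(0)
n_items = 100
teacher_scores = rng.normal(size=n_items)   # well-trained teacher predictions for one user
student_scores = rng.normal(size=n_items)   # current student predictions for the same user

# Rank positions under both models (0 = top of the recommendation list).
teacher_rank = np.argsort(-teacher_scores).argsort()
student_rank = np.argsort(-student_scores).argsort()

# Discrepancy: how far the student pushes each item down relative to the teacher.
discrepancy = student_rank - teacher_rank
to_correct = np.argsort(-discrepancy)[:10]          # items the student under-ranks most

# Weight the correction by the size of the discrepancy and apply a simple
# softmax-based listwise penalty that rewards raising these items' scores.
weights = np.maximum(discrepancy[to_correct], 1)
weights = weights / weights.sum()
log_probs = student_scores - np.log(np.exp(student_scores).sum())
correction_loss = -(weights * log_probs[to_correct]).sum()
print(f"correction loss on under-ranked items: {correction_loss:.4f}")
```

The same selection could be mirrored on the item side by swapping the roles of users and items in the score matrix.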
+
+ comment: CIKM 2021 +
+
+
+
+
+ + ♻ ☆ DELTA: Dynamic Embedding Learning with Truncated Conscious Attention for + CTR Prediction + + +
+ Click-Through Rate (CTR) prediction is a pivotal task in product and content recommendation, where learning effective feature embeddings is of great significance. However, traditional methods typically learn fixed feature representations without dynamically refining them according to context information, leading to suboptimal performance. Some recent approaches attempt to address this issue by learning bit-wise weights or augmented embeddings for feature representations, but suffer from uninformative or redundant features in the context. To tackle this problem, inspired by the Global Workspace Theory in conscious processing, which posits that only a specific subset of the product features is pertinent while the rest can be noisy and even detrimental to human click behavior, we propose a CTR model that enables Dynamic Embedding Learning with Truncated Conscious Attention for CTR prediction, termed DELTA. DELTA contains two key components: (I) a conscious truncation module (CTM), which utilizes curriculum learning to apply adaptive truncation on attention weights to select the most critical feature in the context; (II) explicit embedding optimization (EEO), which applies an auxiliary task during training that directly and independently propagates the gradient from the loss layer to the embedding layer, thereby optimizing the embedding explicitly via linear feature crossing. Extensive experiments on five challenging CTR datasets demonstrate that DELTA achieves new state-of-the-art performance among current CTR methods.
+
+
+
+
+ + ♻ ☆ CTRL: Connect Collaborative and Language Model for CTR Prediction + + +
+ Traditional click-through rate (CTR) prediction models convert the tabular data into one-hot vectors and leverage the collaborative relations among features to infer users' preferences over items. This modeling paradigm discards essential semantic information. Though some works like P5 and M6-Rec have explored the potential of using Pre-trained Language Models (PLMs) to extract semantic signals for CTR prediction, they are computationally expensive and suffer from low efficiency. Besides, the beneficial collaborative relations are not considered, hindering recommendation performance. To solve these problems, in this paper we propose a novel framework \textbf{CTRL}, which is industry-friendly and model-agnostic with superior inference efficiency. Specifically, the original tabular data is first converted into textual data. Both the tabular data and the converted textual data are regarded as two different modalities and are separately fed into the collaborative CTR model and the pre-trained language model. A cross-modal knowledge alignment procedure is performed to align and integrate the collaborative and semantic signals at a fine-grained level, and the lightweight collaborative model can be deployed online for efficient serving after being fine-tuned with supervised signals. Experimental results on three public datasets show that CTRL significantly outperforms state-of-the-art (SOTA) CTR models. Moreover, we further verify its effectiveness on a large-scale industrial recommender system.
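The cross-modal alignment step can be pictured with a small contrastive sketch. Everything below (batch size, embedding dimension, temperature, and the symmetric InfoNCE form) is an illustrative assumption rather than the paper's exact procedure:

```python
import numpy as np

rng = np.random.default_rng(0)
batch, dim, tau = 16, 64, 0.07
tab = rng.normal(size=(batch, dim))     # collaborative-model embeddings of the tabular rows
txt = rng.normal(size=(batch, dim))     # language-model embeddings of the textualized rows

tab /= np.linalg.norm(tab, axis=1, keepdims=True)
txt /= np.linalg.norm(txt, axis=1, keepdims=True)
logits = tab @ txt.T / tau              # pairwise similarities across the two modalities

def xent_diag(l):
    # cross-entropy with the matching (diagonal) pair as the target
    l = l - l.max(axis=1, keepdims=True)
    logp = l - np.log(np.exp(l).sum(axis=1, keepdims=True))
    return -np.diag(logp).mean()

loss = 0.5 * (xent_diag(logits) + xent_diag(logits.T))
print(f"alignment loss: {loss:.3f}")
```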
+
+
+
+
+ + ♻ ☆ Multimodality Fusion for Smart Healthcare: a Journey from Data, + Information, Knowledge to Wisdom + + +
+ Multimodal medical data fusion has emerged as a transformative approach in smart healthcare, enabling a comprehensive understanding of patient health and personalized treatment plans. In this paper, a journey from data, information, and knowledge to wisdom (DIKW) is explored through multimodal fusion for smart healthcare. A comprehensive review of multimodal medical data fusion, focusing on the integration of various data modalities, is presented. It explores different approaches such as Feature selection, Rule-based systems, Machine learning, Deep learning, and Natural Language Processing for fusing and analyzing multimodal data. The paper also highlights the challenges associated with multimodal fusion in healthcare. By synthesizing the reviewed frameworks and insights, a generic framework for multimodal medical data fusion is proposed while aligning with the DIKW mechanism. Moreover, it discusses future directions aligned with the four pillars of healthcare: Predictive, Preventive, Personalized, and Participatory approaches based on the DIKW and the generic framework. The components from this comprehensive survey form the foundation for the successful implementation of multimodal fusion in smart healthcare. The findings of this survey can guide researchers and practitioners in leveraging the power of multimodal fusion to revolutionize healthcare and improve patient outcomes.
+
+ comment: This work has been submitted to the ELSEVIER for possible + publication. Copyright may be transferred without notice, after which this + version may no longer be accessible +
+
+
+
+
+
+
+
+ + Machine Learning 145 + +
+
+
+ + ☆ Efficient RL via Disentangled Environment and Agent Representations ICML 2023 + + +
+ Agents that are aware of the separation between themselves and their +environments can leverage this understanding to form effective representations +of visual input. We propose an approach for learning such structured +representations for RL algorithms, using visual knowledge of the agent, such as +its shape or mask, which is often inexpensive to obtain. This is incorporated +into the RL objective using a simple auxiliary loss. We show that our method, +Structured Environment-Agent Representations, outperforms state-of-the-art +model-free approaches over 18 different challenging visual simulation +environments spanning 5 different robots. Website at https://sear-rl.github.io/ + +
+
+ comment: ICML 2023. Website at https://sear-rl.github.io/ +
+
+
+
+
+ + ☆ Building a Winning Team: Selecting Source Model Ensembles using a + Submodular Transferability Estimation Approach ICCV 2023 + + +
+ Estimating the transferability of publicly available pretrained models to a +target task has assumed an important place for transfer learning tasks in +recent years. Existing efforts propose metrics that allow a user to choose one +model from a pool of pre-trained models without having to fine-tune each model +individually and identify one explicitly. With the growth in the number of +available pre-trained models and the popularity of model ensembles, it also +becomes essential to study the transferability of multiple-source models for a +given target task. The few existing efforts study transferability in such +multi-source ensemble settings using just the outputs of the classification +layer and neglect possible domain or task mismatch. Moreover, they overlook the +most important factor while selecting the source models, viz., the cohesiveness +factor between them, which can impact the performance and confidence in the +prediction of the ensemble. To address these gaps, we propose a novel Optimal +tranSport-based suBmOdular tRaNsferability metric (OSBORN) to estimate the +transferability of an ensemble of models to a downstream task. OSBORN +collectively accounts for image domain difference, task difference, and +cohesiveness of models in the ensemble to provide reliable estimates of +transferability. We gauge the performance of OSBORN on both image +classification and semantic segmentation tasks. Our setup includes 28 source +datasets, 11 target datasets, 5 model architectures, and 2 pre-training +methods. We benchmark our method against current state-of-the-art metrics +MS-LEEP and E-LEEP, and outperform them consistently using the proposed +approach. + +
+
+ comment: To appear at ICCV 2023 +
+
+
+
+
+ + ☆ Tensorization: Creating and Utilising Multidimensional Datasets for + Multiway Analysis and Tensorised Deep Neural Networks -- Python Tutorial and + Survey + + +
+ As the size and complexity of data continue to increase, the need for efficient and effective analysis methods becomes ever more crucial. Tensorization, the process of converting 2-dimensional datasets into multidimensional structures, has emerged as a promising approach for multiway analysis methods. This paper explores the steps involved in tensorization, multidimensional data sources, the various multiway analysis methods employed, and the benefits of these approaches. A small example of Blind Source Separation (BSS) is presented comparing 2-dimensional algorithms and a multiway algorithm in Python. Results indicate that multiway analysis is more expressive. Additionally, tensorization techniques aid in compressing deep learning models by reducing the number of required parameters while enhancing the expression of relationships across dimensions. A survey of multiway analysis methods and their integration with various Deep Neural Network models is presented using case studies in different domains.
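As a tiny, self-contained example of the tensorization step itself (the shapes and the segmentation scheme below are arbitrary choices for illustration, not the tutorial's code), a 2-D multichannel recording can be folded into a 3-D tensor and unfolded along any mode:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 1000))                 # 2-D dataset: 8 channels x 1000 time samples

segments, seg_len = 20, 50                     # 20 segments of 50 samples each
T = X.reshape(8, segments, seg_len)            # 3-D tensor: channel x segment x time

def unfold(tensor, mode):
    """Mode-n unfolding: move `mode` to the front and flatten the remaining axes."""
    return np.moveaxis(tensor, mode, 0).reshape(tensor.shape[mode], -1)

print(T.shape)             # (8, 20, 50)
print(unfold(T, 1).shape)  # (20, 400): segment-mode unfolding for a matrix-based method
```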
+
+ comment: 29 pages, 8 figures, 3 tables +
+
+
+
+
+ + ☆ Cognitive Architectures for Language Agents + + +
+ Recent efforts have incorporated large language models (LLMs) with external +resources (e.g., the Internet) or internal control flows (e.g., prompt +chaining) for tasks requiring grounding or reasoning. However, these efforts +have largely been piecemeal, lacking a systematic framework for constructing a +fully-fledged language agent. To address this challenge, we draw on the rich +history of agent design in symbolic artificial intelligence to develop a +blueprint for a new wave of cognitive language agents. We first show that LLMs +have many of the same properties as production systems, and recent efforts to +improve their grounding or reasoning mirror the development of cognitive +architectures built around production systems. We then propose Cognitive +Architectures for Language Agents (CoALA), a conceptual framework to +systematize diverse methods for LLM-based reasoning, grounding, learning, and +decision making as instantiations of language agents in the framework. Finally, +we use the CoALA framework to highlight gaps and propose actionable directions +toward more capable language agents in the future. + +
+
+ comment: 16 pages of main content, 10 pages of references, 5 figures. Equal + contribution among the first two authors, order decided by coin flip. A + CoALA-based repo of recent work on language agents: + https://github.com/ysymyth/awesome-language-agents +
+
+
+
+
+ + ☆ Monotone Tree-Based GAMI Models by Adapting XGBoost + + +
+ Recent papers have used machine learning architecture to fit low-order +functional ANOVA models with main effects and second-order interactions. These +GAMI (GAM + Interaction) models are directly interpretable as the functional +main effects and interactions can be easily plotted and visualized. +Unfortunately, it is not easy to incorporate the monotonicity requirement into +the existing GAMI models based on boosted trees, such as EBM (Lou et al. 2013) +and GAMI-Lin-T (Hu et al. 2022). This paper considers models of the form +$f(x)=\sum_{j,k}f_{j,k}(x_j, x_k)$ and develops monotone tree-based GAMI +models, called monotone GAMI-Tree, by adapting the XGBoost algorithm. It is +straightforward to fit a monotone model to $f(x)$ using the options in XGBoost. +However, the fitted model is still a black box. We take a different approach: +i) use a filtering technique to determine the important interactions, ii) fit a +monotone XGBoost algorithm with the selected interactions, and finally iii) +parse and purify the results to get a monotone GAMI model. Simulated datasets +are used to demonstrate the behaviors of mono-GAMI-Tree and EBM, both of which +use piecewise constant fits. Note that the monotonicity requirement is for the +full model. Under certain situations, the main effects will also be monotone. +But, as seen in the examples, the interactions will not be monotone. + +
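A minimal sketch of step (ii) using the standard `xgboost` Python package is shown below; the feature indices, constraint signs, allowed interaction groups, and synthetic data are made up for illustration, and the filtering step (i) and parse/purify step (iii) are omitted:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.uniform(size=(2000, 3))
# Synthetic target: monotone increasing in x0, with an (x0, x1) interaction.
y = 2 * X[:, 0] + X[:, 0] * X[:, 1] + 0.1 * rng.normal(size=2000)

model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=2,                            # depth-2 trees -> at most pairwise interactions
    monotone_constraints="(1,0,0)",         # +1: fit must be monotone increasing in feature 0
    interaction_constraints=[[0, 1], [2]],  # only the preselected (x0, x1) interaction allowed
)
model.fit(X, y)
print(model.predict(X[:5]))
```

Constraining the tree depth and the interaction groups is what keeps the fitted black box decomposable into main effects and selected pairwise terms before purification.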
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ On the Minimax Regret in Online Ranking with Top-k Feedback + + +
+ In online ranking, a learning algorithm sequentially ranks a set of items and +receives feedback on its ranking in the form of relevance scores. Since +obtaining relevance scores typically involves human annotation, it is of great +interest to consider a partial feedback setting where feedback is restricted to +the top-$k$ items in the rankings. Chaudhuri and Tewari [2017] developed a +framework to analyze online ranking algorithms with top $k$ feedback. A key +element in their work was the use of techniques from partial monitoring. In +this paper, we further investigate online ranking with top $k$ feedback and +solve some open problems posed by Chaudhuri and Tewari [2017]. We provide a +full characterization of minimax regret rates with the top $k$ feedback model +for all $k$ and for the following ranking performance measures: Pairwise Loss, +Discounted Cumulative Gain, and Precision@n. In addition, we give an efficient +algorithm that achieves the minimax regret rate for Precision@n. + +
+
+
+
+
+ + ☆ Maximum Mean Discrepancy Meets Neural Networks: The + Radon-Kolmogorov-Smirnov Test + + +
+ Maximum mean discrepancy (MMD) refers to a general class of nonparametric two-sample tests that are based on maximizing the mean difference over samples from one distribution $P$ versus another $Q$, over all choices of data transformations $f$ living in some function space $\mathcal{F}$. Inspired by recent work that connects what are known as functions of $\textit{Radon bounded variation}$ (RBV) and neural networks (Parhi and Nowak, 2021, 2023), we study the MMD defined by taking $\mathcal{F}$ to be the unit ball in the RBV space of a given smoothness order $k \geq 0$. This test, which we refer to as the $\textit{Radon-Kolmogorov-Smirnov}$ (RKS) test, can be viewed as a generalization of the well-known and classical Kolmogorov-Smirnov (KS) test to multiple dimensions and higher orders of smoothness. It is also intimately connected to neural networks: we prove that the witness in the RKS test -- the function $f$ achieving the maximum mean difference -- is always a ridge spline of degree $k$, i.e., a single neuron in a neural network. This allows us to leverage the power of modern deep learning toolkits to (approximately) optimize the criterion that underlies the RKS test. We prove that the RKS test has asymptotically full power at distinguishing any distinct pair $P \not= Q$ of distributions, derive its asymptotic null distribution, and carry out extensive experiments to elucidate the strengths and weaknesses of the RKS test versus the more traditional kernel MMD test.
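A toy numpy sketch of the "witness is a single neuron" idea for $k = 1$ (a ReLU ridge unit): plain gradient ascent on the mean difference between two samples, with a unit-norm constraint on the direction as a crude stand-in for the RBV unit ball. This illustrates the concept only; it is not the paper's estimator and says nothing about its null calibration:

```python
import numpy as np

rng = np.random.default_rng(0)
P = rng.normal(loc=0.0, size=(500, 2))        # sample from P
Q = rng.normal(loc=0.7, size=(500, 2))        # sample from a shifted Q

def mean_and_grads(X, w, b):
    z = X @ w + b
    act = np.maximum(z, 0.0)                  # ReLU ridge function f(x) = relu(w.x + b)
    gw = ((z > 0)[:, None] * X).mean(axis=0)  # d mean(f) / dw
    gb = (z > 0).mean()                       # d mean(f) / db
    return act.mean(), gw, gb

w, b, lr = rng.normal(size=2), 0.0, 0.1
for _ in range(300):
    mQ, gwQ, gbQ = mean_and_grads(Q, w, b)
    mP, gwP, gbP = mean_and_grads(P, w, b)
    w += lr * (gwQ - gwP)                     # ascend E_Q f - E_P f
    b += lr * (gbQ - gbP)
    w /= np.linalg.norm(w) + 1e-12            # keep the neuron's direction on the unit sphere

print("witness mean difference:",
      mean_and_grads(Q, w, b)[0] - mean_and_grads(P, w, b)[0])
```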
+
+
+
+
+ + ☆ Computing SHAP Efficiently Using Model Structure Information + + +
+ SHAP (SHapley Additive exPlanations) has become a popular method to attribute the prediction of a machine learning model on an input to its features. One main challenge of SHAP is the computation time. An exact computation of Shapley values has exponential time complexity. Therefore, many approximation methods have been proposed in the literature. In this paper, we propose methods that can compute SHAP exactly in polynomial time or even faster for SHAP definitions that satisfy our additivity and dummy assumptions (e.g., kernel SHAP and baseline SHAP). We develop different strategies for models with different levels of model structure information: known functional decomposition, known order of the model (defined as the highest order of interaction in the model), or unknown order. For the first case, we demonstrate an additive property and a way to compute SHAP from the lower-order functional components. For the second case, we derive formulas that can compute SHAP in polynomial time. Both methods yield exact SHAP results. Finally, if even the order of the model is unknown, we propose an iterative way to approximate Shapley values. The three methods we propose are computationally efficient when the order of the model is not high, which is typically the case in practice. We compare with the sampling approach proposed in Castor & Gomez (2008) using simulation studies to demonstrate the efficacy of our proposed methods.
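For the first case (a known functional decomposition), the additive property can be illustrated in a few lines: if $f$ is purely additive, the attribution of feature $j$ at $x$ reduces to $f_j(x_j)$ minus its average over a background sample, so no enumeration of coalitions is needed. The decomposition and data below are invented for illustration and are not taken from the paper:

```python
import numpy as np

rng = np.random.default_rng(0)
background = rng.normal(size=(1000, 3))               # background (baseline) data

components = [np.sin, np.square, lambda x: 0.5 * x]   # known additive components f_j
f = lambda X: sum(c(X[:, j]) for j, c in enumerate(components))

x = np.array([[0.3, -1.2, 2.0]])
phi = np.array([c(x[0, j]) - c(background[:, j]).mean()   # f_j(x_j) - E[f_j(X_j)]
                for j, c in enumerate(components)])

# Efficiency check: attributions sum to f(x) minus the expected prediction.
print(phi, phi.sum(), f(x)[0] - f(background).mean())
```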
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ First and zeroth-order implementations of the regularized Newton method + with lazy approximated Hessians + + +
+ In this work, we develop first-order (Hessian-free) and zero-order (derivative-free) implementations of the Cubically regularized Newton method for solving general non-convex optimization problems. For that, we employ finite difference approximations of the derivatives. We use a special adaptive search procedure in our algorithms, which simultaneously fits both the regularization constant and the parameters of the finite difference approximations. This makes our schemes free from the need to know the actual Lipschitz constants. Additionally, we equip our algorithms with the lazy Hessian update, which reuses a previously computed Hessian approximation matrix for several iterations. Specifically, we prove a global complexity bound of $\mathcal{O}( n^{1/2} \epsilon^{-3/2})$ function and gradient evaluations for our new Hessian-free method, and a bound of $\mathcal{O}( n^{3/2} \epsilon^{-3/2} )$ function evaluations for the derivative-free method, where $n$ is the dimension of the problem and $\epsilon$ is the desired accuracy for the gradient norm. These complexity bounds significantly improve the previously known ones in terms of the joint dependence on $n$ and $\epsilon$, for first-order and zeroth-order non-convex optimization.
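The basic finite-difference building block behind such derivative-free schemes can be sketched briefly; the fixed step size and the test function below are illustrative only (the adaptive search described above fits such parameters automatically):

```python
import numpy as np

def fd_gradient(f, x, h=1e-5):
    """Central finite-difference approximation of the gradient of f at x."""
    g = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e[i] = h
        g[i] = (f(x + e) - f(x - e)) / (2 * h)   # 2 function evaluations per coordinate
    return g

f = lambda x: np.log(1 + np.exp(-x[0])) + 0.5 * x[1] ** 2   # a smooth test function
x = np.array([1.0, -2.0])
print(fd_gradient(f, x))        # close to [-sigmoid(-1), -2]
```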
+
+
+
+
+ + ☆ Delta-LoRA: Fine-Tuning High-Rank Parameters with the Delta of Low-Rank + Matrices + + +
+ In this paper, we present Delta-LoRA, a novel parameter-efficient approach to fine-tune large language models (LLMs). In contrast to LoRA and other low-rank adaptation methods such as AdaLoRA, Delta-LoRA not only updates the low-rank matrices $\mathbf{A}$ and $\mathbf{B}$, but also propagates the learning to the pre-trained weights $\mathbf{W}$ via updates utilizing the delta of the product of the two low-rank matrices ($\mathbf{A}^{(t+1)}\mathbf{B}^{(t+1)} - \mathbf{A}^{(t)}\mathbf{B}^{(t)}$). Such a strategy effectively addresses the limitation that the incremental update of low-rank matrices alone is inadequate for learning representations suitable for downstream tasks. Moreover, as the update of $\mathbf{W}$ does not require computing the gradients of $\mathbf{W}$ or storing their momenta, Delta-LoRA has comparable memory requirements and computational costs to LoRA. Extensive experiments show that Delta-LoRA significantly outperforms existing low-rank adaptation methods. We further support these results with comprehensive analyses that underscore the effectiveness of Delta-LoRA.
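A plain numpy caricature of the update rule is given below. All shapes, the toy regression task, the learning rate, and the scale `lam` applied to the delta are assumptions for illustration; the point is only that $\mathbf{W}$ is moved by the change in $\mathbf{A}\mathbf{B}$ rather than by its own gradient or optimizer state:

```python
import numpy as np

rng = np.random.default_rng(0)
d, r = 64, 4
W = rng.normal(size=(d, d)) * 0.02        # "pretrained" weight; no gradient or momentum kept
A = rng.normal(size=(d, r)) * 0.01        # low-rank factors (trainable)
B = np.zeros((r, d))
M = W + rng.normal(size=(d, d)) * 0.02    # toy target mapping to adapt towards
lr, lam = 1e-2, 2.0

for step in range(200):
    x = rng.normal(size=(d,))
    err = (W + A @ B) @ x - M @ x         # residual of the adapted layer on this sample
    grad_A = np.outer(err, B @ x)         # dL/dA for L = 0.5 * ||err||^2
    grad_B = np.outer(A.T @ err, x)       # dL/dB
    old_AB = A @ B
    A -= lr * grad_A
    B -= lr * grad_B
    W += lam * (A @ B - old_AB)           # Delta-LoRA step: reuse the delta of A @ B

print("residual norm after adaptation:", np.linalg.norm((W + A @ B) @ x - M @ x))
```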
+
+
+
+
+ + ☆ In-Ear-Voice: Towards Milli-Watt Audio Enhancement With Bone-Conduction + Microphones for In-Ear Sensing Platforms + + +
+ The recent ubiquitous adoption of remote conferencing has been accompanied by +omnipresent frustration with distorted or otherwise unclear voice +communication. Audio enhancement can compensate for low-quality input signals +from, for example, small true wireless earbuds, by applying noise suppression +techniques. Such processing relies on voice activity detection (VAD) with low +latency and the added capability of discriminating the wearer's voice from +others - a task of significant computational complexity. The tight energy +budget of devices as small as modern earphones, however, requires any system +attempting to tackle this problem to do so with minimal power and processing +overhead, while not relying on speaker-specific voice samples and training due +to usability concerns. + This paper presents the design and implementation of a custom research +platform for low-power wireless earbuds based on novel, commercial, MEMS +bone-conduction microphones. Such microphones can record the wearer's speech +with much greater isolation, enabling personalized voice activity detection and +further audio enhancement applications. Furthermore, the paper accurately +evaluates a proposed low-power personalized speech detection algorithm based on +bone conduction data and a recurrent neural network running on the implemented +research platform. This algorithm is compared to an approach based on +traditional microphone input. The performance of the bone conduction system, +achieving detection of speech within 12.8ms at an accuracy of 95\% is +evaluated. Different SoC choices are contrasted, with the final implementation +based on the cutting-edge Ambiq Apollo 4 Blue SoC achieving 2.64mW average +power consumption at 14uJ per inference, reaching 43h of battery life on a +miniature 32mAh li-ion cell and without duty cycling. + +
+
+
+
+
+ + ☆ Explaining grokking through circuit efficiency + + +
+ One of the most surprising puzzles in neural network generalisation is +grokking: a network with perfect training accuracy but poor generalisation +will, upon further training, transition to perfect generalisation. We propose +that grokking occurs when the task admits a generalising solution and a +memorising solution, where the generalising solution is slower to learn but +more efficient, producing larger logits with the same parameter norm. We +hypothesise that memorising circuits become more inefficient with larger +training datasets while generalising circuits do not, suggesting there is a +critical dataset size at which memorisation and generalisation are equally +efficient. We make and confirm four novel predictions about grokking, providing +significant evidence in favour of our explanation. Most strikingly, we +demonstrate two novel and surprising behaviours: ungrokking, in which a network +regresses from perfect to low test accuracy, and semi-grokking, in which a +network shows delayed generalisation to partial rather than perfect test +accuracy. + +
+
+
+
+
+ + ☆ A Lightweight and Transferable Design for Robust LEGO Manipulation + + +
+ LEGO is a well-known platform for prototyping pixelized objects. However, +robotic LEGO prototyping (i.e. manipulating LEGO bricks) is challenging due to +the tight connections and accuracy requirement. This paper investigates safe +and efficient robotic LEGO manipulation. In particular, this paper reduces the +complexity of the manipulation by hardware-software co-design. An end-of-arm +tool (EOAT) is designed, which reduces the problem dimension and allows large +industrial robots to easily manipulate LEGO bricks. In addition, this paper +uses evolution strategy to safely optimize the robot motion for LEGO +manipulation. Experiments demonstrate that the EOAT performs reliably in +manipulating LEGO bricks and the learning framework can effectively and safely +improve the manipulation performance to a 100\% success rate. The co-design is +deployed to multiple robots (i.e. FANUC LR-mate 200id/7L and Yaskawa GP4) to +demonstrate its generalizability and transferability. In the end, we show that +the proposed solution enables sustainable robotic LEGO prototyping, in which +the robot can repeatedly assemble and disassemble different prototypes. + +
+
+
+
+
+ + ☆ Exact Inference for Continuous-Time Gaussian Process Dynamics + + +
+ Physical systems can often be described via a continuous-time dynamical +system. In practice, the true system is often unknown and has to be learned +from measurement data. Since data is typically collected in discrete time, e.g. +by sensors, most methods in Gaussian process (GP) dynamics model learning are +trained on one-step ahead predictions. This can become problematic in several +scenarios, e.g. if measurements are provided at irregularly-sampled time steps +or physical system properties have to be conserved. Thus, we aim for a GP model +of the true continuous-time dynamics. Higher-order numerical integrators +provide the necessary tools to address this problem by discretizing the +dynamics function with arbitrary accuracy. Many higher-order integrators +require dynamics evaluations at intermediate time steps making exact GP +inference intractable. In previous work, this problem is often tackled by +approximating the GP posterior with variational inference. However, exact GP +inference is preferable in many scenarios, e.g. due to its mathematical +guarantees. In order to make direct inference tractable, we propose to leverage +multistep and Taylor integrators. We demonstrate how to derive flexible +inference schemes for these types of integrators. Further, we derive tailored +sampling schemes that allow to draw consistent dynamics functions from the +learned posterior. This is crucial to sample consistent predictions from the +dynamics model. We demonstrate empirically and theoretically that our approach +yields an accurate representation of the continuous-time system. + +
+
+
+
+
+ + ☆ PolyLUT: Learning Piecewise Polynomials for Ultra-Low Latency FPGA + LUT-based Inference + + +
+ Field-programmable gate arrays (FPGAs) are widely used to implement deep +learning inference. Standard deep neural network inference involves the +computation of interleaved linear maps and nonlinear activation functions. +Prior work for ultra-low latency implementations has hardcoded the combination +of linear maps and nonlinear activations inside FPGA lookup tables (LUTs). Our +work is motivated by the idea that the LUTs in an FPGA can be used to implement +a much greater variety of functions than this. In this paper, we propose a +novel approach to training neural networks for FPGA deployment using +multivariate polynomials as the basic building block. Our method takes +advantage of the flexibility offered by the soft logic, hiding the polynomial +evaluation inside the LUTs with zero overhead. We show that by using polynomial +building blocks, we can achieve the same accuracy using considerably fewer +layers of soft logic than by using linear functions, leading to significant +latency and area improvements. We demonstrate the effectiveness of this +approach in three tasks: network intrusion detection, jet identification at the +CERN Large Hadron Collider, and handwritten digit recognition using the MNIST +dataset. + +
+
+
+
+
+ + ☆ Resilient VAE: Unsupervised Anomaly Detection at the SLAC Linac Coherent + Light Source + + +
+ Significant advances in utilizing deep learning for anomaly detection have +been made in recent years. However, these methods largely assume the existence +of a normal training set (i.e., uncontaminated by anomalies) or even a +completely labeled training set. In many complex engineering systems, such as +particle accelerators, labels are sparse and expensive; in order to perform +anomaly detection in these cases, we must drop these assumptions and utilize a +completely unsupervised method. This paper introduces the Resilient Variational +Autoencoder (ResVAE), a deep generative model specifically designed for anomaly +detection. ResVAE exhibits resilience to anomalies present in the training data +and provides feature-level anomaly attribution. During the training process, +ResVAE learns the anomaly probability for each sample as well as each +individual feature, utilizing these probabilities to effectively disregard +anomalous examples in the training data. We apply our proposed method to detect +anomalies in the accelerator status at the SLAC Linac Coherent Light Source +(LCLS). By utilizing shot-to-shot data from the beam position monitoring +system, we demonstrate the exceptional capability of ResVAE in identifying +various types of anomalies that are visible in the accelerator. + +
+
+
+
+
+ + ☆ SeisCLIP: A seismology foundation model pre-trained by multi-modal data + for multi-purpose seismic feature extraction + + +
+ Training specific deep learning models for particular tasks is common across +various domains within seismology. However, this approach encounters two +limitations: inadequate labeled data for certain tasks and limited +generalization across regions. To address these challenges, we develop +SeisCLIP, a seismology foundation model trained through contrastive learning +from multi-modal data. It consists of a transformer encoder for extracting +crucial features from time-frequency seismic spectrum and an MLP encoder for +integrating the phase and source information of the same event. These encoders +are jointly pre-trained on a vast dataset and the spectrum encoder is +subsequently fine-tuned on smaller datasets for various downstream tasks. +Notably, SeisCLIP's performance surpasses that of baseline methods in event +classification, localization, and focal mechanism analysis tasks, employing +distinct datasets from different regions. In conclusion, SeisCLIP holds +significant potential as a foundational model in the field of seismology, +paving the way for innovative directions in foundation-model-based seismology +research. + +
+
+ comment: 27 pages, 9 figures, 4 tables +
+
+
+
+
+ + ☆ A study on the impact of pre-trained model on Just-In-Time defect + prediction + + +
+ Previous researchers conducting Just-In-Time (JIT) defect prediction tasks have primarily focused on the performance of individual pre-trained models, without exploring the relationship between different pre-trained models as backbones. In this study, we build six models: RoBERTaJIT, CodeBERTJIT, BARTJIT, PLBARTJIT, GPT2JIT, and CodeGPTJIT, each with a distinct pre-trained model as its backbone. We systematically explore the differences and connections between these models. Specifically, we investigate the performance of the models when using Commit code and Commit message as inputs, as well as the relationship between training efficiency and model distribution among these six models. Additionally, we conduct an ablation experiment to explore the sensitivity of each model to inputs. Furthermore, we investigate how the models perform in zero-shot and few-shot scenarios. Our findings indicate that each model based on a different backbone shows improvements, and when the backbones' pre-training models are similar, the training resources that need to be consumed are much closer. We also observe that Commit code plays a significant role in defect detection, and different pre-trained models demonstrate better defect detection ability with a balanced dataset under few-shot scenarios. These results provide new insights for optimizing JIT defect prediction tasks using pre-trained models and highlight the factors that require more attention when constructing such models. Additionally, CodeGPTJIT and GPT2JIT achieved better performance than DeepJIT and CC2Vec on the two datasets, respectively, under 2000 training samples. These findings emphasize the effectiveness of transformer-based pre-trained models in JIT defect prediction tasks, especially in scenarios with limited training data.
+
+
+
+
+ + ☆ Graph Self-Contrast Representation Learning ICDM 2023 + + +
+ Graph contrastive learning (GCL) has recently emerged as a promising approach for graph representation learning. Some existing methods adopt the 1-vs-K scheme to construct one positive and K negative samples for each graph, but it is difficult to set K. For those methods that do not use negative samples, it is often necessary to add additional strategies to avoid model collapse, which can only alleviate the problem to some extent. All these drawbacks will undoubtedly have an adverse impact on the generalizability and efficiency of the model. In this paper, to address these issues, we propose a novel graph self-contrast framework, GraphSC, which uses only one positive and one negative sample and chooses the triplet loss as the objective. Specifically, self-contrast has two implications. First, GraphSC generates both positive and negative views of a graph sample from the graph itself via graph augmentation functions of various intensities and uses them for self-contrast. Second, GraphSC uses the Hilbert-Schmidt Independence Criterion (HSIC) to factorize the representations into multiple factors and proposes a masked self-contrast mechanism to better separate positive and negative samples. Further, since the triplet loss only optimizes the relative distance between the anchor and its positive/negative samples, it is difficult to ensure a small absolute distance between the anchor and the positive sample. Therefore, we explicitly reduce the absolute distance between the anchor and the positive sample to accelerate convergence. Finally, we conduct extensive experiments to evaluate the performance of GraphSC against 19 other state-of-the-art methods in both unsupervised and transfer learning settings.
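The shape of such an objective (a relative triplet margin term plus an explicit absolute anchor-positive term) can be sketched in a few lines of numpy; the random embeddings, margin, and weighting below are placeholders rather than the paper's architecture or augmentations:

```python
import numpy as np

rng = np.random.default_rng(0)
anchor   = rng.normal(size=(32, 128))                    # embeddings of the original graphs
positive = anchor + 0.1 * rng.normal(size=(32, 128))     # weakly augmented (positive) views
negative = rng.normal(size=(32, 128))                    # strongly perturbed (negative) views

d_ap = np.linalg.norm(anchor - positive, axis=1)
d_an = np.linalg.norm(anchor - negative, axis=1)

margin, alpha = 1.0, 0.1
triplet = np.maximum(d_ap - d_an + margin, 0.0).mean()   # only constrains relative distances
absolute = d_ap.mean()                                   # explicit absolute anchor-positive term
loss = triplet + alpha * absolute
print(f"triplet: {triplet:.3f}  absolute a-p: {absolute:.3f}  total: {loss:.3f}")
```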
+
+ comment: ICDM 2023(Regular) +
+
+
+
+
+ + ☆ Inferring effective couplings with Restricted Boltzmann Machines + + +
+ Generative models offer a direct way to model complex data. Among them, energy-based models provide us with a neural network model that aims to accurately reproduce all statistical correlations observed in the data at the level of the Boltzmann weight of the model. However, one challenge is to understand the physical interpretation of such models. In this study, we propose a simple solution by implementing a direct mapping between the energy function of the Restricted Boltzmann Machine and an effective Ising spin Hamiltonian that includes high-order interactions between spins. This mapping includes interactions of all possible orders, going beyond the conventional pairwise interactions typically considered in the inverse Ising approach, and allowing the description of complex datasets. Earlier work attempted to achieve this goal, but the proposed mappings did not properly treat the complexity of the problem or did not contain direct prescriptions for practical application. To validate our method, we perform several controlled numerical experiments where the training samples are equilibrium samples of predefined models containing local external fields as well as two-body and three-body interactions in various low-dimensional topologies. The results demonstrate the effectiveness of our proposed approach in learning the correct interaction network and pave the way for its application in modeling interesting datasets. We also evaluate the quality of the inferred model based on different training methods.
+
+ comment: 16 figures, 22 pages +
+
+
+
+
+ + ☆ Haystack: A Panoptic Scene Graph Dataset to Evaluate Rare Predicate + Classes + + +
+ Current scene graph datasets suffer from strong long-tail distributions of +their predicate classes. Due to a very low number of some predicate classes in +the test sets, no reliable metrics can be retrieved for the rarest classes. We +construct a new panoptic scene graph dataset and a set of metrics that are +designed as a benchmark for the predictive performance especially on rare +predicate classes. To construct the new dataset, we propose a model-assisted +annotation pipeline that efficiently finds rare predicate classes that are +hidden in a large set of images like needles in a haystack. + Contrary to prior scene graph datasets, Haystack contains explicit negative +annotations, i.e. annotations that a given relation does not have a certain +predicate class. Negative annotations are helpful especially in the field of +scene graph generation and open up a whole new set of possibilities to improve +current scene graph generation models. + Haystack is 100% compatible with existing panoptic scene graph datasets and +can easily be integrated with existing evaluation pipelines. Our dataset and +code can be found here: https://lorjul.github.io/haystack/. It includes +annotation files and simple to use scripts and utilities, to help with +integrating our dataset in existing work. + +
+
+
+
+
+ + ☆ PromptTTS 2: Describing and Generating Voices with Text Prompt + + +
+ Speech conveys more information than just text, as the same word can be uttered in various voices to convey diverse information. Compared to traditional text-to-speech (TTS) methods relying on speech prompts (reference speech) for voice variability, using text prompts (descriptions) is more user-friendly since speech prompts can be hard to find or may not exist at all. TTS approaches based on the text prompt face two challenges: 1) the one-to-many problem, where not all details about voice variability can be described in the text prompt, and 2) the limited availability of text prompt datasets, which require vendors and a large data-labeling cost to write text prompts for speech. In this work, we introduce PromptTTS 2 to address these challenges with a variation network that provides the variability information of voice not captured by text prompts, and a prompt generation pipeline that utilizes large language models (LLMs) to compose high-quality text prompts. Specifically, the variation network predicts the representation extracted from the reference speech (which contains full information about voice) based on the text prompt representation. The prompt generation pipeline generates text prompts for speech using a speech understanding model to recognize voice attributes (e.g., gender, speed) from speech and a large language model to formulate the text prompt based on the recognition results. Experiments on a large-scale (44K hours) speech dataset demonstrate that, compared to previous works, PromptTTS 2 generates voices more consistent with text prompts and supports the sampling of diverse voice variability, thereby offering users more choices for voice generation. Additionally, the prompt generation pipeline produces high-quality prompts, eliminating the large labeling cost. The demo page of PromptTTS 2 is available online\footnote{https://speechresearch.github.io/prompttts2}.
+
+ comment: Demo page: https://speechresearch.github.io/prompttts2 +
+
+
+
+
+ + ☆ s-ID: Causal Effect Identification in a Sub-Population + + +
+ Causal inference in a sub-population involves identifying the causal effect +of an intervention on a specific subgroup within a larger population. However, +ignoring the subtleties introduced by sub-populations can either lead to +erroneous inference or limit the applicability of existing methods. We +introduce and advocate for a causal inference problem in sub-populations +(henceforth called s-ID), in which we merely have access to observational data +of the targeted sub-population (as opposed to the entire population). Existing +inference problems in sub-populations operate on the premise that the given +data distributions originate from the entire population, thus, cannot tackle +the s-ID problem. To address this gap, we provide necessary and sufficient +conditions that must hold in the causal graph for a causal effect in a +sub-population to be identifiable from the observational distribution of that +sub-population. Given these conditions, we present a sound and complete +algorithm for the s-ID problem. + +
+
+ comment: 22 pages, 14 figures, 1 table +
+
+
+
+
+ + ☆ A Comparison of Residual-based Methods on Fault Detection + + +
+ An important initial step in fault detection for complex industrial systems +is gaining an understanding of their health condition. Subsequently, continuous +monitoring of this health condition becomes crucial to observe its evolution, +track changes over time, and isolate faults. As faults are typically rare +occurrences, it is essential to perform this monitoring in an unsupervised +manner. Various approaches have been proposed not only to detect faults in an +unsupervised manner but also to distinguish between different potential fault +types. In this study, we perform a comprehensive comparison between two +residual-based approaches: autoencoders, and the input-output models that +establish a mapping between operating conditions and sensor readings. We +explore the sensor-wise residuals and aggregated residuals for the entire +system in both methods. The performance evaluation focuses on three tasks: +health indicator construction, fault detection, and health indicator +interpretation. To perform the comparison, we utilize the Commercial Modular +Aero-Propulsion System Simulation (C-MAPSS) dynamical model, specifically a +subset of the turbofan engine dataset containing three different fault types. +All models are trained exclusively on healthy data. Fault detection is achieved +by applying a threshold that is determined based on the healthy condition. The +detection results reveal that both models are capable of detecting faults with +an average delay of around 20 cycles and maintain a low false positive rate. +While the fault detection performance is similar for both models, the +input-output model provides better interpretability regarding potential fault +types and the possible faulty components. + +
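A compact sketch of the input-output flavour of this comparison on synthetic data: regress sensor readings on operating conditions using healthy data only, aggregate the sensor-wise residuals into a health indicator, and calibrate a detection threshold on the healthy residuals. The model choice, dimensions, and injected drift below are illustrative assumptions, not the C-MAPSS setup:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
ops_healthy = rng.uniform(size=(500, 3))                   # operating conditions (healthy)
sensors_healthy = ops_healthy @ rng.normal(size=(3, 10)) + 0.05 * rng.normal(size=(500, 10))

model = LinearRegression().fit(ops_healthy, sensors_healthy)  # input-output model

def health_indicator(ops, sensors):
    residual = sensors - model.predict(ops)                # sensor-wise residuals
    return np.linalg.norm(residual, axis=1)                # aggregated residual per time step

threshold = np.quantile(health_indicator(ops_healthy, sensors_healthy), 0.99)

# A "faulty" segment: same operating conditions, but one sensor slowly drifts.
ops_test = rng.uniform(size=(100, 3))
sensors_test = ops_test @ model.coef_.T + model.intercept_
sensors_test[:, 0] += np.linspace(0, 1, 100)               # injected fault on sensor 0
flags = health_indicator(ops_test, sensors_test) > threshold
print("flagged time steps:", flags.sum(), "of", flags.size)
```

Keeping the sensor-wise residuals (before aggregation) is what allows the faulty component to be narrowed down afterwards.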
+
+ comment: 10 pages, submitted to the 15th Annual Conference of the Prognostics + and Health Management Society +
+
+
+
+
+ + ☆ Graph-Based Automatic Feature Selection for Multi-Class Classification + via Mean Simplified Silhouette + + +
+ This paper introduces a novel graph-based filter method for automatic feature +selection (abbreviated as GB-AFS) for multi-class classification tasks. The +method determines the minimum combination of features required to sustain +prediction performance while maintaining complementary discriminating abilities +between different classes. It does not require any user-defined parameters such +as the number of features to select. The methodology employs the +Jeffries-Matusita (JM) distance in conjunction with t-distributed Stochastic +Neighbor Embedding (t-SNE) to generate a low-dimensional space reflecting how +effectively each feature can differentiate between each pair of classes. The +minimum number of features is selected using our newly developed Mean +Simplified Silhouette (abbreviated as MSS) index, designed to evaluate the +clustering results for the feature selection task. Experimental results on +public data sets demonstrate the superior performance of the proposed GB-AFS +over other filter-based techniques and automatic feature selection approaches. +Moreover, the proposed algorithm maintained the accuracy achieved when +utilizing all features, while using only $7\%$ to $30\%$ of the features. +Consequently, this resulted in a reduction of the time needed for +classifications, from $15\%$ to $70\%$. + +
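As a small illustration of the per-feature, per-class-pair separability score underlying such a pipeline, the following sketch computes the Jeffries-Matusita (JM) distance for one feature and one class pair under a univariate Gaussian assumption; the synthetic data and the Gaussian simplification are assumptions for illustration:

```python
import numpy as np

def jm_distance(a, b, eps=1e-12):
    """JM distance between two 1-D samples, assuming Gaussian class distributions."""
    m1, m2 = a.mean(), b.mean()
    v1, v2 = a.var() + eps, b.var() + eps
    # Bhattacharyya distance for two univariate Gaussians.
    bhat = 0.25 * (m1 - m2) ** 2 / (v1 + v2) \
         + 0.5 * np.log((v1 + v2) / (2 * np.sqrt(v1 * v2)))
    return 2.0 * (1.0 - np.exp(-bhat))      # bounded in [0, 2]; larger = more separable

rng = np.random.default_rng(0)
class_a = rng.normal(0.0, 1.0, size=500)    # feature values of one class
class_b = rng.normal(1.5, 1.2, size=500)    # feature values of another class
print(f"JM distance: {jm_distance(class_a, class_b):.3f}")
```

Computing this score for every feature and every class pair yields the separability map that the t-SNE embedding and the MSS-based selection then operate on.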
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ MA-VAE: Multi-head Attention-based Variational Autoencoder Approach for + Anomaly Detection in Multivariate Time-series Applied to Automotive Endurance + Powertrain Testing + + +
+ A clear need for automatic anomaly detection applied to automotive testing has emerged as more and more attention is paid to the recorded data and manual evaluation by humans reaches its capacity. Such real-world data is massive, diverse, multivariate and temporal in nature, therefore requiring modelling of the testee behaviour. We propose a variational autoencoder with multi-head attention (MA-VAE), which, when trained on unlabelled data, not only produces very few false positives but also manages to detect the majority of the anomalies presented. In addition, the approach offers a novel way to avoid the bypass phenomenon, an undesirable behaviour investigated in the literature. Lastly, the approach also introduces a new method to remap individual windows to a continuous time series. The results are presented in the context of a real-world industrial data set, and several experiments are undertaken to further investigate certain aspects of the proposed model. When configured properly, it is wrong only 9% of the time when an anomaly is flagged and discovers 67% of the anomalies present. Also, MA-VAE has the potential to perform well with only a fraction of the training and validation subset; however, to exploit this, a more sophisticated threshold estimation method is required.
+
+ comment: Accepted in NCTA2023 +
+
+
+
+
+ + ☆ RoBoSS: A Robust, Bounded, Sparse, and Smooth Loss Function for + Supervised Learning + + +
+ In the domain of machine learning algorithms, the significance of the loss +function is paramount, especially in supervised learning tasks. It serves as a +fundamental pillar that profoundly influences the behavior and efficacy of +supervised learning algorithms. Traditional loss functions, while widely used, +often struggle to handle noisy and high-dimensional data, impede model +interpretability, and lead to slow convergence during training. In this paper, +we address the aforementioned constraints by proposing a novel robust, bounded, +sparse, and smooth (RoBoSS) loss function for supervised learning. Further, we +incorporate the RoBoSS loss function within the framework of support vector +machine (SVM) and introduce a new robust algorithm named +$\mathcal{L}_{rbss}$-SVM. For the theoretical analysis, the +classification-calibrated property and generalization ability are also +presented. These investigations are crucial for gaining deeper insights into +the performance of the RoBoSS loss function in the classification tasks and its +potential to generalize well to unseen data. To empirically demonstrate the +effectiveness of the proposed $\mathcal{L}_{rbss}$-SVM, we evaluate it on $88$ +real-world UCI and KEEL datasets from diverse domains. Additionally, to +exemplify the effectiveness of the proposed $\mathcal{L}_{rbss}$-SVM within the +biomedical realm, we evaluated it on two medical datasets: the +electroencephalogram (EEG) signal dataset and the breast cancer (BreaKHis) +dataset. The numerical results substantiate the superiority of the proposed +$\mathcal{L}_{rbss}$-SVM model, both in terms of its remarkable generalization +performance and its efficiency in training time. + +
+
+
+
+
+ + ☆ Encoding Seasonal Climate Predictions for Demand Forecasting with + Modular Neural Network + + +
+ Current time-series forecasting problems use short-term weather attributes as +exogenous inputs. However, in specific time-series forecasting solutions (e.g., +demand prediction in the supply chain), seasonal climate predictions are +crucial to improve its resilience. Representing mid to long-term seasonal +climate forecasts is challenging as seasonal climate predictions are uncertain, +and encoding spatio-temporal relationship of climate forecasts with demand is +complex. + We propose a novel modeling framework that efficiently encodes seasonal +climate predictions to provide robust and reliable time-series forecasting for +supply chain functions. The encoding framework enables effective learning of +latent representations -- be it uncertain seasonal climate prediction or other +time-series data (e.g., buyer patterns) -- via a modular neural network +architecture. Our extensive experiments indicate that learning such +representations to model seasonal climate forecast results in an error +reduction of approximately 13\% to 17\% across multiple real-world data sets +compared to existing demand forecasting methods. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Self-Similarity-Based and Novelty-based loss for music structure + analysis + + +
+ Music Structure Analysis (MSA) is the task aiming at identifying musical +segments that compose a music track and possibly label them based on their +similarity. In this paper we propose a supervised approach for the task of +music boundary detection. In our approach we simultaneously learn features and +convolution kernels. For this we jointly optimize -- a loss based on the +Self-Similarity-Matrix (SSM) obtained with the learned features, denoted by +SSM-loss, and -- a loss based on the novelty score obtained applying the +learned kernels to the estimated SSM, denoted by novelty-loss. We also +demonstrate that relative feature learning, through self-attention, is +beneficial for the task of MSA. Finally, we compare the performances of our +approach to previously proposed approaches on the standard RWC-Pop, and various +subsets of SALAMI. + +
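The two classical ingredients such losses build on (a self-similarity matrix and a checkerboard-kernel novelty curve) can be sketched as follows; the random frame features stand in for learned embeddings, and the kernel size and peak picking are arbitrary illustrative choices:

```python
import numpy as np

rng = np.random.default_rng(0)
feats = rng.normal(size=(200, 16))                       # stand-in for learned frame features
feats /= np.linalg.norm(feats, axis=1, keepdims=True)
ssm = feats @ feats.T                                    # cosine self-similarity matrix (SSM)

L = 8                                                    # half-size of the checkerboard kernel
s = np.concatenate([-np.ones(L), np.ones(L)])
kernel = np.outer(s, s)                                  # +1 / -1 checkerboard pattern

novelty = np.zeros(len(feats))
for t in range(L, len(feats) - L):                       # correlate the kernel along the diagonal
    novelty[t] = np.sum(kernel * ssm[t - L:t + L, t - L:t + L])

boundaries = np.argsort(-novelty)[:5]                    # crude peak picking
print("candidate boundary frames:", np.sort(boundaries))
```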
+
+
+
+
+ + ☆ Sample Size in Natural Language Processing within Healthcare Research + + +
+ Sample size calculation is an essential step in most data-based disciplines. +Large enough samples ensure representativeness of the population and determine +the precision of estimates. This is true for most quantitative studies, +including those that employ machine learning methods, such as natural language +processing, where free-text is used to generate predictions and classify +instances of text. Within the healthcare domain, the lack of sufficient corpora +of previously collected data can be a limiting factor when determining sample +sizes for new studies. This paper tries to address the issue by making +recommendations on sample sizes for text classification tasks in the healthcare +domain. + Models trained on the MIMIC-III database of critical care records from Beth +Israel Deaconess Medical Center were used to classify documents as having or +not having Unspecified Essential Hypertension, the most common diagnosis code +in the database. Simulations were performed using various classifiers on +different sample sizes and class proportions. This was repeated for a +comparatively less common diagnosis code within the database of diabetes +mellitus without mention of complication. + Smaller sample sizes resulted in better results when using a K-nearest +neighbours classifier, whereas larger sample sizes provided better results with +support vector machines and BERT models. Overall, a sample size larger than +1000 was sufficient to provide decent performance metrics. + The simulations conducted within this study provide guidelines that can be +used as recommendations for selecting appropriate sample sizes and class +proportions, and for predicting expected performance, when building classifiers +for textual healthcare data. The methodology used here can be modified for +sample size estimates calculations with other datasets. + +
+
+ comment: Submitted to Journal of Biomedical Informatics +
+
+
+
+
+ + ☆ Distributionally Robust Model-based Reinforcement Learning with Large + State Spaces + + +
Three major challenges in reinforcement learning are the complex dynamical +systems with large state spaces, the costly data acquisition processes, and the +deviation of real-world dynamics from the training environment at deployment. To +overcome these issues, we study distributionally robust Markov decision +processes with continuous state spaces under the widely used Kullback-Leibler, +chi-square, and total variation uncertainty sets. We propose a model-based +approach that utilizes Gaussian Processes and the maximum variance reduction +algorithm to efficiently learn multi-output nominal transition dynamics, +leveraging access to a generative model (i.e., simulator). We further +demonstrate the statistical sample complexity of the proposed method for +different uncertainty sets. These complexity bounds are independent of the +number of states and extend beyond linear dynamics, ensuring the effectiveness +of our approach in identifying near-optimal distributionally-robust policies. +The proposed method can be further combined with other model-free +distributionally robust reinforcement learning methods to obtain a near-optimal +robust policy. Experimental results demonstrate the robustness of our algorithm +to distributional shifts and its superior performance in terms of the number of +samples needed. + +
+
+
+
+
+ + ☆ Improving equilibrium propagation without weight symmetry through + Jacobian homeostasis + + +
+ Equilibrium propagation (EP) is a compelling alternative to the +backpropagation of error algorithm (BP) for computing gradients of neural +networks on biological or analog neuromorphic substrates. Still, the algorithm +requires weight symmetry and infinitesimal equilibrium perturbations, i.e., +nudges, to estimate unbiased gradients efficiently. Both requirements are +challenging to implement in physical systems. Yet, whether and how weight +asymmetry affects its applicability is unknown because, in practice, it may be +masked by biases introduced through the finite nudge. To address this question, +we study generalized EP, which can be formulated without weight symmetry, and +analytically isolate the two sources of bias. For complex-differentiable +non-symmetric networks, we show that the finite nudge does not pose a problem, +as exact derivatives can still be estimated via a Cauchy integral. In contrast, +weight asymmetry introduces bias resulting in low task performance due to poor +alignment of EP's neuronal error vectors compared to BP. To mitigate this +issue, we present a new homeostatic objective that directly penalizes +functional asymmetries of the Jacobian at the network's fixed point. This +homeostatic objective dramatically improves the network's ability to solve +complex tasks such as ImageNet 32x32. Our results lay the theoretical +groundwork for studying and mitigating the adverse effects of imperfections of +physical networks on learning algorithms that rely on the substrate's +relaxation dynamics. + +
+
+
+
+
+ + ☆ Distributionally Robust Machine Learning with Multi-source Data + + +
+ Classical machine learning methods may lead to poor prediction performance +when the target distribution differs from the source populations. This paper +utilizes data from multiple sources and introduces a group distributionally +robust prediction model defined to optimize an adversarial reward about +explained variance with respect to a class of target distributions. Compared to +classical empirical risk minimization, the proposed robust prediction model +improves the prediction accuracy for target populations with distribution +shifts. We show that our group distributionally robust prediction model is a +weighted average of the source populations' conditional outcome models. We +leverage this key identification result to robustify arbitrary machine learning +algorithms, including, for example, random forests and neural networks. We +devise a novel bias-corrected estimator to estimate the optimal aggregation +weight for general machine-learning algorithms and demonstrate its improvement +in the convergence rate. Our proposal can be seen as a distributionally robust +federated learning approach that is computationally efficient and easy to +implement using arbitrary machine learning base algorithms, satisfies some +privacy constraints, and has a nice interpretation of different sources' +importance for predicting a given target covariate distribution. We demonstrate +the performance of our proposed group distributionally robust method on +simulated and real data with random forests and neural networks as +base-learning algorithms. + +
+
+
+
+
+ + ☆ Language Models for Novelty Detection in System Call Traces + + +
+ Due to the complexity of modern computer systems, novel and unexpected +behaviors frequently occur. Such deviations are either normal occurrences, such +as software updates and new user activities, or abnormalities, such as +misconfigurations, latency issues, intrusions, and software bugs. Regardless, +novel behaviors are of great interest to developers, and there is a genuine +need for efficient and effective methods to detect them. Nowadays, researchers +consider system calls to be the most fine-grained and accurate source of +information to investigate the behavior of computer systems. Accordingly, this +paper introduces a novelty detection methodology that relies on a probability +distribution over sequences of system calls, which can be seen as a language +model. Language models estimate the likelihood of sequences, and since +novelties deviate from previously observed behaviors by definition, they would +be unlikely under the model. Following the success of neural networks for +language models, three architectures are evaluated in this work: the widespread +LSTM, the state-of-the-art Transformer, and the lower-complexity Longformer. +However, large neural networks typically require an enormous amount of data to +be trained effectively, and to the best of our knowledge, no massive modern +datasets of kernel traces are publicly available. This paper addresses this +limitation by introducing a new open-source dataset of kernel traces comprising +over 2 million web requests with seven distinct behaviors. The proposed +methodology requires minimal expert hand-crafting and achieves an F-score and +AuROC greater than 95% on most novelties while being data- and task-agnostic. +The source code and trained models are publicly available on GitHub while the +datasets are available on Zenodo. + +
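The core idea, a sequence model that assigns low likelihood to novel traces, can be sketched as follows; this toy LSTM scorer is an assumption-laden illustration, not the paper's LSTM/Transformer/Longformer setup or its thresholds.

```python
import torch
import torch.nn as nn

class SyscallLM(nn.Module):
    """Tiny LSTM language model over system-call token IDs (illustrative only)."""
    def __init__(self, vocab_size, emb=64, hidden=128):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb)
        self.lstm = nn.LSTM(emb, hidden, batch_first=True)
        self.out = nn.Linear(hidden, vocab_size)

    def forward(self, tokens):            # tokens: (B, T)
        h, _ = self.lstm(self.emb(tokens))
        return self.out(h)                # logits: (B, T, V)

def sequence_nll(model, tokens):
    """Average next-token negative log-likelihood; high values suggest novelty."""
    logits = model(tokens[:, :-1])
    targets = tokens[:, 1:]
    loss = nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)), targets.reshape(-1), reduction="none")
    return loss.view(targets.shape).mean(dim=1)   # one score per trace

# usage: traces whose NLL exceeds a threshold fitted on "normal" data are flagged as novel
model = SyscallLM(vocab_size=400)
scores = sequence_nll(model, torch.randint(0, 400, (8, 50)))
```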
+
+ comment: 12 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ On the Complexity of Differentially Private Best-Arm Identification with + Fixed Confidence + + +
+ Best Arm Identification (BAI) problems are progressively used for +data-sensitive applications, such as designing adaptive clinical trials, tuning +hyper-parameters, and conducting user studies to name a few. Motivated by the +data privacy concerns invoked by these applications, we study the problem of +BAI with fixed confidence under $\epsilon$-global Differential Privacy (DP). +First, to quantify the cost of privacy, we derive a lower bound on the sample +complexity of any $\delta$-correct BAI algorithm satisfying $\epsilon$-global +DP. Our lower bound suggests the existence of two privacy regimes depending on +the privacy budget $\epsilon$. In the high-privacy regime (small $\epsilon$), +the hardness depends on a coupled effect of privacy and a novel +information-theoretic quantity, called the Total Variation Characteristic Time. +In the low-privacy regime (large $\epsilon$), the sample complexity lower bound +reduces to the classical non-private lower bound. Second, we propose AdaP-TT, +an $\epsilon$-global DP variant of the Top Two algorithm. AdaP-TT runs in +arm-dependent adaptive episodes and adds Laplace noise to ensure a good +privacy-utility trade-off. We derive an asymptotic upper bound on the sample +complexity of AdaP-TT that matches with the lower bound up to multiplicative +constants in the high-privacy regime. Finally, we provide an experimental +analysis of AdaP-TT that validates our theoretical results. + +
+
+
+
+
+ + ☆ Sparse Function-space Representation of Neural Networks ICML 2023 + + +
+ Deep neural networks (NNs) are known to lack uncertainty estimates and +struggle to incorporate new data. We present a method that mitigates these +issues by converting NNs from weight space to function space, via a dual +parameterization. Importantly, the dual parameterization enables us to +formulate a sparse representation that captures information from the entire +data set. This offers a compact and principled way of capturing uncertainty and +enables us to incorporate new data without retraining whilst retaining +predictive performance. We provide proof-of-concept demonstrations with the +proposed approach for quantifying uncertainty in supervised learning on UCI +benchmark tasks. + +
+
+ comment: Accepted to ICML 2023 Workshop on Duality for Modern Machine + Learning, Honolulu, Hawaii, USA. 4 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ Personalized Federated Deep Reinforcement Learning-based Trajectory + Optimization for Multi-UAV Assisted Edge Computing + + +
+ In the era of 5G mobile communication, there has been a significant surge in +research focused on unmanned aerial vehicles (UAVs) and mobile edge computing +technology. UAVs can serve as intelligent servers in edge computing +environments, optimizing their flight trajectories to maximize communication +system throughput. Deep reinforcement learning (DRL)-based trajectory +optimization algorithms may suffer from poor training performance due to +intricate terrain features and inadequate training data. To overcome this +limitation, some studies have proposed leveraging federated learning (FL) to +mitigate the data isolation problem and expedite convergence. Nevertheless, the +efficacy of global FL models can be negatively impacted by the high +heterogeneity of local data, which could potentially impede the training +process and even compromise the performance of local agents. This work proposes +a novel solution to address these challenges, namely personalized federated +deep reinforcement learning (PF-DRL), for multi-UAV trajectory optimization. +PF-DRL aims to develop individualized models for each agent to address the data +scarcity issue and mitigate the negative impact of data heterogeneity. +Simulation results demonstrate that the proposed algorithm achieves superior +training performance with faster convergence rates, and improves service +quality compared to other DRL-based approaches. + +
+
+
+
+
+ + ☆ Leveraging BERT Language Models for Multi-Lingual ESG Issue + Identification + + +
+ Environmental, Social, and Governance (ESG) has been used as a metric to +measure the negative impacts and enhance positive outcomes of companies in +areas such as the environment, society, and governance. Recently, investors +have increasingly recognized the significance of ESG criteria in their +investment choices, leading businesses to integrate ESG principles into their +operations and strategies. The Multi-Lingual ESG Issue Identification (ML-ESG) +shared task encompasses the classification of news documents into 35 distinct +ESG issue labels. In this study, we explored multiple strategies harnessing +BERT language models to achieve accurate classification of news documents +across these labels. Our analysis revealed that the RoBERTa classifier emerged +as one of the most successful approaches, securing the second-place position +for the English test dataset, and sharing the fifth-place position for the +French test dataset. Furthermore, our SVM-based binary model tailored for the +Chinese language exhibited exceptional performance, earning the second-place +rank on the test dataset. + +
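A hedged sketch of the RoBERTa fine-tuning route described above using the Hugging Face `transformers` Trainer; the in-memory example record, hyperparameters, and output path are placeholders rather than the shared-task configuration.

```python
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from datasets import Dataset

# hypothetical in-memory record; the real ML-ESG corpus is distributed by the shared task
data = Dataset.from_dict({"text": ["Company X cut emissions by 30%."], "label": [3]})

tok = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=35)

def encode(batch):
    # tokenize news documents to fixed-length inputs
    return tok(batch["text"], truncation=True, padding="max_length", max_length=256)

data = data.map(encode, batched=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="esg-roberta", num_train_epochs=3,
                           per_device_train_batch_size=16),
    train_dataset=data,
)
trainer.train()
```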
+
+
+
+
+ + ☆ Bias Propagation in Federated Learning + + +
We show that participating in federated learning can be detrimental to group +fairness. In fact, the bias of a few parties against under-represented groups +(identified by sensitive attributes such as gender or race) can propagate +through the network to all participating parties. We analyze and explain +bias propagation in federated learning on naturally partitioned real-world +datasets. Our analysis reveals that biased parties unintentionally yet +stealthily encode their bias in a small number of model parameters, and +throughout the training, they steadily increase the dependence of the global +model on sensitive attributes. Importantly, the bias experienced in federated +learning is higher than what parties would otherwise encounter in centralized +training with a model trained on the union of all their data. This indicates +that the bias is due to the algorithm. Our +work calls for auditing group fairness in federated learning and designing +learning algorithms that are robust to bias propagation. + +
+
+
+
+
+ + ☆ Model-based Offline Policy Optimization with Adversarial Network ECAI + 2023 + + +
Model-based offline reinforcement learning (RL), which builds a supervised +transition model from a logged dataset to avoid costly interactions with the +online environment, has been a promising approach for offline policy +optimization. As the discrepancy between the logging data and online +environment may result in a distributional shift problem, many prior works have +studied how to build robust transition models conservatively and estimate the +model uncertainty accurately. However, over-conservatism can limit the +exploration of the agent, and the uncertainty estimates may be unreliable. In +this work, we propose a novel Model-based Offline policy optimization framework +with Adversarial Network (MOAN). The key idea is to use adversarial learning to +build a transition model with better generalization, where an adversary is +introduced to distinguish between in-distribution and out-of-distribution +samples. Moreover, the adversary can naturally provide a quantification of the +model's uncertainty with theoretical guarantees. Extensive experiments show +that our approach outperforms existing state-of-the-art baselines on widely +studied offline RL benchmarks. It can also generate diverse in-distribution +samples and quantify the uncertainty more accurately. + +
+
+ comment: Accepted by 26th European Conference on Artificial Intelligence ECAI + 2023 +
+
+
+
+
+ + ☆ Making Large Language Models Better Reasoners with Alignment + + +
Reasoning is a cognitive process of using evidence to reach a sound +conclusion. The reasoning capability is essential for large language models +(LLMs) to serve as the brain of an artificial general intelligence agent. +Recent studies reveal that fine-tuning LLMs on data with the chain of thought +(COT) reasoning process can significantly enhance their reasoning capabilities. +However, we find that the fine-tuned LLMs suffer from an \textit{Assessment +Misalignment} problem, i.e., they frequently assign higher scores to subpar +COTs, leading to potential limitations in their reasoning abilities. To address +this problem, we introduce an \textit{Alignment Fine-Tuning (AFT)} paradigm, +which involves three steps: 1) fine-tuning LLMs with COT training data; 2) +generating multiple COT responses for each question, and categorizing them into +positive and negative ones based on whether they achieve the correct answer; 3) +calibrating the scores of positive and negative responses given by LLMs with a +novel constraint alignment loss. Specifically, the constraint alignment loss +has two objectives: a) Alignment, which guarantees that positive scores surpass +negative scores to encourage answers with high-quality COTs; b) Constraint, +which keeps the negative scores confined to a reasonable range to prevent +model degradation. Beyond just the binary positive and negative feedback, the +constraint alignment loss can be seamlessly adapted to ranking situations +when ranking feedback is accessible. Furthermore, we delve deeply into +recent ranking-based alignment methods, such as DPO, RRHF, and PRO, and +discover that the constraint, which has been overlooked by these approaches, is +also crucial for their performance. Extensive experiments on four reasoning +benchmarks with both binary and ranking feedback demonstrate the effectiveness +of AFT. + +
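The alignment and constraint objectives can be pictured with a simple margin-plus-floor loss over chain-of-thought scores; this is a speculative sketch of the idea only, not the paper's constraint alignment loss.

```python
import torch

def alignment_constraint_loss(pos_scores, neg_scores, margin=1.0, floor=-5.0):
    """Illustrative sketch (not the paper's exact formulation):
    - alignment: positive COT scores should exceed negative ones by a margin
    - constraint: negative scores should not collapse below a floor,
      guarding against degrading the underlying model."""
    # pair every positive score with every negative score
    diff = neg_scores.unsqueeze(0) - pos_scores.unsqueeze(1) + margin
    alignment = torch.clamp(diff, min=0).mean()
    constraint = torch.clamp(floor - neg_scores, min=0).mean()
    return alignment + constraint

pos = torch.tensor([-1.2, -0.8])        # e.g. mean log-probs of correct chains of thought
neg = torch.tensor([-1.0, -3.9, -2.5])  # scores of incorrect chains of thought
loss = alignment_constraint_loss(pos, neg)
```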
+
+ comment: Large Language Models; Reasoning; Alignment +
+
+
+
+
+ + ☆ A Lightweight, Rapid and Efficient Deep Convolutional Network for Chest + X-Ray Tuberculosis Detection + + +
+ Tuberculosis (TB) is still recognized as one of the leading causes of death +worldwide. Recent advances in deep learning (DL) have shown to enhance +radiologists' ability to interpret chest X-ray (CXR) images accurately and with +fewer errors, leading to a better diagnosis of this disease. However, little +work has been done to develop models capable of diagnosing TB that offer good +performance while being efficient, fast and computationally inexpensive. In +this work, we propose LightTBNet, a novel lightweight, fast and efficient deep +convolutional network specially customized to detect TB from CXR images. Using +a total of 800 frontal CXR images from two publicly available datasets, our +solution yielded an accuracy, F1 and area under the ROC curve (AUC) of 0.906, +0.907 and 0.961, respectively, on an independent test subset. The proposed +model demonstrates outstanding performance while delivering a rapid prediction, +with minimal computational and memory requirements, making it highly suitable +for deployment in handheld devices that can be used in low-resource areas with +high TB prevalence. Code publicly available at +https://github.com/dani-capellan/LightTBNet. + +
+
+ comment: 5 pages, 3 figures, 3 tables. This paper has been accepted at ISBI + 2023 +
+
+
+
+
+ + ☆ Generalized Simplicial Attention Neural Networks + + +
+ The aim of this work is to introduce Generalized Simplicial Attention Neural +Networks (GSANs), i.e., novel neural architectures designed to process data +defined on simplicial complexes using masked self-attentional layers. Hinging +on topological signal processing principles, we devise a series of +self-attention schemes capable of processing data components defined at +different simplicial orders, such as nodes, edges, triangles, and beyond. These +schemes learn how to weight the neighborhoods of the given topological domain +in a task-oriented fashion, leveraging the interplay among simplices of +different orders through the Dirac operator and its Dirac decomposition. We +also theoretically establish that GSANs are permutation equivariant and +simplicial-aware. Finally, we illustrate how our approach compares favorably +with other methods when applied to several (inductive and transductive) tasks +such as trajectory prediction, missing data imputation, graph classification, +and simplex prediction. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2203.07485 +
+
+
+
+
+ + ☆ A Simple Asymmetric Momentum Make SGD Greatest Again + + +
We propose the simplest SGD enhancement yet, Loss-Controlled Asymmetric +Momentum (LCAM), aimed directly at the saddle point problem. Compared to the +traditional SGD with Momentum, there is no increase in computational demand, yet +it outperforms all current optimizers. We use the concepts of weight +conjugation and traction effect to explain this phenomenon. We designed +experiments to rapidly reduce the learning rate at specified epochs to trap +parameters more easily at saddle points. We selected WRN28-10 as the test +network and chose CIFAR-10 and CIFAR-100 as test datasets, the same setup as +the original WRN and Cosine Annealing Scheduling (CAS) papers. We compared the +ability of Asymmetric Momentum with different priorities to bypass saddle +points. Finally, using WRN28-10 on CIFAR-100, we achieved a peak average +test accuracy of 80.78\% around epoch 120. For comparison, the original WRN +paper reported 80.75\%, while CAS was at 80.42\%, both at 200 epochs. This means +that while potentially increasing accuracy, we use nearly half the convergence +time. Our demonstration code is available at +https://github.com/hakumaicc/Asymmetric-Momentum-LCAM + +
+
+
+
+
+ + ☆ Exploiting Spatial-temporal Data for Sleep Stage Classification via + Hypergraph Learning + + +
Sleep stage classification is crucial for detecting patients' health +conditions. Existing models, which mainly use Convolutional Neural Networks +(CNN) for modelling Euclidean data and Graph Convolution Networks (GCN) for +modelling non-Euclidean data, are unable to consider the heterogeneity and +interactivity of multimodal data as well as the spatial-temporal correlation +simultaneously, which hinders further improvement of classification +performance. In this paper, we propose a dynamic learning framework STHL, which +introduces hypergraphs to encode spatial-temporal data for sleep stage +classification. Hypergraphs can encode multi-modal/multi-type data rather than +relying on simple pairwise connections between two subjects. STHL creates spatial and +temporal hyperedges separately to build node correlations, then it conducts a +type-specific hypergraph learning process to encode the attributes into the +embedding space. Extensive experiments show that our proposed STHL outperforms +the state-of-the-art models in sleep stage classification tasks. + +
+
+
+
+
+ + ☆ Leveraging Label Information for Multimodal Emotion Recognition + + +
+ Multimodal emotion recognition (MER) aims to detect the emotional status of a +given expression by combining the speech and text information. Intuitively, +label information should be capable of helping the model locate the salient +tokens/frames relevant to the specific emotion, which finally facilitates the +MER task. Inspired by this, we propose a novel approach for MER by leveraging +label information. Specifically, we first obtain the representative label +embeddings for both text and speech modalities, then learn the label-enhanced +text/speech representations for each utterance via label-token and label-frame +interactions. Finally, we devise a novel label-guided attentive fusion module +to fuse the label-aware text and speech representations for emotion +classification. Extensive experiments were conducted on the public IEMOCAP +dataset, and experimental results demonstrate that our proposed approach +outperforms existing baselines and achieves new state-of-the-art performance. + +
+
+ comment: Accepted by Interspeech 2023 +
+
+
+
+
+ + ☆ Iterative Superquadric Recomposition of 3D Objects from Multiple Views ICCV 2023 + + +
+ Humans are good at recomposing novel objects, i.e. they can identify +commonalities between unknown objects from general structure to finer detail, +an ability difficult to replicate by machines. We propose a framework, ISCO, to +recompose an object using 3D superquadrics as semantic parts directly from 2D +views without training a model that uses 3D supervision. To achieve this, we +optimize the superquadric parameters that compose a specific instance of the +object, comparing its rendered 3D view and 2D image silhouette. Our ISCO +framework iteratively adds new superquadrics wherever the reconstruction error +is high, abstracting first coarse regions and then finer details of the target +object. With this simple coarse-to-fine inductive bias, ISCO provides +consistent superquadrics for related object parts, despite not having any +semantic supervision. Since ISCO does not train any neural network, it is also +inherently robust to out-of-distribution objects. Experiments show that, +compared to recent single instance superquadrics reconstruction approaches, +ISCO provides consistently more accurate 3D reconstructions, even from images +in the wild. Code available at https://github.com/ExplainableML/ISCO . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ TensorBank: Tensor Lakehouse for Foundation Model Training + + +
Storing and streaming high dimensional data for foundation model training +has become a critical requirement with the rise of foundation models beyond natural +language. In this paper we introduce TensorBank, a petabyte scale tensor +lakehouse capable of streaming tensors from Cloud Object Store (COS) to GPU +memory at wire speed based on complex relational queries. We use Hierarchical +Statistical Indices (HSI) for query acceleration. Our architecture allows +tensors to be addressed directly at the block level using HTTP range reads. Once in GPU +memory, data can be transformed using PyTorch transforms. We provide a generic +PyTorch dataset type with a corresponding dataset factory that translates +relational queries and requested transformations into dataset instances. By making use +of the HSI, irrelevant blocks can be skipped without reading them, as those +indices contain statistics on their content at different hierarchical +resolution levels. This is an opinionated architecture powered by open +standards that makes heavy use of open-source technology. Although hardened +for production use with geospatial-temporal data, this architecture +generalizes to other use cases such as computer vision, computational neuroscience, +biological sequence analysis, and more. + +
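Block-level access over HTTP range reads, the mechanism mentioned above, can be illustrated in a few lines of Python; the object URL, block offset, and dtype below are hypothetical, and a real deployment would resolve them from the hierarchical statistical index.

```python
import numpy as np
import requests

def read_block(url, offset, nbytes, dtype=np.float32):
    """Fetch one tensor block from object storage with an HTTP range read
    (illustrative; a real system would add auth, retries, and index lookups)."""
    headers = {"Range": f"bytes={offset}-{offset + nbytes - 1}"}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    return np.frombuffer(resp.content, dtype=dtype)

# hypothetical object URL and block layout derived from a statistical index
block = read_block("https://cos.example.com/bucket/tensor.bin", offset=0, nbytes=4096)
```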
+
+
+
+
+ + ☆ An Efficient Approach to Unsupervised Out-of-Distribution Detection with + Variational Autoencoders + + +
This paper is concerned with deep generative models (DGMs) for unsupervised +out-of-distribution (OOD) detection. In particular, we focus on vanilla +Variational Autoencoders (VAE) that use a standard normal prior distribution +for the latent variables. These models have a smaller model size, enabling +faster training and inference, making them well-suited for resource-limited +applications compared to more complex DGMs. We propose a novel OOD score called +Error Reduction (ER) specifically designed for vanilla VAE. ER incorporates the +idea of reconstructing image inputs from their lossy counterparts and takes +into account the Kolmogorov complexity of the images. Experimental results on +diverse datasets demonstrate the superiority of our approach over baseline +methods. Our code is available at: https://github.com/ZJLAB-AMMI/VAE4OOD. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ BeeTLe: A Framework for Linear B-Cell Epitope Prediction and + Classification ECML + + +
The process of identifying and characterizing B-cell epitopes, which are the +portions of antigens recognized by antibodies, is important for our +understanding of the immune system, and for many applications including vaccine +development, therapeutics, and diagnostics. Computational epitope prediction is +challenging yet rewarding as it significantly reduces the time and cost of +laboratory work. Most of the existing tools do not have satisfactory +performance and only discriminate epitopes from non-epitopes. This paper +presents a new deep learning-based multi-task framework for linear B-cell +epitope prediction as well as antibody type-specific epitope classification. +Specifically, a sequence-based neural network model using recurrent layers and +Transformer blocks is developed. We propose an amino acid encoding method based +on eigen decomposition to help the model learn the representations of epitopes. +We introduce modifications to standard cross-entropy loss functions by +extending a logit adjustment technique to cope with the class imbalance. +Experimental results on data curated from the largest public epitope database +demonstrate the validity of the proposed methods and their superior performance +compared to competing approaches. + +
+
+ comment: 18 pages, 3 figures, accepted at ECML PKDD 2023 +
+
+
+
+
+ + ☆ Efficiency is Not Enough: A Critical Perspective of Environmentally + Sustainable AI + + +
Artificial Intelligence (AI) is currently spearheaded by machine learning +(ML) methods such as deep learning (DL) which have accelerated progress on many +tasks thought to be out of reach of AI. These ML methods can often be compute +hungry, energy intensive, and result in significant carbon emissions, a known +driver of anthropogenic climate change. Additionally, the platforms on which ML +systems run are associated with environmental impacts including and beyond +carbon emissions. The solution lionized by both industry and the ML community +to improve the environmental sustainability of ML is to increase the efficiency +with which ML systems operate in terms of both compute and energy consumption. +In this perspective, we argue that efficiency alone is not enough to make ML +environmentally sustainable as a technology. We do so by presenting three +high-level discrepancies in the effect of efficiency on the environmental +sustainability of ML when considering the many variables with which it interacts. +In doing so, we comprehensively demonstrate, at multiple levels of +granularity, both technical and non-technical reasons why efficiency is not +enough to fully remedy the environmental impacts of ML. Based on this, we +present and argue for systems thinking as a viable path towards improving the +environmental sustainability of ML holistically. + +
+
+ comment: 24 pages; 6 figures +
+
+
+
+
+ + ☆ MvFS: Multi-view Feature Selection for Recommender System CIKM 2023 + + +
+ Feature selection, which is a technique to select key features in recommender +systems, has received increasing research attention. Recently, Adaptive Feature +Selection (AdaFS) has shown remarkable performance by adaptively selecting +features for each data instance, considering that the importance of a given +feature field can vary significantly across data. However, this method still +has limitations in that its selection process could be easily biased to major +features that frequently occur. To address these problems, we propose +Multi-view Feature Selection (MvFS), which selects informative features for +each instance more effectively. Most importantly, MvFS employs a multi-view +network consisting of multiple sub-networks, each of which learns to measure +the feature importance of a part of data with different feature patterns. By +doing so, MvFS promotes a more balanced feature selection process mitigating +the bias problem towards dominant patterns. Moreover, MvFS adopts an effective +importance score modeling strategy which is applied independently to each field +without incurring dependency among features. Experimental results on real-world +datasets demonstrate the effectiveness of MvFS compared to state-of-the-art +baselines. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ No-Regret Caching with Noisy Request Estimates + + +
Online learning algorithms have been successfully used to design caching +policies with regret guarantees. Existing algorithms assume that the cache +knows the exact request sequence, but this may not be feasible in high load +and/or memory-constrained scenarios, where the cache may have access only to +sampled requests or to approximate request counters. In this paper, we +propose the Noisy-Follow-the-Perturbed-Leader (NFPL) algorithm, a variant of +the classic Follow-the-Perturbed-Leader (FPL) algorithm for the case where request +estimates are noisy, and we show that the proposed solution has sublinear regret +under specific conditions on the request estimator. The experimental evaluation compares the +proposed solution against classic caching policies and validates the proposed +approach under both synthetic and real request traces. + +
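A compact sketch of the Follow-the-Perturbed-Leader idea operating on noisy request counters; the perturbation scale and the Zipf-like toy trace are illustrative assumptions, not the NFPL algorithm's tuned parameters.

```python
import numpy as np

def fpl_cache(noisy_counts, cache_size, eta=1.0, rng=None):
    """Follow-the-Perturbed-Leader step with noisy cumulative request counts:
    perturb the counters and keep the top-k files (illustrative sketch)."""
    rng = np.random.default_rng() if rng is None else rng
    perturbed = noisy_counts + eta * rng.normal(size=noisy_counts.shape)
    return np.argsort(perturbed)[::-1][:cache_size]

# toy run: 100 files with Zipf-like popularity, counters estimated from sampled requests
rng = np.random.default_rng(0)
true_popularity = rng.zipf(1.3, size=100).astype(float)
noisy_counts = true_popularity + rng.normal(scale=2.0, size=100)  # noisy estimates
cached = fpl_cache(noisy_counts, cache_size=10, rng=rng)
```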
+
+
+
+
+ + ☆ Model-agnostic network inference enhancement from noisy measurements via + curriculum learning + + +
Noise is a pervasive element within real-world measurement data, +significantly undermining the performance of network inference models. However, +the quest for a comprehensive enhancement framework capable of bolstering noise +resistance across a diverse array of network inference models has remained +elusive. Here, we present an elegant and efficient framework tailored to +amplify the capabilities of network inference models in the presence of noise. +Leveraging curriculum learning, we mitigate the deleterious impact of noisy +samples on network inference models. Our proposed framework is model-agnostic, +seamlessly integrable into a plethora of model-based and model-free network +inference methods. Notably, we utilize one model-based and three model-free +network inference methods as the foundation. Extensive experimentation across +various synthetic and real-world networks, encapsulating diverse nonlinear +dynamic processes, showcases substantial performance augmentation under varied +noise types, particularly thriving in scenarios enriched with clean samples. +This framework's adeptness in fortifying both model-free and model-based +network inference methodologies paves the way towards a comprehensive and +unified enhancement framework, encompassing the entire spectrum of network +inference models. Available Code: https://github.com/xiaoyuans/MANIE. + +
+
+
+
+
+ + ☆ Probabilistic Self-supervised Learning via Scoring Rules Minimization + + +
In this paper, we propose a novel probabilistic self-supervised learning method via +Scoring Rule Minimization (ProSMIN), which leverages the power of probabilistic +models to enhance representation quality and mitigate collapsing +representations. Our proposed approach involves two neural networks: the online +network and the target network, which collaborate and learn the diverse +distribution of representations from each other through knowledge distillation. +By presenting the input samples in two augmented formats, the online network is +trained to predict the target network representation of the same sample under a +different augmented view. The two networks are trained via our new loss +function based on proper scoring rules. We provide a theoretical justification +for ProSMIN's convergence, demonstrating the strict propriety of its modified +scoring rule. This insight validates the method's optimization process and +contributes to its robustness and effectiveness in improving representation +quality. We evaluate our probabilistic model on various downstream tasks, such +as in-distribution generalization, out-of-distribution detection, dataset +corruption, low-shot learning, and transfer learning. Our method achieves +superior accuracy and calibration, surpassing the self-supervised baseline in a +wide range of experiments on large-scale datasets like ImageNet-O and +ImageNet-C. In these experiments, ProSMIN demonstrates its scalability and real-world +applicability. + +
+
+
+
+
+ + ☆ Enhance Multi-domain Sentiment Analysis of Review Texts through + Prompting Strategies + + +
Large Language Models (LLMs) have made significant strides in both scientific +research and practical applications. Existing studies have demonstrated the +state-of-the-art (SOTA) performance of LLMs in various natural language +processing tasks. However, the question of how to further enhance LLMs' +performance in specific tasks using prompting strategies remains a pivotal +concern. This paper explores the enhancement of LLMs' performance in sentiment +analysis through the application of prompting strategies. We formulate the +process of prompting for sentiment analysis tasks and introduce two novel +strategies tailored for sentiment analysis: RolePlaying (RP) prompting and +Chain-of-thought (CoT) prompting. We also propose the RP-CoT +prompting strategy, a combination of RP prompting and CoT prompting. We +conduct comparative experiments on three distinct domain datasets to evaluate +the effectiveness of the proposed sentiment analysis strategies. The results +demonstrate that the adoption of the proposed prompting strategies leads to a +consistent improvement in sentiment analysis accuracy. Further, the CoT +prompting strategy exhibits a notable impact on implicit sentiment analysis, +with the RP-CoT prompting strategy delivering the best performance +among all strategies. + +
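Illustrative prompt templates for the three strategies; the exact wording used in the paper is not reproduced here, so these strings and the `query_llm` client are placeholders.

```python
def role_playing_prompt(review):
    # RP: assign the model an expert persona before asking for the label
    return ("You are an experienced customer-review analyst. "
            "Classify the sentiment of the following review as positive, negative or neutral.\n"
            f"Review: {review}\nSentiment:")

def cot_prompt(review):
    # CoT: ask for step-by-step reasoning before the final label
    return (f"Review: {review}\n"
            "Let's reason step by step about the opinions expressed, "
            "then answer with a single word: positive, negative or neutral.")

def rp_cot_prompt(review):
    # RP-CoT: combine the persona with step-by-step reasoning
    return role_playing_prompt(review).replace(
        "Sentiment:", "Let's reason step by step before giving the final sentiment.")

# `query_llm` is a placeholder for whatever chat/completions client is in use:
# answer = query_llm(rp_cot_prompt("The battery dies within an hour."))
```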
+
+
+
+
+ + ☆ Diffusion Generative Inverse Design ICML + + +
Inverse design refers to the problem of optimizing the input of an objective +function in order to enact a target outcome. For many real-world engineering +problems, the objective function takes the form of a simulator that predicts +how the system state will evolve over time, and the design challenge is to +optimize the initial conditions that lead to a target outcome. Recent +developments in learned simulation have shown that graph neural networks (GNNs) +can be used for accurate, efficient, differentiable estimation of simulator +dynamics, and support high-quality design optimization with gradient- or +sampling-based optimization procedures. However, optimizing designs from +scratch requires many expensive model queries, and these procedures exhibit +basic failures on either non-convex or high-dimensional problems. In this work, +we show how denoising diffusion models (DDMs) can be used to solve inverse +design problems efficiently and propose a particle sampling algorithm for +further improving their efficiency. We perform experiments on a number of fluid +dynamics design challenges, and find that our approach substantially reduces +the number of calls to the simulator compared to standard techniques. + +
+
+ comment: ICML workshop on Structured Probabilistic Inference & Generative + Modeling +
+
+
+
+
+ + ☆ Data-Juicer: A One-Stop Data Processing System for Large Language Models + + +
+ The immense evolution in Large Language Models (LLMs) has underscored the +importance of massive, diverse, and high-quality data. Despite this, existing +open-source tools for LLM data processing remain limited and mostly tailored to +specific datasets, with an emphasis on the reproducibility of released data +over adaptability and usability, inhibiting potential applications. In +response, we propose a one-stop, powerful yet flexible and user-friendly LLM +data processing system named Data-Juicer. Our system offers over 50 built-in +versatile operators and pluggable tools, which synergize modularity, +composability, and extensibility dedicated to diverse LLM data processing +needs. By incorporating visualized and automatic evaluation capabilities, +Data-Juicer enables a timely feedback loop to accelerate data processing and +gain data insights. To enhance usability, Data-Juicer provides out-of-the-box +components for users with various backgrounds, and fruitful data recipes for +LLM pre-training and post-tuning usages. Further, we employ multi-facet system +optimization and seamlessly integrate Data-Juicer with both LLM and distributed +computing ecosystems, to enable efficient and scalable data processing. +Empirical validation of the generated data recipes reveals considerable +improvements in LLaMA performance for various pre-training and post-tuning +cases, demonstrating up to 7.45% relative improvement of averaged score across +16 LLM benchmarks and 16.25% higher win rate using pair-wise GPT-4 evaluation. +The system's efficiency and scalability are also validated, supported by up to +88.7% reduction in single-machine processing time, 77.1% and 73.1% less memory +and CPU usage respectively, and 7.91x processing acceleration when utilizing +distributed computing ecosystems. Our system, data recipes, and multiple +tutorial demos are released, calling for broader research centered on LLM data. + +
+
+ comment: Under continuous maintenance and updating; The system, refined data + recipes, and demos are at https://github.com/alibaba/data-juicer +
+
+
+
+
+ + ☆ Non-Parametric Representation Learning with Kernels + + +
+ Unsupervised and self-supervised representation learning has become popular +in recent years for learning useful features from unlabelled data. +Representation learning has been mostly developed in the neural network +literature, and other models for representation learning are surprisingly +unexplored. In this work, we introduce and analyze several kernel-based +representation learning approaches: Firstly, we define two kernel +Self-Supervised Learning (SSL) models using contrastive loss functions and +secondly, a Kernel Autoencoder (AE) model based on the idea of embedding and +reconstructing data. We argue that the classical representer theorems for +supervised kernel machines are not always applicable for (self-supervised) +representation learning, and present new representer theorems, which show that +the representations learned by our kernel models can be expressed in terms of +kernel matrices. We further derive generalisation error bounds for +representation learning with kernel SSL and AE, and empirically evaluate the +performance of these methods in both small data regimes as well as in +comparison with neural network based models. + +
+
+
+
+
+ + ☆ Granger Causal Inference in Multivariate Hawkes Processes by Minimum + Message Length + + +
+ Multivariate Hawkes processes (MHPs) are versatile probabilistic tools used +to model various real-life phenomena: earthquakes, operations on stock markets, +neuronal activity, virus propagation and many others. In this paper, we focus +on MHPs with exponential decay kernels and estimate connectivity graphs, which +represent the Granger causal relations between their components. We approach +this inference problem by proposing an optimization criterion and model +selection algorithm based on the minimum message length (MML) principle. MML +compares Granger causal models using the Occam's razor principle in the +following way: even when models have a comparable goodness-of-fit to the +observed data, the one generating the most concise explanation of the data is +preferred. While most of the state-of-art methods using lasso-type penalization +tend to overfitting in scenarios with short time horizons, the proposed +MML-based method achieves high F1 scores in these settings. We conduct a +numerical study comparing the proposed algorithm to other related classical and +state-of-art methods, where we achieve the highest F1 scores in specific sparse +graph settings. We illustrate the proposed method also on G7 sovereign bond +data and obtain causal connections, which are in agreement with the expert +knowledge available in the literature. + +
+
+ comment: 23 pages, 5 figures +
+
+
+
+
+ + ☆ RDGSL: Dynamic Graph Representation Learning with Structure Learning + + +
+ Temporal Graph Networks (TGNs) have shown remarkable performance in learning +representation for continuous-time dynamic graphs. However, real-world dynamic +graphs typically contain diverse and intricate noise. Noise can significantly +degrade the quality of representation generation, impeding the effectiveness of +TGNs in downstream tasks. Though structure learning is widely applied to +mitigate noise in static graphs, its adaptation to dynamic graph settings poses +two significant challenges. i) Noise dynamics. Existing structure learning +methods are ill-equipped to address the temporal aspect of noise, hampering +their effectiveness in such dynamic and ever-changing noise patterns. ii) More +severe noise. Noise may be introduced along with multiple interactions between +two nodes, leading to the re-pollution of these nodes and consequently causing +more severe noise compared to static graphs. In this paper, we present RDGSL, a +representation learning method in continuous-time dynamic graphs. Meanwhile, we +propose dynamic graph structure learning, a novel supervisory signal that +empowers RDGSL with the ability to effectively combat noise in dynamic graphs. +To address the noise dynamics issue, we introduce the Dynamic Graph Filter, +where we innovatively propose a dynamic noise function that dynamically +captures both current and historical noise, enabling us to assess the temporal +aspect of noise and generate a denoised graph. We further propose the Temporal +Embedding Learner to tackle the challenge of more severe noise, which utilizes +an attention mechanism to selectively turn a blind eye to noisy edges and hence +focus on normal edges, enhancing the expressiveness for representation +generation that remains resilient to noise. Our method demonstrates robustness +towards downstream tasks, resulting in up to 5.1% absolute AUC improvement in +evolving classification versus the second-best baseline. + +
+
+
+
+
+ + ☆ Dynamic Early Exiting Predictive Coding Neural Networks + + +
Internet of Things (IoT) sensors are nowadays heavily utilized in various +real-world applications ranging from wearables and smart buildings to +agrotechnology and health monitoring. With the huge amounts of data generated +by these tiny devices, Deep Learning (DL) models have been extensively used to +enhance them with intelligent processing. However, with the push for smaller +and more accurate devices, DL models have become too heavy to deploy. It is thus +necessary to incorporate the hardware's limited resources in the design +process. Therefore, inspired by the human brain, known for its efficiency and +low power consumption, we propose a shallow bidirectional network based on +predictive coding theory and dynamic early exiting for halting further +computations when a performance threshold is surpassed. We achieve comparable +accuracy to VGG-16 in image classification on CIFAR-10 with fewer parameters +and less computational complexity. + +
+
+
+
+
+ + ☆ PROMISE: Preconditioned Stochastic Optimization Methods by Incorporating + Scalable Curvature Estimates + + +
+ This paper introduces PROMISE ($\textbf{Pr}$econditioned Stochastic +$\textbf{O}$ptimization $\textbf{M}$ethods by $\textbf{I}$ncorporating +$\textbf{S}$calable Curvature $\textbf{E}$stimates), a suite of sketching-based +preconditioned stochastic gradient algorithms for solving large-scale convex +optimization problems arising in machine learning. PROMISE includes +preconditioned versions of SVRG, SAGA, and Katyusha; each algorithm comes with +a strong theoretical analysis and effective default hyperparameter values. In +contrast, traditional stochastic gradient methods require careful +hyperparameter tuning to succeed, and degrade in the presence of +ill-conditioning, a ubiquitous phenomenon in machine learning. Empirically, we +verify the superiority of the proposed algorithms by showing that, using +default hyperparameter values, they outperform or match popular tuned +stochastic gradient optimizers on a test bed of $51$ ridge and logistic +regression problems assembled from benchmark machine learning repositories. On +the theoretical side, this paper introduces the notion of quadratic regularity +in order to establish linear convergence of all proposed methods even when the +preconditioner is updated infrequently. The speed of linear convergence is +determined by the quadratic regularity ratio, which often provides a tighter +bound on the convergence rate compared to the condition number, both in theory +and in practice, and explains the fast global linear convergence of the +proposed methods. + +
+
+ comment: 127 pages, 31 Figures +
+
+
+
+
+ + ☆ iLoRE: Dynamic Graph Representation with Instant Long-term Modeling and + Re-occurrence Preservation + + +
Continuous-time dynamic graph modeling is a crucial task for many real-world +applications, such as financial risk management and fraud detection. Though +existing dynamic graph modeling methods have achieved satisfactory results, +they still suffer from three key limitations, hindering their scalability and +further applicability. i) Indiscriminate updating. Existing methods deal with +incoming edges indiscriminately, which may lead to more time +consumption and unexpected noisy information. ii) Ineffective node-wise +long-term modeling. They heavily rely on recurrent neural networks (RNNs) as a +backbone, which has been demonstrated to be incapable of fully capturing +node-wise long-term dependencies in event sequences. iii) Neglect of +re-occurrence patterns. Dynamic graphs involve the repeated occurrence of +neighbors that indicates their importance, which is unfortunately neglected by +existing methods. In this paper, we present iLoRE, a novel dynamic graph +modeling method with instant node-wise Long-term modeling and Re-occurrence +preservation. To overcome the indiscriminate updating issue, we introduce the +Adaptive Short-term Updater module that will automatically discard the useless +or noisy edges, ensuring iLoRE's effectiveness and instant ability. We further +propose the Long-term Updater to realize more effective node-wise long-term +modeling, where we innovatively propose the Identity Attention mechanism to +empower a Transformer-based updater, bypassing the limited effectiveness of +typical RNN-dominated designs. Finally, the crucial re-occurrence patterns are +also encoded into a graph module for informative representation learning, which +will further improve the expressiveness of our method. Our experimental results +on real-world datasets demonstrate the effectiveness of our iLoRE for dynamic +graph modeling. + +
+
+
+
+
+ + ☆ Representation Learning Dynamics of Self-Supervised Models + + +
Self-Supervised Learning (SSL) is an important paradigm for learning +representations from unlabelled data, and SSL with neural networks has been +highly successful in practice. However, current theoretical analysis of SSL is +mostly restricted to generalisation error bounds. In contrast, learning +dynamics often provide a precise characterisation of the behaviour of +neural-network-based models but, so far, are mainly known in supervised settings. In +this paper, we study the learning dynamics of SSL models, specifically +representations obtained by minimising contrastive and non-contrastive losses. +We show that a naive extension of the dynamics of multivariate regression to +SSL leads to learning trivial scalar representations that exhibit +dimension collapse in SSL. Consequently, we formulate SSL objectives with +orthogonality constraints on the weights, and derive the exact (network width +independent) learning dynamics of the SSL models trained using gradient descent +on the Grassmannian manifold. We also argue that the infinite width +approximation of SSL models significantly deviates from the neural tangent +kernel approximations of supervised models. We numerically illustrate the +validity of our theoretical findings, and discuss how the presented results +provide a framework for further theoretical analysis of contrastive and +non-contrastive SSL. + +
+
+
+
+
+ + ☆ Establishing a real-time traffic alarm in the city of Valencia with Deep + Learning + + +
+ Urban traffic emissions represent a significant concern due to their +detrimental impacts on both public health and the environment. Consequently, +decision-makers have flagged their reduction as a crucial goal. In this study, +we first analyze the correlation between traffic flux and pollution in the city +of Valencia, Spain. Our results demonstrate that traffic has a significant +impact on the levels of certain pollutants (especially $\text{NO}_\text{x}$). +Secondly, we develop an alarm system to predict if a street is likely to +experience unusually high traffic in the next 30 minutes, using an independent +three-tier level for each street. To make the predictions, we use traffic data +updated every 10 minutes and Long Short-Term Memory (LSTM) neural networks. We +trained the LSTM using traffic data from 2018, and tested it using traffic data +from 2019. + +
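A minimal Keras sketch of such an alarm model, assuming windows of twelve 10-minute readings per street and a three-tier label observed 30 minutes later; the data below are random placeholders and the architecture is not the paper's exact configuration.

```python
import numpy as np
import tensorflow as tf

# hypothetical shapes: 5000 windows of 12 ten-minute traffic readings,
# label = traffic tier (0/1/2) observed 30 minutes after the window
X = np.random.rand(5000, 12, 1).astype("float32")
y = np.random.randint(0, 3, size=5000)

model = tf.keras.Sequential([
    tf.keras.Input(shape=(12, 1)),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(3, activation="softmax"),  # three alarm tiers
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.fit(X, y, epochs=5, batch_size=64, validation_split=0.1)
```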
+
+ comment: 12 pages, 13 figures +
+
+
+
+
+ + ☆ Aggregating Correlated Estimations with (Almost) no Training + + +
+ Many decision problems cannot be solved exactly and use several estimation +algorithms that assign scores to the different available options. The +estimation errors can have various correlations, from low (e.g. between two +very different approaches) to high (e.g. when using a given algorithm with +different hyperparameters). Most aggregation rules would suffer from this +diversity of correlations. In this article, we propose different aggregation +rules that take correlations into account, and we compare them to naive rules +in various experiments based on synthetic data. Our results show that when +sufficient information is known about the correlations between errors, a +maximum likelihood aggregation should be preferred. Otherwise, typically with +limited training data, we recommend a method that we call Embedded Voting (EV). + +
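When the error covariance between estimators is known, the maximum likelihood (precision-weighted) combination mentioned above has a closed form, sketched below; this illustrates correlation-aware aggregation only and does not reproduce the paper's Embedded Voting rule.

```python
import numpy as np

def ml_aggregate(estimates, error_cov):
    """Minimum-variance (maximum-likelihood under Gaussian errors) combination
    of correlated unbiased estimators: weights proportional to error_cov^{-1} @ 1."""
    ones = np.ones(len(estimates))
    w = np.linalg.solve(error_cov, ones)
    w /= w.sum()
    return w @ np.asarray(estimates), w

# two highly correlated estimators plus one independent, noisier one
cov = np.array([[1.0, 0.9, 0.0],
                [0.9, 1.0, 0.0],
                [0.0, 0.0, 2.0]])
value, weights = ml_aggregate([10.2, 10.4, 9.1], cov)
```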
+
+
+
+
+ + ☆ Analyzing domain shift when using additional data for the MICCAI KiTS23 + Challenge + + +
Using additional training data is known to improve the results, especially +for medical image 3D segmentation where there is a lack of training material +and the model needs to generalize well from the few available data. However, the +new data could have been acquired using other instruments and preprocessed such +that its distribution is significantly different from the original training data. +Therefore, we study techniques which ameliorate domain shift during training so +that the additional data becomes more usable for preprocessing and training +together with the original data. Our results show that transforming the +additional data using histogram matching gives better results than simple +normalization. + +
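Histogram matching of the additional data to a reference case can be done directly with scikit-image; the volumes below are synthetic stand-ins for the KiTS-style CT data.

```python
import numpy as np
from skimage.exposure import match_histograms

# hypothetical 3D volumes: `extra` comes from another scanner, `reference`
# is a case drawn from the original training distribution
reference = np.random.normal(60, 25, size=(64, 128, 128))
extra = np.random.normal(-10, 40, size=(64, 128, 128))

# remap the intensity histogram of the additional volume onto the reference
matched = match_histograms(extra, reference)
```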
+
+ comment: This preprint has not undergone peer review or any post-submission + improvements or corrections. The Version of Record of this contribution is + published in [TODO], and is available online at https://doi.org/[TODO] +
+
+
+
+
+ + ☆ sasdim: self-adaptive noise scaling diffusion model for spatial time + series imputation + + +
Spatial time series imputation is critically important to many real +applications such as intelligent transportation and air quality monitoring. +Although recent transformer and diffusion model based approaches have achieved +significant performance gains compared with conventional statistics-based +methods, spatial time series imputation still remains a challenging issue +due to the complex spatio-temporal dependencies and the noise uncertainty of +the spatial time series data. In particular, recent diffusion-process-based models +may introduce random noise into the imputations and thus negatively impact +model performance. To this end, we propose a self-adaptive noise scaling +diffusion model named SaSDim to more effectively perform spatial time series +imputation. Specifically, we propose a new loss function that can scale the noise +to a similar intensity, and propose an across-spatial-temporal global +convolution module to more effectively capture the dynamic spatial-temporal +dependencies. Extensive experiments conducted on three real world datasets +verify the effectiveness of SaSDim by comparison with current state-of-the-art +baselines. + +
+
+
+
+
+ + ☆ An LSTM-Based Predictive Monitoring Method for Data with Time-varying + Variability + + +
The recurrent neural network and its variants have shown great success in +processing sequences in recent years. However, these deep neural networks have +received little attention in anomaly detection through predictive process +monitoring. Furthermore, traditional statistical models rely on assumptions +and hypothesis tests, while neural network (NN) models require far fewer +assumptions. This flexibility enables NN models to work efficiently on data +with time-varying variability, a common inherent aspect of data in practice. +This paper explores the ability of the recurrent neural network structure to +monitor processes and proposes a control chart based on long short-term memory +(LSTM) prediction intervals for data with time-varying variability. The +simulation studies provide empirical evidence that the proposed model +outperforms other NN-based predictive monitoring methods for mean shift +detection. The proposed method is also applied to time series sensor data, +which confirms that the proposed method is an effective technique for detecting +abnormalities. + +
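A simplified sketch of a prediction-interval control chart: residuals from a calibration window define empirical interval bounds, and observations falling outside them raise alarms. The paper derives intervals from the LSTM itself to track time-varying variability; this version is a plain empirical-quantile stand-in.

```python
import numpy as np

def prediction_interval_chart(y_true, y_pred, residuals_calib, alpha=0.05):
    """Flag observations outside an empirical prediction interval built from
    calibration residuals (simplified illustration, not the paper's method)."""
    lo, hi = np.quantile(residuals_calib, [alpha / 2, 1 - alpha / 2])
    lower, upper = y_pred + lo, y_pred + hi
    alarms = (y_true < lower) | (y_true > upper)
    return alarms, lower, upper

# usage with residuals collected on an in-control calibration window
calib_resid = np.random.normal(0, 1.0, size=500)
alarms, lo, hi = prediction_interval_chart(
    y_true=np.array([0.2, 3.4, -0.1]), y_pred=np.zeros(3), residuals_calib=calib_resid)
```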
+
+ comment: 19 pages, 9 figures, 6 tables +
+
+
+
+
+ + ☆ Linear Regression using Heterogeneous Data Batches + + +
+ In many learning applications, data are collected from multiple sources, each +providing a \emph{batch} of samples that by itself is insufficient to learn its +input-output relationship. A common approach assumes that the sources fall in +one of several unknown subgroups, each with an unknown input distribution and +input-output relationship. We consider one of this setup's most fundamental and +important manifestations where the output is a noisy linear combination of the +inputs, and there are $k$ subgroups, each with its own regression vector. Prior +work~\cite{kong2020meta} showed that with abundant small-batches, the +regression vectors can be learned with only few, $\tilde\Omega( k^{3/2})$, +batches of medium-size with $\tilde\Omega(\sqrt k)$ samples each. However, the +paper requires that the input distribution for all $k$ subgroups be isotropic +Gaussian, and states that removing this assumption is an ``interesting and +challenging problem". We propose a novel gradient-based algorithm that improves +on the existing results in several ways. It extends the applicability of the +algorithm by: (1) allowing the subgroups' underlying input distributions to be +different, unknown, and heavy-tailed; (2) recovering all subgroups followed by +a significant proportion of batches even for infinite $k$; (3) removing the +separation requirement between the regression vectors; (4) reducing the number +of batches and allowing smaller batch sizes. + +
+
+
+
+
+ + ☆ AdaPlus: Integrating Nesterov Momentum and Precise Stepsize Adjustment + on AdamW Basis + + +
+ This paper proposes an efficient optimizer called AdaPlus, which integrates Nesterov momentum
+and precise stepsize adjustment on an AdamW basis. AdaPlus combines the advantages of AdamW, Nadam,
+and AdaBelief and, in particular, does not introduce any extra hyper-parameters. We perform
+extensive experimental evaluations on three machine learning tasks to validate the effectiveness of
+AdaPlus. The results show that AdaPlus (i) is the adaptive method that performs most comparably
+with (and even slightly better than) SGD with momentum on image classification tasks, and (ii)
+outperforms other state-of-the-art optimizers on language modeling tasks and exhibits the highest
+stability when training GANs. The experiment code of AdaPlus is available at:
+https://github.com/guanleics/AdaPlus.
+
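+ Since the abstract does not spell out the update rule, the following is only a plausible sketch
+of an AdamW-style step that adds Nadam-style Nesterov momentum and an AdaBelief-style second
+moment; it should not be read as the authors' AdaPlus update.
+
+import numpy as np
+
+def nesterov_belief_adamw_step(w, g, m, s, t, lr=1e-3, b1=0.9, b2=0.999,
+                               eps=1e-8, wd=1e-2):
+    # First moment and AdaBelief-style second moment (deviation of g from momentum).
+    m = b1 * m + (1 - b1) * g
+    s = b2 * s + (1 - b2) * (g - m) ** 2
+    # Nadam-style Nesterov look-ahead with bias correction (t is the step count).
+    m_hat = (b1 * m + (1 - b1) * g) / (1 - b1 ** t)
+    s_hat = s / (1 - b2 ** t)
+    # Decoupled (AdamW-style) weight decay.
+    w = w - lr * (m_hat / (np.sqrt(s_hat) + eps) + wd * w)
+    return w, m, s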
+
+
+
+
+ + ☆ RADIO: Reference-Agnostic Dubbing Video Synthesis + + +
+ One of the most challenging problems in audio-driven talking head generation is achieving
+high-fidelity detail while ensuring precise synchronization. Given only a single reference image,
+extracting meaningful identity attributes becomes even more challenging, often causing the network
+to mirror the facial and lip structures too closely. To address these issues, we introduce RADIO, a
+framework engineered to yield high-quality dubbed videos regardless of the pose or expression in
+reference images. The key is to modulate the decoder layers using a latent space composed of audio
+and reference features. Additionally, we incorporate ViT blocks into the decoder to emphasize
+high-fidelity details, especially in the lip region. Our experimental results demonstrate that
+RADIO displays high synchronization without the loss of fidelity. Especially in harsh scenarios
+where the reference frame deviates significantly from the ground truth, our method outperforms
+state-of-the-art methods, highlighting its robustness. The pre-trained model and code will be made
+public after review.
+
+
+ comment: Under review +
+
+
+
+
+ + ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific hardware before they
+can be deployed on devices. This can be done by tuning the model's hyperparameters or exploring
+variations in its architecture. Re-training and re-validating models after making these changes can
+be a resource-intensive task. This paper presents TODM (Train Once Deploy Many), a new approach to
+efficiently train many sizes of hardware-friendly on-device ASR models with GPU-hours comparable to
+those of a single training job. TODM leverages insights from prior work on Supernet, where
+Recurrent Neural Network Transducer (RNN-T) models share weights within a Supernet. It reduces
+layer sizes and widths of the Supernet to obtain subnetworks, making them smaller models suitable
+for all hardware types. We introduce a novel combination of three techniques to improve the
+outcomes of the TODM Supernet: adaptive dropouts, an in-place Alpha-divergence knowledge
+distillation, and the use of the ScaledAdam optimizer. We validate our approach by comparing
+Supernet-trained versus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using
+LibriSpeech. Results demonstrate that our TODM Supernet either matches or surpasses the performance
+of manually tuned models, with up to a 3% relative improvement in word error rate (WER), while
+efficiently keeping the cost of training many models at a small constant.
+
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ OHQ: On-chip Hardware-aware Quantization + + +
+ Quantization emerges as one of the most promising approaches for deploying advanced deep models
+on resource-constrained hardware. Mixed-precision quantization leverages multiple bit-width
+architectures to unleash the accuracy and efficiency potential of quantized models. However,
+existing mixed-precision quantization suffers from an exhaustive search space that causes immense
+computational overhead. The quantization process thus relies on separate high-performance devices
+rather than running locally, which also leads to a significant gap between the considered hardware
+metrics and the real deployment. In this paper, we propose an On-chip Hardware-aware Quantization
+(OHQ) framework that performs hardware-aware mixed-precision quantization without accessing online
+devices. First, we construct the On-chip Quantization Awareness (OQA) pipeline, which perceives the
+actual efficiency metrics of the quantization operator on the hardware. Second, we propose a
+Mask-guided Quantization Estimation (MQE) technique to efficiently estimate the accuracy metrics of
+operators under the constraints of on-chip-level computing power. By synthesizing network and
+hardware insights through linear programming, we obtain optimized bit-width configurations.
+Notably, the quantization process occurs entirely on-chip without any additional computing devices
+or data access. We demonstrate accelerated inference after quantization for various architectures
+and compression ratios, achieving 70% and 73% accuracy for ResNet-18 and MobileNetV3, respectively.
+OHQ improves latency by 15~30% compared to INT8 on deployment.
+
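+ The linear-programming step can be pictured with a toy relaxation (this is not OHQ's actual
+formulation; the per-layer sensitivity scores, latency costs, and budget below are hypothetical
+inputs):
+
+import numpy as np
+from scipy.optimize import linprog
+
+def allocate_bitwidths(sensitivity, latency_per_bit, latency_budget, lo=2, hi=8):
+    # Maximize sensitivity-weighted precision subject to a latency budget,
+    # then round the relaxed solution to integer bit-widths per layer.
+    n = len(sensitivity)
+    c = -np.asarray(sensitivity, dtype=float)          # linprog minimizes
+    A_ub = np.asarray(latency_per_bit, dtype=float).reshape(1, n)
+    res = linprog(c, A_ub=A_ub, b_ub=[latency_budget],
+                  bounds=[(lo, hi)] * n, method="highs")
+    return np.round(res.x).astype(int)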
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Dynamic Brain Transformer with Multi-level Attention for Functional + Brain Network Analysis + + +
+ Recent neuroimaging studies have highlighted the importance of network-centric brain analysis,
+particularly with functional magnetic resonance imaging. The emergence of Deep Neural Networks has
+fostered a substantial interest in predicting clinical outcomes and categorizing individuals based
+on brain networks. However, the conventional approach involving static brain network analysis
+offers limited potential in capturing the dynamism of brain function. Although recent studies have
+attempted to harness dynamic brain networks, their high dimensionality and complexity present
+substantial challenges. This paper proposes a novel methodology, Dynamic bRAin Transformer (DRAT),
+which combines static and dynamic brain networks for more effective and nuanced brain function
+analysis. Our model uses the static brain network as a baseline, integrating dynamic brain networks
+to enhance performance against traditional methods. We innovatively employ attention mechanisms,
+enhancing model explainability and exploiting the dynamic brain network's temporal variations. The
+proposed approach offers a robust solution to the low signal-to-noise ratio of
+blood-oxygen-level-dependent signals, a recurring issue in direct DNN modeling. It also provides
+valuable insights into which brain circuits or dynamic networks contribute more to final
+predictions. As such, DRAT shows a promising direction in neuroimaging studies, contributing to the
+comprehensive understanding of brain organization and the role of neural circuits.
+
+
+ comment: Accepted to IEEE BHI 2023 +
+
+
+
+
+ + ☆ Provably safe systems: the only path to controllable AGI + + +
+ We describe a path to humanity safely thriving with powerful Artificial +General Intelligences (AGIs) by building them to provably satisfy +human-specified requirements. We argue that this will soon be technically +feasible using advanced AI for formal verification and mechanistic +interpretability. We further argue that it is the only path which guarantees +safe controlled AGI. We end with a list of challenge problems whose solution +would contribute to this positive outcome and invite readers to join in this +work. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Regret Analysis of Policy Gradient Algorithm for Infinite Horizon + Average Reward Markov Decision Processes + + +
+ In this paper, we consider an infinite horizon average reward Markov Decision +Process (MDP). Distinguishing itself from existing works within this context, +our approach harnesses the power of the general policy gradient-based +algorithm, liberating it from the constraints of assuming a linear MDP +structure. We propose a policy gradient-based algorithm and show its global +convergence property. We then prove that the proposed algorithm has +$\tilde{\mathcal{O}}({T}^{3/4})$ regret. Remarkably, this paper marks a +pioneering effort by presenting the first exploration into regret-bound +computation for the general parameterized policy gradient algorithm in the +context of average reward scenarios. + +
+
+
+
+
+ + ☆ RoboAgent: Generalization and Efficiency in Robot Manipulation via + Semantic Augmentations and Action Chunking + + +
+ The grand aim of having a single robot that can manipulate arbitrary objects in diverse settings
+is at odds with the paucity of robotics datasets. Acquiring and growing such datasets is strenuous
+due to manual efforts, operational costs, and safety challenges. A path toward such a universal
+agent would require a structured framework capable of wide generalization but trained within a
+reasonable data budget. In this paper, we develop an efficient system (RoboAgent) for training
+universal agents capable of multi-task manipulation skills using (a) semantic augmentations that
+can rapidly multiply existing datasets and (b) action representations that can extract performant
+policies with small yet diverse multi-modal datasets without overfitting. In addition, reliable
+task conditioning and an expressive policy architecture enable our agent to exhibit a diverse
+repertoire of skills in novel situations specified using language commands. Using merely 7500
+demonstrations, we are able to train a single agent capable of 12 unique skills, and demonstrate
+its generalization over 38 tasks spread across common daily activities in diverse kitchen scenes.
+On average, RoboAgent outperforms prior methods by over 40% in unseen situations while being more
+sample efficient and being amenable to capability improvements and extensions through fine-tuning.
+Videos at https://robopen.github.io/
+
+
+
+
+
+ + ☆ A Survey on Physics Informed Reinforcement Learning: Review and Open + Problems + + +
+ The inclusion of physical information in machine learning frameworks has revolutionized many
+application areas. This involves enhancing the learning process by incorporating physical
+constraints and adhering to physical laws. In this work we explore their utility for reinforcement
+learning applications. We present a thorough review of the literature on incorporating physics
+information, also known as physics priors, in reinforcement learning approaches, commonly referred
+to as physics-informed reinforcement learning (PIRL). We introduce a novel taxonomy with the
+reinforcement learning pipeline as the backbone to classify existing works, compare and contrast
+them, and derive crucial insights. Existing works are analyzed with regard to the
+representation/form of the governing physics modeled for integration, their specific contribution
+to the typical reinforcement learning architecture, and their connection to the underlying
+reinforcement learning pipeline stages. We also identify core learning architectures and physics
+incorporation biases (i.e., observational, inductive and learning) of existing PIRL approaches and
+use them to further categorize the works for better understanding and adaptation. By providing a
+comprehensive perspective on the implementation of the physics-informed capability, the taxonomy
+presents a cohesive approach to PIRL. It identifies the areas where this approach has been applied,
+as well as the gaps and opportunities that exist. Additionally, the taxonomy sheds light on
+unresolved issues and challenges, which can guide future research. This nascent field holds great
+potential for enhancing reinforcement learning algorithms by increasing their physical
+plausibility, precision, data efficiency, and applicability in real-world scenarios.
+
+
+
+
+
+ + ☆ Inferring Actual Treatment Pathways from Patient Records + + +
+ Treatment pathways are step-by-step plans outlining the recommended medical +care for specific diseases; they get revised when different treatments are +found to improve patient outcomes. Examining health records is an important +part of this revision process, but inferring patients' actual treatments from +health data is challenging due to complex event-coding schemes and the absence +of pathway-related annotations. This study aims to infer the actual treatment +steps for a particular patient group from administrative health records (AHR) - +a common form of tabular healthcare data - and address several technique- and +methodology-based gaps in treatment pathway-inference research. We introduce +Defrag, a method for examining AHRs to infer the real-world treatment steps for +a particular patient group. Defrag learns the semantic and temporal meaning of +healthcare event sequences, allowing it to reliably infer treatment steps from +complex healthcare data. To our knowledge, Defrag is the first +pathway-inference method to utilise a neural network (NN), an approach made +possible by a novel, self-supervised learning objective. We also developed a +testing and validation framework for pathway inference, which we use to +characterise and evaluate Defrag's pathway inference ability and compare +against baselines. We demonstrate Defrag's effectiveness by identifying +best-practice pathway fragments for breast cancer, lung cancer, and melanoma in +public healthcare records. Additionally, we use synthetic data experiments to +demonstrate the characteristics of the Defrag method, and to compare Defrag to +several baselines where it significantly outperforms non-NN-based methods. +Defrag significantly outperforms several existing pathway-inference methods and +offers an innovative and effective approach for inferring treatment pathways +from AHRs. Open-source code is provided to encourage further research in this +area. + +
+
+
+
+
+ + ☆ Extended Symmetry Preserving Attention Networks for LHC Analysis + + +
+ Reconstructing unstable heavy particles requires sophisticated techniques to +sift through the large number of possible permutations for assignment of +detector objects to partons. An approach based on a generalized attention +mechanism, symmetry preserving attention networks (SPANet), has been previously +applied to top quark pair decays at the Large Hadron Collider, which produce +six hadronic jets. Here we extend the SPANet architecture to consider multiple +input streams, such as leptons, as well as global event features, such as the +missing transverse momentum. In addition, we provide regression and +classification outputs to supplement the parton assignment. We explore the +performance of the extended capability of SPANet in the context of +semi-leptonic decays of top quark pairs as well as top quark pairs produced in +association with a Higgs boson. We find significant improvements in the power +of three representative studies: search for ttH, measurement of the top quark +mass and a search for a heavy Z' decaying to top quark pairs. We present +ablation studies to provide insight on what the network has learned in each +case. + +
+
+
+
+
+ + ☆ QuantEase: Optimization-based Quantization for Language Models -- An + Efficient and Intuitive Algorithm + + +
+ With the rising popularity of Large Language Models (LLMs), there has been an +increasing interest in compression techniques that enable their efficient +deployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs. +Drawing from recent advances, our work introduces QuantEase, a layer-wise +quantization framework where individual layers undergo separate quantization. +The problem is framed as a discrete-structured non-convex optimization, +prompting the development of algorithms rooted in Coordinate Descent (CD) +techniques. These CD-based methods provide high-quality solutions to the +complex non-convex layer-wise quantization problems. Notably, our CD-based +approach features straightforward updates, relying solely on matrix and vector +operations, circumventing the need for matrix inversion or decomposition. We +also explore an outlier-aware variant of our approach, allowing for retaining +significant weights (outliers) with complete precision. Our proposal attains +state-of-the-art performance in terms of perplexity and zero-shot accuracy in +empirical evaluations across various LLMs and datasets, with relative +improvements up to 15% over methods such as GPTQ. Particularly noteworthy is +our outlier-aware algorithm's capability to achieve near or sub-3-bit +quantization of LLMs with an acceptable drop in accuracy, obviating the need +for non-uniform quantization or grouping techniques, improving upon methods +such as SpQR by up to two times in terms of perplexity. + +
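+ A toy sketch of the coordinate-descent idea for layer-wise quantization, heavily simplified (one
+weight vector, a fixed grid, round-to-nearest initialization, no outlier handling); it is not the
+QuantEase algorithm itself:
+
+import numpy as np
+
+def cd_quantize(X, w, grid, n_sweeps=5):
+    # Minimize ||X w - X q||^2 over q with entries restricted to the given grid,
+    # updating one coordinate at a time while all others stay fixed.
+    q = grid[np.argmin(np.abs(w[:, None] - grid[None, :]), axis=1)]  # RTN init
+    r = X @ (w - q)                          # current residual X w - X q
+    for _ in range(n_sweeps):
+        for j in range(len(w)):
+            r_j = r + X[:, j] * q[j]         # residual with coordinate j removed
+            errs = [np.sum((r_j - X[:, j] * g) ** 2) for g in grid]
+            q[j] = grid[int(np.argmin(errs))]
+            r = r_j - X[:, j] * q[j]
+    return q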
+
+
+
+
+ + ☆ Task Generalization with Stability Guarantees via Elastic Dynamical + System Motion Policies + + +
+ Dynamical System (DS) based Learning from Demonstration (LfD) allows learning of reactive motion
+policies with stability and convergence guarantees from a few trajectories. Yet, current DS
+learning techniques lack the flexibility to generalize to new task instances as they ignore
+explicit task parameters that inherently change the underlying trajectories. In this work, we
+propose Elastic-DS, a novel DS learning and generalization approach that embeds task parameters
+into the Gaussian Mixture Model (GMM) based Linear Parameter Varying (LPV) DS formulation. Central
+to our approach is the Elastic-GMM, a GMM constrained to SE(3) task-relevant frames. Given a new
+task instance/context, the Elastic-GMM is transformed with Laplacian Editing and used to
+re-estimate the LPV-DS policy. Elastic-DS is compositional in nature and can be used to construct
+flexible multi-step tasks. We showcase its strength on a myriad of simulated and real-robot
+experiments while preserving desirable control-theoretic guarantees. Supplementary videos can be
+found at https://sites.google.com/view/elastic-ds
+
+
+ comment: Accepted to CoRL 2023 +
+
+
+
+
+ + ☆ Gradient Domain Diffusion Models for Image Synthesis + + +
+ Diffusion models are getting popular in generative image and video synthesis. +However, due to the diffusion process, they require a large number of steps to +converge. To tackle this issue, in this paper, we propose to perform the +diffusion process in the gradient domain, where the convergence becomes faster. +There are two reasons. First, thanks to the Poisson equation, the gradient +domain is mathematically equivalent to the original image domain. Therefore, +each diffusion step in the image domain has a unique corresponding gradient +domain representation. Second, the gradient domain is much sparser than the +image domain. As a result, gradient domain diffusion models converge faster. +Several numerical experiments confirm that the gradient domain diffusion models +are more efficient than the original diffusion models. The proposed method can +be applied in a wide range of applications such as image processing, computer +vision and machine learning tasks. + +
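+ The Poisson-equation equivalence mentioned above can be illustrated on its own, independently of
+any diffusion model: with periodic boundaries, an image is recovered (up to an additive constant)
+from its forward differences by one FFT-based Poisson solve. This is a standard construction, not
+the paper's method.
+
+import numpy as np
+
+def to_gradient_domain(img):
+    # Periodic forward differences along x and y.
+    gx = np.roll(img, -1, axis=1) - img
+    gy = np.roll(img, -1, axis=0) - img
+    return gx, gy
+
+def poisson_reconstruct(gx, gy):
+    # Divergence of the gradient field, then solve Laplacian(u) = div in Fourier space.
+    div = (gx - np.roll(gx, 1, axis=1)) + (gy - np.roll(gy, 1, axis=0))
+    h, w = div.shape
+    fy = np.fft.fftfreq(h).reshape(-1, 1)
+    fx = np.fft.fftfreq(w).reshape(1, -1)
+    denom = (2 * np.cos(2 * np.pi * fx) - 2) + (2 * np.cos(2 * np.pi * fy) - 2)
+    denom[0, 0] = 1.0                        # DC component is undetermined
+    u = np.real(np.fft.ifft2(np.fft.fft2(div) / denom))
+    return u - u.mean()                      # fix the free constant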
+
+
+
+
+ + ☆ Efficient Query-Based Attack against ML-Based Android Malware Detection + under Zero Knowledge Setting + + +
+ The widespread adoption of the Android operating system has made malicious +Android applications an appealing target for attackers. Machine learning-based +(ML-based) Android malware detection (AMD) methods are crucial in addressing +this problem; however, their vulnerability to adversarial examples raises +concerns. Current attacks against ML-based AMD methods demonstrate remarkable +performance but rely on strong assumptions that may not be realistic in +real-world scenarios, e.g., the knowledge requirements about feature space, +model parameters, and training dataset. To address this limitation, we +introduce AdvDroidZero, an efficient query-based attack framework against +ML-based AMD methods that operates under the zero knowledge setting. Our +extensive evaluation shows that AdvDroidZero is effective against various +mainstream ML-based AMD methods, in particular, state-of-the-art such methods +and real-world antivirus solutions. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November, 2023 +
+
+
+
+
+ + ☆ Superclustering by finding statistically significant separable groups of + optimal gaussian clusters + + +
+ The paper presents an algorithm for clustering a dataset by grouping an optimal (according to
+the BIC criterion) number of Gaussian clusters into an optimal (according to their statistical
+separability) set of superclusters.
+ The algorithm consists of three stages: representing the dataset as a mixture of Gaussian
+distributions (clusters), whose number is determined by minimizing the BIC criterion; using the
+Mahalanobis distance to estimate the distances between the clusters and the cluster sizes; and
+combining the resulting clusters into superclusters with the DBSCAN method by finding the
+hyperparameter (maximum distance) that provides the maximum value of the introduced matrix quality
+criterion at the maximum number of superclusters. The matrix quality criterion corresponds to the
+proportion of statistically significantly separated superclusters among all found superclusters.
+ The algorithm has only one hyperparameter, the statistical significance level, and automatically
+detects the optimal number and shape of superclusters based on a statistical hypothesis testing
+approach. The algorithm demonstrates good results on test datasets in both noisy and noiseless
+situations. An essential advantage of the algorithm is its ability to predict the correct
+supercluster for new data based on an already trained clusterer and to perform soft (fuzzy)
+clustering. The disadvantages of the algorithm are its low speed and the stochastic nature of the
+final clustering. It requires a sufficiently large dataset for clustering, which is typical for
+many statistical methods.
+
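+ A simplified sketch of the described pipeline using scikit-learn (BIC model selection plus
+DBSCAN grouping only; the paper's Mahalanobis-distance estimates and statistical-significance
+search for the DBSCAN radius are not reproduced, and eps below is a hypothetical fixed value):
+
+import numpy as np
+from sklearn.mixture import GaussianMixture
+from sklearn.cluster import DBSCAN
+
+def supercluster(X, max_k=10, eps=2.0):
+    # Pick the number of Gaussian clusters by BIC, then group their means with DBSCAN.
+    models = [GaussianMixture(n_components=k, random_state=0).fit(X)
+              for k in range(1, max_k + 1)]
+    best = min(models, key=lambda m: m.bic(X))
+    super_of_cluster = DBSCAN(eps=eps, min_samples=1).fit_predict(best.means_)
+    return super_of_cluster[best.predict(X)]   # supercluster label for every point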
+
+ comment: 32 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ Compressing Vision Transformers for Low-Resource Visual Learning + + +
+ Vision transformer (ViT) and its variants have swept through visual learning +leaderboards and offer state-of-the-art accuracy in tasks such as image +classification, object detection, and semantic segmentation by attending to +different parts of the visual input and capturing long-range spatial +dependencies. However, these models are large and computation-heavy. For +instance, the recently proposed ViT-B model has 86M parameters making it +impractical for deployment on resource-constrained devices. As a result, their +deployment on mobile and edge scenarios is limited. In our work, we aim to take +a step toward bringing vision transformers to the edge by utilizing popular +model compression techniques such as distillation, pruning, and quantization. + Our chosen application environment is an unmanned aerial vehicle (UAV) that +is battery-powered and memory-constrained, carrying a single-board computer on +the scale of an NVIDIA Jetson Nano with 4GB of RAM. On the other hand, the UAV +requires high accuracy close to that of state-of-the-art ViTs to ensure safe +object avoidance in autonomous navigation, or correct localization of humans in +search-and-rescue. Inference latency should also be minimized given the +application requirements. Hence, our target is to enable rapid inference of a +vision transformer on an NVIDIA Jetson Nano (4GB) with minimal accuracy loss. +This allows us to deploy ViTs on resource-constrained devices, opening up new +possibilities in surveillance, environmental monitoring, etc. Our +implementation is made available at https://github.com/chensy7/efficient-vit. + +
+
+
+
+
+ + ☆ Generative AI-aided Joint Training-free Secure Semantic Communications + via Multi-modal Prompts + + +
+ Semantic communication (SemCom) holds promise for reducing network resource consumption while
+achieving the communications goal. However, the computational overheads in jointly training
+semantic encoders and decoders, and in their subsequent deployment on network devices, are often
+overlooked. Recent advances in generative artificial intelligence (GAI) offer a potential solution.
+The robust learning abilities of GAI models indicate that semantic decoders can reconstruct source
+messages using a limited amount of semantic information, e.g., prompts, without joint training with
+the semantic encoder. A notable challenge, however, is the instability introduced by GAI's diverse
+generation ability. This instability, evident in outputs like text-generated images, limits the
+direct application of GAI in scenarios demanding accurate message recovery, such as face image
+transmission. To solve the above problems, this paper proposes a GAI-aided SemCom system with
+multi-modal prompts for accurate content decoding. Moreover, in response to security concerns, we
+introduce the application of covert communications aided by a friendly jammer. The system jointly
+optimizes the diffusion step, jamming, and transmitting power with the aid of the generative
+diffusion models, enabling successful and secure transmission of the source messages.
+
+
+
+
+
+ + ☆ Generative Algorithms for Fusion of Physics-Based Wildfire Spread Models + with Satellite Data for Initializing Wildfire Forecasts + + +
+ Increases in wildfire activity and the resulting impacts have prompted the +development of high-resolution wildfire behavior models for forecasting fire +spread. Recent progress in using satellites to detect fire locations further +provides the opportunity to use measurements to improve fire spread forecasts +from numerical models through data assimilation. This work develops a method +for inferring the history of a wildfire from satellite measurements, providing +the necessary information to initialize coupled atmosphere-wildfire models from +a measured wildfire state in a physics-informed approach. The fire arrival +time, which is the time the fire reaches a given spatial location, acts as a +succinct representation of the history of a wildfire. In this work, a +conditional Wasserstein Generative Adversarial Network (cWGAN), trained with +WRF-SFIRE simulations, is used to infer the fire arrival time from satellite +active fire data. The cWGAN is used to produce samples of likely fire arrival +times from the conditional distribution of arrival times given satellite active +fire detections. Samples produced by the cWGAN are further used to assess the +uncertainty of predictions. The cWGAN is tested on four California wildfires +occurring between 2020 and 2022, and predictions for fire extent are compared +against high resolution airborne infrared measurements. Further, the predicted +ignition times are compared with reported ignition times. An average Sorensen's +coefficient of 0.81 for the fire perimeters and an average ignition time error +of 32 minutes suggest that the method is highly accurate. + +
+
+
+
+
+ + ☆ Utilizing Generative Adversarial Networks for Stable Structure + Generation in Angry Birds AAAI + + +
+ This paper investigates the suitability of using Generative Adversarial +Networks (GANs) to generate stable structures for the physics-based puzzle game +Angry Birds. While previous applications of GANs for level generation have been +mostly limited to tile-based representations, this paper explores their +suitability for creating stable structures made from multiple smaller blocks. +This includes a detailed encoding/decoding process for converting between Angry +Birds level descriptions and a suitable grid-based representation, as well as +utilizing state-of-the-art GAN architectures and training methods to produce +new structure designs. Our results show that GANs can be successfully applied +to generate a varied range of complex and stable Angry Birds structures. + +
+
+ comment: 11 pages, 10 figures, 2 tables, Accepted at the 19th AAAI Conference + on Artificial Intelligence and Interactive Digital Entertainment (AIIDE 23) +
+
+
+
+
+ + ☆ T-SaS: Toward Shift-aware Dynamic Adaptation for Streaming Data CIKM 2023 + + +
+ In many real-world scenarios, distribution shifts exist in the streaming data across time steps.
+Many complex sequential data can be effectively divided into distinct regimes that exhibit
+persistent dynamics. Discovering the shifted behaviors and the evolving patterns underlying the
+streaming data is important for understanding the dynamic system. Existing methods typically train
+one robust model to work for the evolving data of distinct distributions or sequentially adapt the
+model utilizing explicitly given regime boundaries. However, there are two challenges: (1) shifts
+in data streams can happen drastically and abruptly without precursors, and boundaries of
+distribution shifts are usually unavailable; and (2) training a shared model for all domains could
+fail to capture varying patterns. This paper aims to solve the problem of sequential data modeling
+in the presence of sudden distribution shifts that occur without any precursors. Specifically, we
+design a Bayesian framework, dubbed T-SaS, with a discrete distribution-modeling variable to
+capture abrupt shifts of data. Then, we design a model that enables adaptation with dynamic network
+selection conditioned on that discrete variable. The proposed method learns specific model
+parameters for each distribution by learning which neurons should be activated in the full network.
+A dynamic masking strategy is adopted here to support inter-distribution transfer through the
+overlapping of a set of sparse networks. Extensive experiments show that our proposed method is
+superior both in accurately detecting shift boundaries to obtain segments of varying distributions
+and in effectively adapting to downstream forecast or classification tasks.
+
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ Distributed Variational Inference for Online Supervised Learning + + +
+ Developing efficient solutions for inference problems in intelligent sensor +networks is crucial for the next generation of location, tracking, and mapping +services. This paper develops a scalable distributed probabilistic inference +algorithm that applies to continuous variables, intractable posteriors and +large-scale real-time data in sensor networks. In a centralized setting, +variational inference is a fundamental technique for performing approximate +Bayesian estimation, in which an intractable posterior density is approximated +with a parametric density. Our key contribution lies in the derivation of a +separable lower bound on the centralized estimation objective, which enables +distributed variational inference with one-hop communication in a sensor +network. Our distributed evidence lower bound (DELBO) consists of a weighted +sum of observation likelihood and divergence to prior densities, and its gap to +the measurement evidence is due to consensus and modeling errors. To solve +binary classification and regression problems while handling streaming data, we +design an online distributed algorithm that maximizes DELBO, and specialize it +to Gaussian variational densities with non-linear likelihoods. The resulting +distributed Gaussian variational inference (DGVI) efficiently inverts a +$1$-rank correction to the covariance matrix. Finally, we derive a diagonalized +version for online distributed inference in high-dimensional models, and apply +it to multi-robot probabilistic mapping using indoor LiDAR data. + +
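+ The rank-one correction step can be illustrated with the standard Sherman-Morrison identity,
+which is what makes such an update cheap; the particular vector u and scalar c appearing in the
+DGVI update are not specified here and are placeholders.
+
+import numpy as np
+
+def inverse_of_rank_one_corrected_cov(Sigma_inv, u, c):
+    # Returns (Sigma + c * u u^T)^{-1} from Sigma^{-1} in O(d^2) via Sherman-Morrison.
+    Siu = Sigma_inv @ u
+    return Sigma_inv - (c * np.outer(Siu, Siu)) / (1.0 + c * u @ Siu)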
+
+
+
+
+ + ☆ Screening of Pneumonia and Urinary Tract Infection at Triage using + TriNet + + +
+ Due to steady population growth and increasing longevity, emergency department visits are
+increasing across North America. As more patients visit the emergency department, traditional
+clinical workflows become overloaded and inefficient, leading to prolonged wait-times and reduced
+healthcare quality. One such workflow is the triage medical directive, which is impeded by limited
+human resources, inaccurate diagnoses, and invasive over-testing. To address this issue, we propose
+TriNet: a machine learning model for medical directives that automates first-line screening at
+triage for conditions requiring downstream testing for diagnosis confirmation. To verify screening
+potential, TriNet was trained on hospital triage data and achieved high positive predictive values
+in detecting pneumonia (0.86) and urinary tract infection (0.93). These models outperform current
+clinical benchmarks, indicating that machine-learning medical directives can offer cost-free,
+non-invasive screening with high specificity for common conditions, reducing the risk of
+over-testing while increasing emergency department efficiency.
+
+
+ comment: Index Terms: Downstream testing, Machine Learning, Medical + directives, Modelling, Modular network, Pneumonia, Positive predictive value, + Screening, Triage, Urinary tract infection +
+
+
+
+
+ + ☆ Self-Supervised Pretraining Improves Performance and Inference + Efficiency in Multiple Lung Ultrasound Interpretation Tasks + + +
+ In this study, we investigated whether self-supervised pretraining could produce a neural
+network feature extractor applicable to multiple classification tasks in B-mode lung ultrasound
+analysis. When fine-tuning on three lung ultrasound tasks, pretrained models resulted in an
+improvement of the average across-task area under the receiver operating characteristic curve (AUC)
+by 0.032 and 0.061 on local and external test sets, respectively. Compact nonlinear classifiers
+trained on features output by a single pretrained model did not improve performance across all
+tasks; however, they did reduce inference time by 49% compared to serial execution of separate
+fine-tuned models. When training using 1% of the available labels, pretrained models consistently
+outperformed fully supervised models, with a maximum observed test AUC increase of 0.396 for the
+task of view classification. Overall, the results indicate that self-supervised pretraining is
+useful for producing initial weights for lung ultrasound classifiers.
+
+
+ comment: 10 pages, 5 figures, submitted to IEEE Access +
+
+
+
+
+ + ☆ Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction + Tuning + + +
+ We present CM3Leon (pronounced "Chameleon"), a retrieval-augmented, +token-based, decoder-only multi-modal language model capable of generating and +infilling both text and images. CM3Leon uses the CM3 multi-modal architecture +but additionally shows the extreme benefits of scaling up and tuning on more +diverse instruction-style data. It is the first multi-modal model trained with +a recipe adapted from text-only language models, including a large-scale +retrieval-augmented pre-training stage and a second multi-task supervised +fine-tuning (SFT) stage. It is also a general-purpose model that can do both +text-to-image and image-to-text generation, allowing us to introduce +self-contained contrastive decoding methods that produce high-quality outputs. +Extensive experiments demonstrate that this recipe is highly effective for +multi-modal models. CM3Leon achieves state-of-the-art performance in +text-to-image generation with 5x less training compute than comparable methods +(zero-shot MS-COCO FID of 4.88). After SFT, CM3Leon can also demonstrate +unprecedented levels of controllability in tasks ranging from language-guided +image editing to image-controlled generation and segmentation. + +
+
+
+
+
+ + ☆ Representation Learning for Sequential Volumetric Design Tasks + + +
+ Volumetric design, also called massing design, is the first and critical step +in professional building design which is sequential in nature. As the +volumetric design process is complex, the underlying sequential design process +encodes valuable information for designers. Many efforts have been made to +automatically generate reasonable volumetric designs, but the quality of the +generated design solutions varies, and evaluating a design solution requires +either a prohibitively comprehensive set of metrics or expensive human +expertise. While previous approaches focused on learning only the final design +instead of sequential design tasks, we propose to encode the design knowledge +from a collection of expert or high-performing design sequences and extract +useful representations using transformer-based models. Later we propose to +utilize the learned representations for crucial downstream applications such as +design preference evaluation and procedural design generation. We develop the +preference model by estimating the density of the learned representations +whereas we train an autoregressive transformer model for sequential design +generation. We demonstrate our ideas by leveraging a novel dataset of thousands +of sequential volumetric designs. Our preference model can compare two +arbitrarily given design sequences and is almost 90% accurate in evaluation +against random design sequences. Our autoregressive model is also capable of +autocompleting a volumetric design sequence from a partial design sequence. + +
+
+
+
+
+ + ☆ Unveiling Intractable Epileptogenic Brain Networks with Deep Learning + Algorithms: A Novel and Comprehensive Framework for Scalable Seizure + Prediction with Unimodal Neuroimaging Data in Pediatric Patients + + +
+ Epilepsy is a prevalent neurological disorder affecting 50 million individuals worldwide and 1.2
+million Americans. There exist millions of pediatric patients with intractable epilepsy, a
+condition in which seizures fail to come under control. The occurrence of seizures can result in
+physical injury, disorientation, unconsciousness, and additional symptoms that could impede
+children's ability to participate in everyday tasks. Predicting seizures can help parents and
+healthcare providers take precautions, prevent risky situations, and mentally prepare children to
+minimize anxiety and nervousness associated with the uncertainty of a seizure. This research
+proposes a novel and comprehensive framework to predict seizures in pediatric patients by
+evaluating machine learning algorithms on unimodal neuroimaging data consisting of
+electroencephalogram signals. Bandpass filtering and independent component analysis proved to be
+effective in reducing the noise and artifacts from the dataset. The performance of various machine
+learning algorithms is evaluated on important metrics such as accuracy, precision, specificity,
+sensitivity, F1 score and MCC. The results show that the deep learning algorithms are more
+successful in predicting seizures than logistic regression and k-nearest neighbors. The recurrent
+neural network (RNN) gave the highest precision and F1 score, long short-term memory (LSTM)
+outperformed RNN in accuracy, and the convolutional neural network (CNN) resulted in the highest
+specificity. This research has significant implications for healthcare providers in proactively
+managing seizure occurrence in pediatric patients, potentially transforming clinical practices, and
+improving pediatric care.
+
+
+ comment: 9 pages, 15 figures +
+
+
+
+
+ + ☆ Anatomy-Driven Pathology Detection on Chest X-rays MICCAI 2023 + + +
+ Pathology detection and delineation enables the automatic interpretation of +medical scans such as chest X-rays while providing a high level of +explainability to support radiologists in making informed decisions. However, +annotating pathology bounding boxes is a time-consuming task such that large +public datasets for this purpose are scarce. Current approaches thus use weakly +supervised object detection to learn the (rough) localization of pathologies +from image-level annotations, which is however limited in performance due to +the lack of bounding box supervision. We therefore propose anatomy-driven +pathology detection (ADPD), which uses easy-to-annotate bounding boxes of +anatomical regions as proxies for pathologies. We study two training +approaches: supervised training using anatomy-level pathology labels and +multiple instance learning (MIL) with image-level pathology labels. Our results +show that our anatomy-level training approach outperforms weakly supervised +methods and fully supervised detection with limited training samples, and our +MIL approach is competitive with both baseline approaches, therefore +demonstrating the potential of our approach. + +
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ V1T: large-scale mouse V1 response prediction using a Vision Transformer + + +
+ Accurate predictive models of the visual cortex neural response to natural +visual stimuli remain a challenge in computational neuroscience. In this work, +we introduce V1T, a novel Vision Transformer based architecture that learns a +shared visual and behavioral representation across animals. We evaluate our +model on two large datasets recorded from mouse primary visual cortex and +outperform previous convolution-based models by more than 12.7% in prediction +performance. Moreover, we show that the self-attention weights learned by the +Transformer correlate with the population receptive fields. Our model thus sets +a new benchmark for neural response prediction and can be used jointly with +behavioral and neural recordings to reveal meaningful characteristic features +of the visual cortex. + +
+
+ comment: updated references and added link to code repository; add analysis on + generalization and visualize aRFs; updated with TMLR publication +
+
+
+
+
+ + ♻ ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ♻ ☆ zPROBE: Zero Peek Robustness Checks for Federated Learning ICCV 2023 + + +
+ Privacy-preserving federated learning allows multiple users to jointly train +a model with coordination of a central server. The server only learns the final +aggregation result, thus the users' (private) training data is not leaked from +the individual model updates. However, keeping the individual updates private +allows malicious users to perform Byzantine attacks and degrade the accuracy +without being detected. Best existing defenses against Byzantine workers rely +on robust rank-based statistics, e.g., median, to find malicious updates. +However, implementing privacy-preserving rank-based statistics is nontrivial +and not scalable in the secure domain, as it requires sorting all individual +updates. We establish the first private robustness check that uses high break +point rank-based statistics on aggregated model updates. By exploiting +randomized clustering, we significantly improve the scalability of our defense +without compromising privacy. We leverage our statistical bounds in +zero-knowledge proofs to detect and remove malicious updates without revealing +the private user updates. Our novel framework, zPROBE, enables Byzantine +resilient and secure federated learning. Empirical evaluations demonstrate that +zPROBE provides a low overhead solution to defend against state-of-the-art +Byzantine attacks while preserving privacy. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Benchmarks for Detecting Measurement Tampering + + +
+ When training powerful AI systems to perform complex tasks, it may be +challenging to provide training signals which are robust to optimization. One +concern is \textit{measurement tampering}, where the AI system manipulates +multiple measurements to create the illusion of good results instead of +achieving the desired outcome. In this work, we build four new text-based +datasets to evaluate measurement tampering detection techniques on large +language models. Concretely, given sets of text inputs and measurements aimed +at determining if some outcome occurred, as well as a base model able to +accurately predict measurements, the goal is to determine if examples where all +measurements indicate the outcome occurred actually had the outcome occur, or +if this was caused by measurement tampering. We demonstrate techniques that +outperform simple baselines on most datasets, but don't achieve maximum +performance. We believe there is significant room for improvement for both +techniques and datasets, and we are excited for future work tackling +measurement tampering. + +
+
+ comment: Edit: extended and improved appendices +
+
+
+
+
+ + ♻ ☆ The star-shaped space of solutions of the spherical negative perceptron + + +
+ Empirical studies on the landscape of neural networks have shown that +low-energy configurations are often found in complex connected structures, +where zero-energy paths between pairs of distant solutions can be constructed. +Here we consider the spherical negative perceptron, a prototypical non-convex +neural network model framed as a continuous constraint satisfaction problem. We +introduce a general analytical method for computing energy barriers in the +simplex with vertex configurations sampled from the equilibrium. We find that +in the over-parameterized regime the solution manifold displays simple +connectivity properties. There exists a large geodesically convex component +that is attractive for a wide range of optimization dynamics. Inside this +region we identify a subset of atypical high-margin solutions that are +geodesically connected with most other solutions, giving rise to a star-shaped +geometry. We analytically characterize the organization of the connected space +of solutions and show numerical evidence of a transition, at larger constraint +densities, where the aforementioned simple geodesic connectivity breaks down. + +
+
+ comment: 27 pages, 16 figures, comments are welcome +
+
+
+
+
+ + ♻ ☆ Learning Efficient Abstract Planning Models that Choose What to Predict + + +
+ An effective approach to solving long-horizon tasks in robotics domains with +continuous state and action spaces is bilevel planning, wherein a high-level +search over an abstraction of an environment is used to guide low-level +decision-making. Recent work has shown how to enable such bilevel planning by +learning abstract models in the form of symbolic operators and neural samplers. +In this work, we show that existing symbolic operator learning approaches fall +short in many robotics domains where a robot's actions tend to cause a large +number of irrelevant changes in the abstract state. This is primarily because +they attempt to learn operators that exactly predict all observed changes in +the abstract state. To overcome this issue, we propose to learn operators that +'choose what to predict' by only modelling changes necessary for abstract +planning to achieve specified goals. Experimentally, we show that our approach +learns operators that lead to efficient planning across 10 different hybrid +robotics domains, including 4 from the challenging BEHAVIOR-100 benchmark, +while generalizing to novel initial states, goals, and objects. + +
+
+
+
+
+ + ♻ ☆ The Descriptive Complexity of Graph Neural Networks + + +
+ We analyse the power of graph neural networks (GNNs) in terms of Boolean circuit complexity and
+descriptive complexity.
+ We prove that the graph queries that can be computed by a polynomial-size bounded-depth family
+of GNNs are exactly those definable in the guarded fragment GFO+C of first-order logic with
+counting and with built-in relations. This puts GNNs in the circuit complexity class TC^0.
+Remarkably, the GNN families may use arbitrary real weights and a wide class of activation
+functions that includes the standard ReLU, logistic "sigmoid", and hyperbolic tangent functions. If
+the GNNs are allowed to use random initialisation and global readout (both standard features of
+GNNs widely used in practice), they can compute exactly the same queries as bounded depth Boolean
+circuits with threshold gates, that is, exactly the queries in TC^0.
+ Moreover, we show that queries computable by a single GNN with piecewise linear activations and
+rational weights are definable in GFO+C without built-in relations. Therefore, they are contained
+in uniform TC^0.
+
+
+
+
+
+ + ♻ ☆ Two to Five Truths in Non-Negative Matrix Factorization + + +
+ In this paper, we explore the role of matrix scaling on a matrix of counts when building a topic
+model using non-negative matrix factorization. We present a scaling inspired by the normalized
+Laplacian (NL) for graphs that can greatly improve the quality of a non-negative matrix
+factorization. The results parallel those in the spectral graph clustering work of
+\cite{Priebe:2019}, where the authors proved adjacency spectral embedding (ASE) spectral clustering
+was more likely to discover core-periphery partitions and Laplacian Spectral Embedding (LSE) was
+more likely to discover affinity partitions. In text analysis non-negative matrix factorization
+(NMF) is typically used on a matrix of co-occurrence ``contexts'' and ``terms'' counts. The matrix
+scaling inspired by LSE gives significant improvement for text topic models in a variety of
+datasets. We illustrate the dramatic difference a matrix scaling can make in NMF, greatly improving
+the quality of a topic model, on three datasets where human annotation is available. Using the
+adjusted Rand index (ARI), a measure of cluster similarity, we see an increase of 50\% for Twitter
+data and over 200\% for a newsgroup dataset versus using counts, which is the analogue of ASE. For
+clean data, such as those from the Document Understanding Conference, NL gives over 40\%
+improvement over ASE. We conclude with some analysis of this phenomenon and some connections of
+this scaling with other matrix scaling methods.
+
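+ A small sketch of the scaling idea with scikit-learn (illustrative only: the exact
+normalization, solver settings, and any regularization used in the paper may differ):
+
+import numpy as np
+from sklearn.decomposition import NMF
+
+def nl_scaled_topics(counts, n_topics=10):
+    # Scale a context-by-term count matrix by the square roots of its row and
+    # column sums (a normalized-Laplacian-style scaling) before factorizing.
+    counts = np.asarray(counts, dtype=float)
+    r = counts.sum(axis=1, keepdims=True)
+    c = counts.sum(axis=0, keepdims=True)
+    scaled = counts / (np.sqrt(r) * np.sqrt(c) + 1e-12)
+    model = NMF(n_components=n_topics, init="nndsvda", max_iter=500, random_state=0)
+    W = model.fit_transform(scaled)            # context-topic weights
+    return W, model.components_                # topic-term weights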
+
+
+
+
+ + ♻ ☆ Coincident Learning for Unsupervised Anomaly Detection + + +
+ Anomaly detection is an important task for complex systems (e.g., industrial +facilities, manufacturing, large-scale science experiments), where failures in +a sub-system can lead to low yield, faulty products, or even damage to +components. While complex systems often have a wealth of data, labeled +anomalies are typically rare (or even nonexistent) and expensive to acquire. +Unsupervised approaches are therefore common and typically search for anomalies +either by distance or density of examples in the input feature space (or some +associated low-dimensional representation). This paper presents a novel +approach called CoAD, which is specifically designed for multi-modal tasks and +identifies anomalies based on \textit{coincident} behavior across two different +slices of the feature space. We define an \textit{unsupervised} metric, +$\hat{F}_\beta$, out of analogy to the supervised classification $F_\beta$ +statistic. CoAD uses $\hat{F}_\beta$ to train an anomaly detection algorithm on +\textit{unlabeled data}, based on the expectation that anomalous behavior in +one feature slice is coincident with anomalous behavior in the other. The +method is illustrated using a synthetic outlier data set and a MNIST-based +image data set, and is compared to prior state-of-the-art on two real-world +tasks: a metal milling data set and a data set from a particle accelerator. + +
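+ One way to picture a coincidence-based score (a hypothetical stand-in, not the paper's exact
+definition of the unsupervised F-beta statistic): treat the anomaly flags from one feature slice
+as pseudo-labels for the flags from the other and compute an F-beta value.
+
+import numpy as np
+
+def coincidence_f_beta(flags_a, flags_b, beta=1.0):
+    # flags_a, flags_b: boolean anomaly flags from two different feature slices.
+    a, b = np.asarray(flags_a, dtype=bool), np.asarray(flags_b, dtype=bool)
+    tp = np.sum(a & b)
+    precision = tp / max(a.sum(), 1)
+    recall = tp / max(b.sum(), 1)
+    if precision + recall == 0:
+        return 0.0
+    return (1 + beta**2) * precision * recall / (beta**2 * precision + recall)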
+
+
+
+
+ + ♻ ☆ AnoOnly: Semi-Supervised Anomaly Detection without Loss on Normal Data + + +
+ Semi-supervised anomaly detection (SSAD) methods have demonstrated their +effectiveness in enhancing unsupervised anomaly detection (UAD) by leveraging +few-shot but instructive abnormal instances. However, the dominance of +homogeneous normal data over anomalies biases the SSAD models against +effectively perceiving anomalies. To address this issue and achieve balanced +supervision between heavily imbalanced normal and abnormal data, we develop a +novel framework called AnoOnly (Anomaly Only). Unlike existing SSAD methods +that resort to strict loss supervision, AnoOnly suspends it and introduces a +form of weak supervision for normal data. This weak supervision is instantiated +through the utilization of batch normalization, which implicitly performs +cluster learning on normal data. When integrated into existing SSAD methods, +the proposed AnoOnly demonstrates remarkable performance enhancements across +various models and datasets, achieving new state-of-the-art performance. +Additionally, our AnoOnly is natively robust to label noise when suffering from +data contamination. Our code is publicly available at +https://github.com/cool-xuan/AnoOnly. + +
+
+
+
+
+ + ♻ ☆ A novel automatic wind power prediction framework based on multi-time + scale and temporal attention mechanisms + + +
+ Wind energy is a widely distributed, renewable, and environmentally friendly energy source that
+plays a crucial role in mitigating global warming and addressing energy shortages. Nevertheless,
+wind power generation is characterized by volatility, intermittence, and randomness, which hinder
+its ability to serve as a reliable power source for the grid. Accurate wind power forecasting is
+crucial for developing a new power system that heavily relies on renewable energy sources. However,
+traditional wind power forecasting systems primarily focus on ultra-short-term or short-term
+forecasts, limiting their ability to address the diverse adjustment requirements of the power
+system simultaneously. To overcome these challenges, we propose an automatic framework capable of
+forecasting wind power across multiple time scales. The framework is based on the tree-structured
+Parzen estimator (TPE) and the temporal fusion transformer (TFT) and can provide ultra-short-term,
+short-term, and medium-term wind power forecasts. Our approach employs the TFT for wind power
+forecasting and categorizes features based on their properties. Additionally, we introduce a
+generic algorithm to simultaneously fine-tune the hyperparameters of the decomposition method and
+the model. We evaluate the performance of our framework by conducting ablation experiments using
+three commonly used decomposition algorithms and six state-of-the-art models for multi-time-scale
+forecasting. The experimental results demonstrate that our proposed method considerably improves
+prediction accuracy on the public Engie dataset https://opendata-renewables.engie.com. Compared to
+the second-best state-of-the-art model, our approach exhibits a reduction of 31.75% and 28.74% in
+normalized mean absolute error (nMAE) for 24-hour forecasting, and 20.79% and 16.93% in nMAE for
+48-hour forecasting, respectively.
+
+
+
+
+
+ + ♻ ☆ RESTORE: Graph Embedding Assessment Through Reconstruction + + +
+ Following the success of Word2Vec embeddings, graph embeddings (GEs) have +gained substantial traction. GEs are commonly generated and evaluated +extrinsically on downstream applications, but intrinsic evaluations of the +original graph properties in terms of topological structure and semantic +information have been lacking. Understanding these will help identify the +deficiency of the various families of GE methods when vectorizing graphs in +terms of preserving the relevant knowledge or learning incorrect knowledge. To +address this, we propose RESTORE, a framework for intrinsic GEs assessment +through graph reconstruction. We show that reconstructing the original graph +from the underlying GEs yields insights into the relative amount of information +preserved in a given vector form. We first introduce the graph reconstruction +task. We generate GEs from three GE families based on factorization methods, +random walks, and deep learning (with representative algorithms from each +family) on the CommonSense Knowledge Graph (CSKG). We analyze their +effectiveness in preserving the (a) topological structure of node-level graph +reconstruction with an increasing number of hops and (b) semantic information +on various word semantic and analogy tests. Our evaluations show deep +learning-based GE algorithm (SDNE) is overall better at preserving (a) with a +mean average precision (mAP) of 0.54 and 0.35 for 2 and 3-hop reconstruction +respectively, while the factorization-based algorithm (HOPE) is better at +encapsulating (b) with an average Euclidean distance of 0.14, 0.17, and 0.11 +for 1, 2, and 3-hop reconstruction respectively. The modest performance of +these GEs leaves room for further research avenues on better graph +representation learning. + +
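+ A toy version of assessment-by-reconstruction (much simpler than RESTORE's multi-hop and
+semantic protocols; the k-nearest-neighbour criterion and the metric below are illustrative
+choices):
+
+import numpy as np
+
+def reconstruction_precision_at_k(emb, edges, k=10):
+    # Predict each node's k nearest neighbours by cosine similarity and measure
+    # how many of the predicted pairs are true edges of the original graph.
+    emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)
+    sims = emb @ emb.T
+    np.fill_diagonal(sims, -np.inf)
+    true_edges = {tuple(sorted(e)) for e in edges}
+    hits = total = 0
+    for u in range(len(emb)):
+        for v in np.argsort(-sims[u])[:k]:
+            hits += tuple(sorted((u, int(v)))) in true_edges
+            total += 1
+    return hits / max(total, 1)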
+
+
+
+
+ + ♻ ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims to learn graph-level representations for effective class assignment, has achieved outstanding results, but these results rely heavily on high-quality datasets with balanced class distributions. In fact, most real-world graph data naturally follows a long-tailed form, where the head classes occupy many more samples than the tail classes. It is thus essential to study graph-level classification over long-tailed data, yet this problem remains largely unexplored. Moreover, most existing long-tailed learning methods from the vision domain fail to jointly optimize representation learning and classifier training, and neglect the mining of hard-to-classify classes. Directly applying existing methods to graphs may lead to sub-optimal performance, since a model trained on graphs is more sensitive to the long-tailed distribution due to complex topological characteristics. Hence, in this paper, we propose a novel long-tailed graph-level classification framework via Collaborative Multi-expert Learning (CoMe) to tackle the problem. To equilibrate the contributions of head and tail classes, we first develop balanced contrastive learning from the view of representation learning, and then design individual-expert classifier training based on hard class mining. In addition, we execute gated fusion and disentangled knowledge distillation among the multiple experts to promote collaboration in the multi-expert framework. Comprehensive experiments on seven widely-used benchmark datasets demonstrate the superiority of our method CoMe over state-of-the-art baselines.
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ♻ ☆ Where Did the Gap Go? Reassessing the Long-Range Graph Benchmark + + +
+ The recent Long-Range Graph Benchmark (LRGB, Dwivedi et al. 2022) introduced +a set of graph learning tasks strongly dependent on long-range interaction +between vertices. Empirical evidence suggests that on these tasks Graph +Transformers significantly outperform Message Passing GNNs (MPGNNs). In this +paper, we carefully reevaluate multiple MPGNN baselines as well as the Graph +Transformer GPS (Ramp\'a\v{s}ek et al. 2022) on LRGB. Through a rigorous +empirical analysis, we demonstrate that the reported performance gap is +overestimated due to suboptimal hyperparameter choices. It is noteworthy that +across multiple datasets the performance gap completely vanishes after basic +hyperparameter optimization. In addition, we discuss the impact of lacking +feature normalization for LRGB's vision datasets and highlight a spurious +implementation of LRGB's link prediction metric. The principal aim of our paper +is to establish a higher standard of empirical rigor within the graph machine +learning community. + +
+
+
+
+
+ + ♻ ☆ Dynamic Loss For Robust Learning + + +
+ Label noise and class imbalance commonly coexist in real-world data. Previous +works for robust learning, however, usually address either one type of the data +biases and underperform when facing them both. To mitigate this gap, this work +presents a novel meta-learning based dynamic loss that automatically adjusts +the objective functions with the training process to robustly learn a +classifier from long-tailed noisy data. Concretely, our dynamic loss comprises +a label corrector and a margin generator, which respectively correct noisy +labels and generate additive per-class classification margins by perceiving the +underlying data distribution as well as the learning state of the classifier. +Equipped with a new hierarchical sampling strategy that enriches a small amount +of unbiased metadata with diverse and hard samples, the two components in the +dynamic loss are optimized jointly through meta-learning and cultivate the +classifier to well adapt to clean and balanced test data. Extensive experiments +show our method achieves state-of-the-art accuracy on multiple real-world and +synthetic datasets with various types of data biases, including CIFAR-10/100, +Animal-10N, ImageNet-LT, and Webvision. Code will soon be publicly available. + +
+
+
+
+
+ + ♻ ☆ When Measures are Unreliable: Imperceptible Adversarial Perturbations + toward Top-$k$ Multi-Label Learning ACM MM 2023 + + +
+ With the great success of deep neural networks, adversarial learning has +received widespread attention in various studies, ranging from multi-class +learning to multi-label learning. However, existing adversarial attacks toward +multi-label learning only pursue the traditional visual imperceptibility but +ignore the new perceptible problem coming from measures such as Precision@$k$ +and mAP@$k$. Specifically, when a well-trained multi-label classifier performs +far below the expectation on some samples, the victim can easily realize that +this performance degeneration stems from attack, rather than the model itself. +Therefore, an ideal multi-labeling adversarial attack should manage to not only +deceive visual perception but also evade monitoring of measures. To this end, +this paper first proposes the concept of measure imperceptibility. Then, a +novel loss function is devised to generate such adversarial perturbations that +could achieve both visual and measure imperceptibility. Furthermore, an +efficient algorithm, which enjoys a convex objective, is established to +optimize this objective. Finally, extensive experiments on large-scale +benchmark datasets, such as PASCAL VOC 2012, MS COCO, and NUS WIDE, demonstrate +the superiority of our proposed method in attacking the top-$k$ multi-label +systems. + +
+
+ comment: 22 pages, 7 figures, accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ How Expressive are Graph Neural Networks in Recommendation? CIKM + + +
+ Graph Neural Networks (GNNs) have demonstrated superior performance on +various graph learning tasks, including recommendation, where they leverage +user-item collaborative filtering signals in graphs. However, theoretical +formulations of their capability are scarce, despite their empirical +effectiveness in state-of-the-art recommender models. Recently, research has +explored the expressiveness of GNNs in general, demonstrating that message +passing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that +GNNs combined with random node initialization are universal. Nevertheless, the +concept of "expressiveness" for GNNs remains vaguely defined. Most existing +works adopt the graph isomorphism test as the metric of expressiveness, but +this graph-level task may not effectively assess a model's ability in +recommendation, where the objective is to distinguish nodes of different +closeness. In this paper, we provide a comprehensive theoretical analysis of +the expressiveness of GNNs in recommendation, considering three levels of +expressiveness metrics: graph isomorphism (graph-level), node automorphism +(node-level), and topological closeness (link-level). We propose the +topological closeness metric to evaluate GNNs' ability to capture the +structural distance between nodes, which aligns closely with the objective of +recommendation. To validate the effectiveness of this new metric in evaluating +recommendation performance, we introduce a learning-less GNN algorithm that is +optimal on the new metric and can be optimal on the node-level metric with +suitable modification. We conduct extensive experiments comparing the proposed +algorithm against various types of state-of-the-art GNN models to explore the +explainability of the new metric in the recommendation task. For +reproducibility, implementation codes are available at +https://github.com/HKUDS/GTE. + +
+
+ comment: 32nd ACM International Conference on Information and Knowledge + Management (CIKM) 2023 +
+
+
+
+
+ + ♻ ☆ From Random Search to Bandit Learning in Metric Measure Spaces + + +
+ Random Search is one of the most widely used methods for hyperparameter optimization and is critical to the success of deep learning models. Despite its astonishing performance, little non-heuristic theory has been developed to describe its underlying working mechanism. This paper gives a theoretical account of Random Search. We introduce the concept of scattering dimension, which describes the landscape of the underlying function and quantifies the performance of random search. We show that, when the environment is noise-free, the output of random search converges to the optimal value in probability at rate $ \widetilde{\mathcal{O}} \left( \left( \frac{1}{T} \right)^{ \frac{1}{d_s} } \right) $, where $ d_s \ge 0 $ is the scattering dimension of the underlying function. When the observed function values are corrupted by bounded i.i.d. noise, the output of random search converges to the optimal value in probability at rate $ \widetilde{\mathcal{O}} \left( \left( \frac{1}{T} \right)^{ \frac{1}{d_s + 1} } \right) $. In addition, based on the principles of random search, we introduce an algorithm, called BLiN-MOS, for Lipschitz bandits in doubling metric spaces that are also endowed with a probability measure, and show that under certain conditions, BLiN-MOS achieves a regret rate of order $ \widetilde{\mathcal{O}} \left( T^{ \frac{d_z}{d_z + 1} } \right) $, where $d_z$ is the zooming dimension of the problem instance.
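As a quick illustration of the plain random search procedure analyzed above (a generic sketch, not the paper's analysis or its BLiN-MOS variant), one can sample points uniformly and track the running optimum:

```python
# Minimal random-search sketch: sample T points uniformly and keep the best.
# The objective below is an arbitrary illustrative function, not from the paper.
import numpy as np

def objective(x):
    return np.sin(3 * x) + 0.5 * x  # function to minimize on [0, 2]

rng = np.random.default_rng(0)
T = 1000
best_x, best_val = None, np.inf
for t in range(T):
    x = rng.uniform(0.0, 2.0)   # uniform proposal over the search space
    val = objective(x)          # in the noisy setting, val would include i.i.d. noise
    if val < best_val:
        best_x, best_val = x, val
print(f"best x ~ {best_x:.4f}, best value ~ {best_val:.4f}")
```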
+
+
+
+
+ + ♻ ☆ Automated GI tract segmentation using deep learning + + +
+ The job of radiation oncologists is to deliver X-ray beams pointed toward the tumor while avoiding the stomach and intestines. With MR-Linacs (magnetic resonance imaging and linear accelerator systems), oncologists can visualize the position of the tumor and deliver a precise dose according to tumor cell presence, which can vary from day to day. Currently, outlining the position of the stomach and intestines, so that the X-ray beam direction can be adjusted to deliver the dose to the tumor while avoiding the organs, is done manually. This is a time-consuming and labor-intensive process that can easily prolong treatments from 15 minutes to an hour a day unless deep learning methods can automate the segmentation. This paper discusses an automated segmentation process using deep learning to make this process faster and allow more patients to get effective treatment.
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Introspective Deep Metric Learning for Image Retrieval + + +
+ This paper proposes an introspective deep metric learning (IDML) framework +for uncertainty-aware comparisons of images. Conventional deep metric learning +methods produce confident semantic distances between images regardless of the +uncertainty level. However, we argue that a good similarity model should +consider the semantic discrepancies with caution to better deal with ambiguous +images for more robust training. To achieve this, we propose to represent an +image using not only a semantic embedding but also an accompanying uncertainty +embedding, which describes the semantic characteristics and ambiguity of an +image, respectively. We further propose an introspective similarity metric to +make similarity judgments between images considering both their semantic +differences and ambiguities. The proposed IDML framework improves the +performance of deep metric learning through uncertainty modeling and attains +state-of-the-art results on the widely used CUB-200-2011, Cars196, and Stanford +Online Products datasets for image retrieval and clustering. We further provide +an in-depth analysis of our framework to demonstrate the effectiveness and +reliability of IDML. Code is available at: https://github.com/wzzheng/IDML. + +
+
+ comment: The extended version of this paper is accepted to T-PAMI. Source code + available at https://github.com/wzzheng/IDML +
+
+
+
+
+ + ♻ ☆ Precipitation nowcasting with generative diffusion models + + +
+ In recent years, traditional numerical methods for accurate weather prediction have been increasingly challenged by deep learning methods. Numerous historical datasets used for short- and medium-range weather forecasts are typically organized into a regular spatial grid structure. This arrangement closely resembles images: each weather variable can be visualized as a map or, when considering the temporal axis, as a video. Several classes of generative models, including Generative Adversarial Networks, Variational Autoencoders, and the recent Denoising Diffusion Models, have largely proved their applicability to the next-frame prediction problem, so it is natural to test their performance on weather prediction benchmarks. Diffusion models are particularly appealing in this context, due to the intrinsically probabilistic nature of weather forecasting: what we are really interested in modeling is the probability distribution of weather indicators, whose expected value is the most likely prediction. In our study, we focus on a specific subset of the ERA-5 dataset, which includes hourly data pertaining to Central Europe from the years 2016 to 2021. Within this context, we examine the efficacy of diffusion models in handling the task of precipitation nowcasting, in comparison to well-established U-Net models as documented in the existing literature. Our proposed approach, Generative Ensemble Diffusion (GED), utilizes a diffusion model to generate a set of possible weather scenarios, which are then amalgamated into a probable prediction via a post-processing network. This approach substantially outperforms recent deep learning models in terms of overall performance.
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Minimax Weight Learning for Absorbing MDPs + + +
+ Reinforcement learning policy evaluation problems are often modeled as finite or discounted/averaged infinite-horizon MDPs. In this paper, we study undiscounted off-policy policy evaluation for absorbing MDPs. Given a dataset consisting of i.i.d. episodes with a given truncation level, we propose an algorithm, called MWLA, to directly estimate the expected return via the importance ratio of the state-action occupancy measure. The Mean Square Error (MSE) bound for the MWLA method is investigated, and the dependence of statistical errors on the data size and the truncation level is analyzed. Computational experiments with an episodic taxi environment illustrate the performance of the MWLA algorithm.
+
+ comment: 36 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Knowledge-informed Molecular Learning: A Survey on Paradigm Transfer + + +
+ Machine learning, notably deep learning, has significantly propelled +molecular investigations within the biochemical sphere. Traditionally, modeling +for such research has centered around a handful of paradigms. For instance, the +prediction paradigm is frequently deployed for tasks such as molecular property +prediction. To enhance the generation and decipherability of purely data-driven +models, scholars have integrated biochemical domain knowledge into these +molecular study models. This integration has sparked a surge in paradigm +transfer, which is solving one molecular learning task by reformulating it as +another one. With the emergence of Large Language Models, these paradigms have +demonstrated an escalating trend towards harmonized unification. In this work, +we delineate a literature survey focused on knowledge-informed molecular +learning from the perspective of paradigm transfer. We classify the paradigms, +scrutinize their methodologies, and dissect the contribution of domain +knowledge. Moreover, we encapsulate prevailing trends and identify intriguing +avenues for future exploration in molecular learning. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Multi-level Multiple Instance Learning with Transformer for Whole Slide + Image Classification + + +
+ Whole slide image (WSI) refers to a type of high-resolution scanned tissue image, which is extensively employed in computer-assisted diagnosis (CAD). The extremely high resolution and limited availability of region-level annotations make it challenging to employ deep learning methods for WSI-based digital diagnosis. Recently, integrating multiple instance learning (MIL) and Transformers for WSI analysis has shown very promising results. However, designing effective Transformers for this weakly-supervised high-resolution image analysis is an underexplored yet important problem. In this paper, we propose a Multi-level MIL (MMIL) scheme by introducing a hierarchical structure to MIL, which enables efficient handling of MIL tasks involving a large number of instances. Based on MMIL, we instantiate MMIL-Transformer, an efficient Transformer model with windowed exact self-attention for large-scale MIL tasks. To validate its effectiveness, we conducted a set of experiments on WSI classification tasks, where MMIL-Transformer demonstrates superior performance compared to existing state-of-the-art methods, achieving 96.80% test AUC and 97.67% test accuracy on the CAMELYON16 dataset, and 99.04% test AUC and 94.37% test accuracy on the TCGA-NSCLC dataset. All code and pre-trained models are available at: https://github.com/hustvl/MMIL-Transformer
+
+
+
+
+ + ♻ ☆ Generative Network-Based Reduced-Order Model for Prediction, Data + Assimilation and Uncertainty Quantification + + +
+ We propose a new method in which a generative network (GN) integrated into a reduced-order model (ROM) framework is used to solve inverse problems for partial differential equations (PDE). The aim is to match available measurements and estimate the corresponding uncertainties associated with the states and parameters of a numerical physical simulation. The GN is trained using only unconditional simulations of the discretized PDE model. We compare the proposed method with the gold-standard Markov chain Monte Carlo. We apply the proposed approaches to a spatio-temporal compartmental model in epidemiology. The results show that the proposed GN-based ROM can efficiently quantify uncertainty and accurately match the measurements and the gold standard, using only a few unconditional simulations of the full-order numerical PDE model.
+
+ comment: arXiv admin note: text overlap with arXiv:2105.07729 +
+
+
+
+
+ + ♻ ☆ Correlation visualization under missing values: a comparison between + imputation and direct parameter estimation methods + + +
+ Correlation matrix visualization is essential for understanding the +relationships between variables in a dataset, but missing data can pose a +significant challenge in estimating correlation coefficients. In this paper, we +compare the effects of various missing data methods on the correlation plot, +focusing on two common missing patterns: random and monotone. We aim to provide +practical strategies and recommendations for researchers and practitioners in +creating and analyzing the correlation plot. Our experimental results suggest +that while imputation is commonly used for missing data, using imputed data for +plotting the correlation matrix may lead to a significantly misleading +inference of the relation between the features. We recommend using DPER, a +direct parameter estimation approach, for plotting the correlation matrix based +on its performance in the experiments. + +
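To make the warning in this abstract concrete, here is a small synthetic sketch (not the paper's DPER method) showing how naive mean imputation can distort the estimated correlation relative to the complete data:

```python
# Sketch: mean imputation shrinks the estimated correlation toward zero
# on synthetic bivariate data with values missing at random (illustrative only).
import numpy as np

rng = np.random.default_rng(1)
n = 5000
x = rng.normal(size=n)
y = 0.8 * x + 0.6 * rng.normal(size=n)          # true correlation ~0.8
true_r = np.corrcoef(x, y)[0, 1]

y_missing = y.copy()
mask = rng.random(n) < 0.4                       # 40% of y missing at random
y_missing[mask] = np.nan

y_imputed = np.where(np.isnan(y_missing), np.nanmean(y_missing), y_missing)
imputed_r = np.corrcoef(x, y_imputed)[0, 1]      # biased toward zero

complete_r = np.corrcoef(x[~mask], y[~mask])[0, 1]  # complete-case estimate
print(f"true r={true_r:.3f}, mean-imputed r={imputed_r:.3f}, complete-case r={complete_r:.3f}")
```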
+
+
+
+
+ + ♻ ☆ Dual Correction Strategy for Ranking Distillation in Top-N Recommender + System CIKM 2021 + + +
+ Knowledge Distillation (KD), which transfers the knowledge of a well-trained large model (teacher) to a small model (student), has become an important area of research for the practical deployment of recommender systems. Recently, Relaxed Ranking Distillation (RRD) has shown that distilling the ranking information in the recommendation list significantly improves performance. However, the method still has limitations in that 1) it does not fully utilize the prediction errors of the student model, which makes training less efficient, and 2) it only distills user-side ranking information, which provides an insufficient view under sparse implicit feedback. This paper presents the Dual Correction strategy for Distillation (DCD), which transfers the ranking information from the teacher model to the student model in a more efficient manner. Most importantly, DCD uses the discrepancy between the teacher's and the student's predictions to decide which knowledge to distill. By doing so, DCD essentially provides learning guidance tailored to "correcting" what the student model has failed to predict accurately. This process is applied to transfer ranking information from both the user side and the item side to address sparse implicit user feedback. Our experiments show that the proposed method outperforms state-of-the-art baselines, and ablation studies validate the effectiveness of each component.
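A rough sketch of the discrepancy-driven selection idea described above (an illustrative simplification, not the authors' DCD implementation): rank items by the gap between teacher and student scores and distill only the most discrepant ones.

```python
# Illustrative sketch: pick the items whose student scores disagree most with the
# teacher, then distill only those (simplified; not the paper's exact DCD procedure).
import numpy as np

def select_discrepant_items(teacher_scores, student_scores, k):
    """Return indices of the k items with the largest rank discrepancy."""
    teacher_rank = np.argsort(np.argsort(-teacher_scores))   # 0 = best for teacher
    student_rank = np.argsort(np.argsort(-student_scores))
    discrepancy = np.abs(teacher_rank - student_rank)
    return np.argsort(-discrepancy)[:k]

teacher = np.array([0.9, 0.7, 0.2, 0.8, 0.1])
student = np.array([0.3, 0.6, 0.4, 0.9, 0.2])
print(select_discrepant_items(teacher, student, k=2))  # items to "correct"
```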
+
+ comment: CIKM 2021 +
+
+
+
+
+ + ♻ ☆ Nonlinear Independent Component Analysis for Principled Disentanglement + in Unsupervised Deep Learning + + +
+ A central problem in unsupervised deep learning is how to find useful +representations of high-dimensional data, sometimes called "disentanglement". +Most approaches are heuristic and lack a proper theoretical foundation. In +linear representation learning, independent component analysis (ICA) has been +successful in many applications areas, and it is principled, i.e., based on a +well-defined probabilistic model. However, extension of ICA to the nonlinear +case has been problematic due to the lack of identifiability, i.e., uniqueness +of the representation. Recently, nonlinear extensions that utilize temporal +structure or some auxiliary information have been proposed. Such models are in +fact identifiable, and consequently, an increasing number of algorithms have +been developed. In particular, some self-supervised algorithms can be shown to +estimate nonlinear ICA, even though they have initially been proposed from +heuristic perspectives. This paper reviews the state-of-the-art of nonlinear +ICA theory and algorithms. + +
+
+ comment: Revised version, to appear in Patterns +
+
+
+
+
+ + ♻ ☆ StratMed: Relevance Stratification for Low-resource Medication + Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating +demands, AI-based clinical tasks have become paramount. Medication +recommendation, as a sub-domain, aims to amalgamate longitudinal patient +history with medical knowledge, assisting physicians in prescribing safer and +more accurate medication combinations. Existing methods overlook the inherent +long-tail distribution in medical data, lacking balanced representation between +head and tail data, which leads to sub-optimal model performance. To address +this challenge, we introduce StratMed, a model that incorporates an innovative +relevance stratification mechanism. It harmonizes discrepancies in data +long-tail distribution and strikes a balance between the safety and accuracy of +medication combinations. Specifically, we first construct a pre-training method +using deep learning networks to obtain entity representation. After that, we +design a pyramid-like data stratification method to obtain more generalized +entity relationships by reinforcing the features of unpopular entities. Based +on this relationship, we designed two graph structures to express medication +precision and safety at the same level to obtain visit representations. +Finally, the patient's historical clinical information is fitted to generate +medication combinations for the current health condition. Experiments on the +MIMIC-III dataset demonstrate that our method has outperformed current +state-of-the-art methods in four evaluation metrics (including safety and +accuracy). + +
+
+
+
+
+ + ♻ ☆ DELTA: Dynamic Embedding Learning with Truncated Conscious Attention for + CTR Prediction + + +
+ Click-Through Rate (CTR) prediction is a pivotal task in product and content +recommendation, where learning effective feature embeddings is of great +significance. However, traditional methods typically learn fixed feature +representations without dynamically refining feature representations according +to the context information, leading to suboptimal performance. Some recent +approaches attempt to address this issue by learning bit-wise weights or +augmented embeddings for feature representations, but suffer from uninformative +or redundant features in the context. To tackle this problem, inspired by the +Global Workspace Theory in conscious processing, which posits that only a +specific subset of the product features are pertinent while the rest can be +noisy and even detrimental to human-click behaviors, we propose a CTR model +that enables Dynamic Embedding Learning with Truncated Conscious Attention for +CTR prediction, termed DELTA. DELTA contains two key components: (I) conscious +truncation module (CTM), which utilizes curriculum learning to apply adaptive +truncation on attention weights to select the most critical feature in the +context; (II) explicit embedding optimization (EEO), which applies an auxiliary +task during training that directly and independently propagates the gradient +from the loss layer to the embedding layer, thereby optimizing the embedding +explicitly via linear feature crossing. Extensive experiments on five +challenging CTR datasets demonstrate that DELTA achieves new state-of-art +performance among current CTR methods. + +
+
+
+
+
+ + ♻ ☆ Toward Leveraging Pre-Trained Self-Supervised Frontends for Automatic + Singing Voice Understanding Tasks: Three Case Studies SC 2023 + + +
+ Automatic singing voice understanding tasks, such as singer identification, singing voice transcription, and singing technique classification, benefit from data-driven approaches that utilize deep learning techniques. These approaches work well even under the rich diversity of vocal and noisy samples owing to their representation ability. However, the limited availability of labeled data remains a significant obstacle to achieving satisfactory performance. In recent years, self-supervised learning (SSL) models have been trained using large amounts of unlabeled data in the fields of speech processing and music classification. By fine-tuning these models for the target tasks, performance comparable to conventional supervised learning can be achieved with limited training data. Therefore, in this paper, we investigate the effectiveness of SSL models for various singing voice recognition tasks. We report the results of experiments comparing SSL models on three different tasks (i.e., singer identification, singing voice transcription, and singing technique classification) as an initial exploration and discuss these findings. Experimental results show that each SSL model achieves performance comparable to, and sometimes better than, state-of-the-art methods on each task. We also conducted a layer-wise analysis to further understand the behavior of the SSL models.
+
+ comment: Accepted at APSIPA ASC 2023 +
+
+
+
+
+ + ♻ ☆ Boosting the Adversarial Transferability of Surrogate Models with Dark + Knowledge ICTAI + + +
+ Deep neural networks (DNNs) are vulnerable to adversarial examples. Moreover, adversarial examples have transferability, which means that an adversarial example for one DNN model can fool another model with non-trivial probability. This gave birth to transfer-based attacks, where adversarial examples generated by a surrogate model are used to conduct black-box attacks. There is some work on generating adversarial examples from a given surrogate model with better transferability; however, training a special surrogate model to generate adversarial examples with better transferability is relatively under-explored. This paper proposes a method for training a surrogate model with dark knowledge to boost the transferability of the adversarial examples generated by the surrogate model. This trained surrogate model is named the dark surrogate model (DSM). The proposed method for training a DSM consists of two key components: a teacher model extracting dark knowledge, and a mixing augmentation skill that enhances the dark knowledge of the training data. We conducted extensive experiments to show that the proposed method can substantially improve the adversarial transferability of surrogate models across different surrogate architectures and optimizers for generating adversarial examples, and that it can be applied to other transfer-based attack scenarios that contain dark knowledge, like face verification. Our code is publicly available at \url{https://github.com/ydc123/Dark_Surrogate_Model}.
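The "dark knowledge" ingredient can be illustrated with a generic soft-label distillation step in PyTorch (a hedged sketch of the general technique; the models, data, and temperature are placeholders, not the paper's exact training recipe):

```python
# Sketch of training a surrogate on a teacher's softened outputs ("dark knowledge").
# Models, data, and temperature below are placeholders, not the paper's setup.
import torch
import torch.nn.functional as F

def distillation_step(surrogate, teacher, images, optimizer, temperature=4.0):
    teacher.eval()
    with torch.no_grad():
        soft_targets = F.softmax(teacher(images) / temperature, dim=1)
    log_probs = F.log_softmax(surrogate(images) / temperature, dim=1)
    loss = F.kl_div(log_probs, soft_targets, reduction="batchmean") * temperature ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```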
+
+ comment: Accepted at 2023 International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ Breaking the Lower Bound with (Little) Structure: Acceleration in + Non-Convex Stochastic Optimization with Heavy-Tailed Noise + + +
+ We consider the stochastic optimization problem with smooth but not +necessarily convex objectives in the heavy-tailed noise regime, where the +stochastic gradient's noise is assumed to have bounded $p$th moment +($p\in(1,2]$). Zhang et al. (2020) is the first to prove the +$\Omega(T^{\frac{1-p}{3p-2}})$ lower bound for convergence (in expectation) and +provides a simple clipping algorithm that matches this optimal rate. Cutkosky +and Mehta (2021) proposes another algorithm, which is shown to achieve the +nearly optimal high-probability convergence guarantee +$O(\log(T/\delta)T^{\frac{1-p}{3p-2}})$, where $\delta$ is the probability of +failure. However, this desirable guarantee is only established under the +additional assumption that the stochastic gradient itself is bounded in $p$th +moment, which fails to hold even for quadratic objectives and centered Gaussian +noise. + In this work, we first improve the analysis of the algorithm in Cutkosky and +Mehta (2021) to obtain the same nearly optimal high-probability convergence +rate $O(\log(T/\delta)T^{\frac{1-p}{3p-2}})$, without the above-mentioned +restrictive assumption. Next, and curiously, we show that one can achieve a +faster rate than that dictated by the lower bound +$\Omega(T^{\frac{1-p}{3p-2}})$ with only a tiny bit of structure, i.e., when +the objective function $F(x)$ is assumed to be in the form of +$\mathbb{E}_{\Xi\sim\mathcal{D}}[f(x,\Xi)]$, arguably the most widely +applicable class of stochastic optimization problems. For this class of +problems, we propose the first variance-reduced accelerated algorithm and +establish that it guarantees a high-probability convergence rate of +$O(\log(T/\delta)T^{\frac{1-p}{2p-1}})$ under a mild condition, which is faster +than $\Omega(T^{\frac{1-p}{3p-2}})$. Notably, even when specialized to the +finite-variance case, our result yields the (near-)optimal high-probability +rate $O(\log(T/\delta)T^{-1/3})$. + +
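The clipping idea underlying these heavy-tailed-noise results can be sketched generically (this is the standard clipped stochastic gradient update, not the paper's variance-reduced accelerated algorithm):

```python
# Generic clipped stochastic gradient step, the basic tool behind convergence
# results under heavy-tailed gradient noise (illustrative; not the paper's algorithm).
import numpy as np

def clipped_sgd_step(x, stochastic_grad, lr=0.01, clip_threshold=1.0):
    norm = np.linalg.norm(stochastic_grad)
    if norm > clip_threshold:                      # rescale heavy-tailed gradients
        stochastic_grad = stochastic_grad * (clip_threshold / norm)
    return x - lr * stochastic_grad

x = np.zeros(3)
g = np.array([10.0, -4.0, 2.0])                    # e.g., a heavy-tailed gradient sample
x = clipped_sgd_step(x, g)
print(x)
```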
+
+
+
+
+ + ♻ ☆ Does Misclassifying Non-confounding Covariates as Confounders Affect the + Causal Inference within the Potential Outcomes Framework? + + +
+ The Potential Outcome Framework (POF) plays a prominent role in the field of +causal inference. Most causal inference models based on the POF (CIMs-POF) are +designed for eliminating confounding bias and default to an underlying +assumption of Confounding Covariates. This assumption posits that the +covariates consist solely of confounders. However, the assumption of +Confounding Covariates is challenging to maintain in practice, particularly +when dealing with high-dimensional covariates. While certain methods have been +proposed to differentiate the distinct components of covariates prior to +conducting causal inference, the consequences of treating non-confounding +covariates as confounders remain unclear. This ambiguity poses a potential risk +when conducting causal inference in practical scenarios. In this paper, we +present a unified graphical framework for the CIMs-POF, which greatly enhances +the comprehension of these models' underlying principles. Using this graphical +framework, we quantitatively analyze the extent to which the inference +performance of CIMs-POF is influenced when incorporating various types of +non-confounding covariates, such as instrumental variables, mediators, +colliders, and adjustment variables. The key findings are: in the task of +eliminating confounding bias, the optimal scenario is for the covariates to +exclusively encompass confounders; in the subsequent task of inferring +counterfactual outcomes, the adjustment variables contribute to more accurate +inferences. Furthermore, extensive experiments conducted on synthetic datasets +consistently validate these theoretical conclusions. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Benchmarking Jetson Edge Devices with an End-to-end Video-based Anomaly + Detection System + + +
+ Innovative enhancements in embedded system platforms, specifically hardware acceleration, significantly influence the application of deep learning in real-world scenarios. These innovations translate human labor into automated intelligent systems employed in various areas such as autonomous driving, robotics, Internet-of-Things (IoT), and numerous other impactful applications. NVIDIA's Jetson platform is one of the pioneers in offering optimal performance regarding energy efficiency and throughput in the execution of deep learning algorithms. Previously, most benchmarking analyses were based on 2D images with a single deep learning model for each comparison result. In this paper, we implement an end-to-end video-based crime-scene anomaly detection system that takes surveillance videos as input; the system is deployed and operates entirely on multiple Jetson edge devices (Nano, AGX Xavier, Orin Nano). The comparison analysis includes the integration of Torch-TensorRT, a software development kit from NVIDIA, for model performance optimisation. The system is built based on the PySlowfast open-source project from Facebook as the coding template. The end-to-end pipeline comprises the video feed from the camera, the data preprocessing pipeline, the feature extractor, and the anomaly detector. We share the experience of deploying an AI-based system on various Jetson edge devices with Docker technology. As the anomaly detector, a weakly supervised video-based deep learning model called Robust Temporal Feature Magnitude Learning (RTFM) is applied in the system. The system reaches an inference speed of 47.56 frames per second (FPS) on a Jetson edge device with only 3.11 GB of total RAM usage. We also find that the most promising Jetson device achieves 15% better performance than the previous generation of Jetson devices while consuming 50% less energy.
+
+ comment: 18 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Event-based Dynamic Graph Representation Learning for Patent Application + Trend Prediction + + +
+ Accurately predicting which types of patents a company will apply for in the next period of time can reveal its development strategy and help it discover potential partners or competitors in advance. Although important, this problem has rarely been studied in previous research due to the challenges in modelling companies' continuously evolving preferences and capturing the semantic correlations of classification codes. To fill in this gap, we propose an event-based dynamic graph learning framework for patent application trend prediction. In particular, our method is founded on memorable representations of both companies and patent classification codes. When a new patent is observed, the representations of the related companies and classification codes are updated according to the historical memories and the currently encoded messages. Moreover, a hierarchical message passing mechanism is provided to capture the semantic proximities of patent classification codes by updating their representations along the hierarchical taxonomy. Finally, the patent application trend is predicted by aggregating the representations of the target company and classification codes from static, dynamic, and hierarchical perspectives. Experiments on real-world data demonstrate the effectiveness of our approach under various experimental conditions, and also reveal its ability to learn the semantics of classification codes and to track the technology development trajectories of companies.
+
+ comment: Accepted by the TKDE journal +
+
+
+
+
+ + ♻ ☆ LLaMA-Reviewer: Advancing Code Review Automation with Large Language + Models through Parameter-Efficient Fine-Tuning + + +
+ The automation of code review activities, a long-standing pursuit in software +engineering, has been primarily addressed by numerous domain-specific +pre-trained models. Despite their success, these models frequently demand +extensive resources for pre-training from scratch. In contrast, Large Language +Models (LLMs) provide an intriguing alternative, given their remarkable +capabilities when supplemented with domain-specific knowledge. However, their +potential for automating code review tasks remains largely unexplored. + In response to this research gap, we present LLaMA-Reviewer, an innovative +framework that leverages the capabilities of LLaMA, a popular LLM, in the realm +of code review. Mindful of resource constraints, this framework employs +parameter-efficient fine-tuning (PEFT) methods, delivering high performance +while using less than 1% of trainable parameters. + An extensive evaluation of LLaMA-Reviewer is conducted on two diverse, +publicly available datasets. Notably, even with the smallest LLaMA base model +consisting of 6.7B parameters and a limited number of tuning epochs, +LLaMA-Reviewer equals the performance of existing code-review-focused models. + The ablation experiments provide insights into the influence of various +fine-tuning process components, including input representation, instruction +tuning, and different PEFT methods. To foster continuous progress in this +field, the code and all PEFT-weight plugins have been made open-source. + +
+
+ comment: Accepted to the 34th IEEE International Symposium on Software + Reliability Engineering (ISSRE 2023) +
+
+
+
+
+ + ♻ ☆ Learning to Sample Tasks for Meta Learning + + +
+ Through experiments on various meta-learning methods, task samplers, and few-shot learning tasks, this paper arrives at three conclusions. Firstly, there is no universal task sampling strategy that guarantees the performance of meta-learning models. Secondly, task diversity can cause the models to either underfit or overfit during training. Lastly, the generalization performance of the models is influenced by task divergence, task entropy, and task difficulty. In response to these findings, we propose a novel task sampler called Adaptive Sampler (ASr). ASr is a plug-and-play task sampler that takes task divergence, task entropy, and task difficulty into account when sampling tasks. To optimize ASr, we rethink and propose a simple and general meta-learning algorithm. Finally, a large number of empirical experiments demonstrate the effectiveness of the proposed ASr.
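One way to picture a task sampler that conditions on such statistics (a hypothetical simplification, not the actual ASr model) is to turn per-task scores into sampling probabilities:

```python
# Hypothetical sketch: convert per-task divergence/entropy/difficulty scores into
# sampling probabilities via a weighted softmax (not the actual ASr sampler).
import numpy as np

def task_sampling_probs(divergence, entropy, difficulty, weights=(1.0, 1.0, 1.0)):
    scores = weights[0] * divergence + weights[1] * entropy + weights[2] * difficulty
    scores = scores - scores.max()                 # numerical stability
    probs = np.exp(scores) / np.exp(scores).sum()
    return probs

rng = np.random.default_rng(0)
n_tasks = 5
probs = task_sampling_probs(rng.random(n_tasks), rng.random(n_tasks), rng.random(n_tasks))
sampled_task = rng.choice(n_tasks, p=probs)
print(probs, sampled_task)
```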
+
+ comment: 10 pages, 7 tables, 3 figures +
+
+
+
+
+ + ♻ ☆ RS2G: Data-Driven Scene-Graph Extraction and Embedding for Robust + Autonomous Perception and Scenario Understanding + + +
+ Effectively capturing intricate interactions among road users is of critical +importance to achieving safe navigation for autonomous vehicles. While graph +learning (GL) has emerged as a promising approach to tackle this challenge, +existing GL models rely on predefined domain-specific graph extraction rules +that often fail in real-world drastically changing scenarios. Additionally, +these graph extraction rules severely impede the capability of existing GL +methods to generalize knowledge across domains. To address this issue, we +propose RoadScene2Graph (RS2G), an innovative autonomous scenario understanding +framework with a novel data-driven graph extraction and modeling approach that +dynamically captures the diverse relations among road users. Our evaluations +demonstrate that on average RS2G outperforms the state-of-the-art (SOTA) +rule-based graph extraction method by 4.47% and the SOTA deep learning model by +22.19% in subjective risk assessment. More importantly, RS2G delivers notably +better performance in transferring knowledge gained from simulation +environments to unseen real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ Machine Learning Based IoT Adaptive Architecture for Epilepsy Seizure + Detection: Anatomy and Analysis + + +
+ A seizure tracking system is crucial for monitoring and evaluating epilepsy treatments. Caretaker seizure diaries are used in epilepsy care today, but clinical seizure monitoring may miss seizures. Wearable monitoring devices may be better tolerated and more suitable for long-term ambulatory use. Many techniques and methods have been proposed for seizure detection; however, simplicity and affordability are key requirements for daily use, while preserving detection accuracy. In this study, we propose a versatile, affordable, noninvasive system based on simple real-time k-Nearest-Neighbors (kNN) machine learning that can be customized and adapted to individual users in less than four seconds of training time. The system was verified and validated using 500 subjects, with seizure detection data sampled at 178 Hz, and operated with a mean accuracy of 94.5%.
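A kNN pipeline of the kind described fits in a few lines with scikit-learn (a generic sketch with synthetic data; the features, window length, and labels are placeholders, not the study's dataset):

```python
# Generic kNN seizure-detection sketch on synthetic feature windows
# (placeholder data; not the study's 500-subject dataset sampled at 178 Hz).
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 178))           # one-second windows of 178 samples each
y = rng.integers(0, 2, size=1000)          # 1 = seizure, 0 = non-seizure (synthetic)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)                  # "training" is just storing the data, hence fast
print("accuracy:", clf.score(X_test, y_test))
```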
+
+ comment: Under review, 5 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Acceleration in Policy Optimization + + +
+ We work towards a unifying paradigm for accelerating policy optimization +methods in reinforcement learning (RL) by integrating foresight in the policy +improvement step via optimistic and adaptive updates. Leveraging the connection +between policy iteration and policy gradient methods, we view policy +optimization algorithms as iteratively solving a sequence of surrogate +objectives, local lower bounds on the original objective. We define optimism as +predictive modelling of the future behavior of a policy, and adaptivity as +taking immediate and anticipatory corrective actions to mitigate accumulating +errors from overshooting predictions or delayed responses to change. We use +this shared lens to jointly express other well-known algorithms, including +model-based policy improvement based on forward search, and optimistic +meta-learning algorithms. We analyze properties of this formulation, and show +connections to other accelerated optimization algorithms. Then, we design an +optimistic policy gradient algorithm, adaptive via meta-gradient learning, and +empirically highlight several design choices pertaining to acceleration, in an +illustrative task. + +
+
+
+
+
+ + ♻ ☆ Fed-CPrompt: Contrastive Prompt for Rehearsal-Free Federated Continual + Learning ICML 2023 + + +
+ Federated continual learning (FCL) learns incremental tasks over time from +confidential datasets distributed across clients. This paper focuses on +rehearsal-free FCL, which has severe forgetting issues when learning new tasks +due to the lack of access to historical task data. To address this issue, we +propose Fed-CPrompt based on prompt learning techniques to obtain task-specific +prompts in a communication-efficient way. Fed-CPrompt introduces two key +components, asynchronous prompt learning, and contrastive continual loss, to +handle asynchronous task arrival and heterogeneous data distributions in FCL, +respectively. Extensive experiments demonstrate the effectiveness of +Fed-CPrompt in achieving SOTA rehearsal-free FCL performance. + +
+
+ comment: Accepted by FL-ICML 2023 +
+
+
+
+
+ + ♻ ☆ EdgeServe: A Streaming System for Decentralized Model Serving + + +
+ The relevant features for a machine learning task may be aggregated from data +sources collected on different nodes in a network. This problem, which we call +decentralized prediction, creates a number of interesting systems challenges in +managing data routing, placing computation, and time-synchronization. This +paper presents EdgeServe, a machine learning system that can serve +decentralized predictions. EdgeServe relies on a low-latency message broker to +route data through a network to nodes that can serve predictions. EdgeServe +relies on a series of novel optimizations that can tradeoff computation, +communication, and accuracy. We evaluate EdgeServe on three decentralized +prediction tasks: (1) multi-camera object tracking, (2) network intrusion +detection, and (3) human activity recognition. + +
+
+ comment: 13 pages, 12 figures; added experiments +
+
+
+
+
+ + ♻ ☆ Revisiting Adversarial Attacks on Graph Neural Networks for Graph + Classification + + +
+ Graph neural networks (GNNs) have achieved tremendous success in the task of +graph classification and its diverse downstream real-world applications. +Despite the huge success in learning graph representations, current GNN models +have demonstrated their vulnerability to potentially existent adversarial +examples on graph-structured data. Existing approaches are either limited to +structure attacks or restricted to local information, urging for the design of +a more general attack framework on graph classification, which faces +significant challenges due to the complexity of generating local-node-level +adversarial examples using the global-graph-level information. To address this +"global-to-local" attack challenge, we present a novel and general framework to +generate adversarial examples via manipulating graph structure and node +features. Specifically, we make use of Graph Class Activation Mapping and its +variant to produce node-level importance corresponding to the graph +classification task. Then through a heuristic design of algorithms, we can +perform both feature and structure attacks under unnoticeable perturbation +budgets with the help of both node-level and subgraph-level importance. +Experiments towards attacking four state-of-the-art graph classification models +on six real-world benchmarks verify the flexibility and effectiveness of our +framework. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ T Cell Receptor Protein Sequences and Sparse Coding: A Novel Approach to + Cancer Classification ICONIP 2023 + + +
+ Cancer is a complex disease characterized by uncontrolled cell growth and +proliferation. T cell receptors (TCRs) are essential proteins for the adaptive +immune system, and their specific recognition of antigens plays a crucial role +in the immune response against diseases, including cancer. The diversity and +specificity of TCRs make them ideal for targeting cancer cells, and recent +advancements in sequencing technologies have enabled the comprehensive +profiling of TCR repertoires. This has led to the discovery of TCRs with potent +anti-cancer activity and the development of TCR-based immunotherapies. In this +study, we investigate the use of sparse coding for the multi-class +classification of TCR protein sequences with cancer categories as target +labels. Sparse coding is a popular technique in machine learning that enables +the representation of data with a set of informative features and can capture +complex relationships between amino acids and identify subtle patterns in the +sequence that might be missed by low-dimensional methods. We first compute the +k-mers from the TCR sequences and then apply sparse coding to capture the +essential features of the data. To improve the predictive performance of the +final embeddings, we integrate domain knowledge regarding different types of +cancer properties. We then train different machine learning (linear and +non-linear) classifiers on the embeddings of TCR sequences for the purpose of +supervised analysis. Our proposed embedding method on a benchmark dataset of +TCR sequences significantly outperforms the baselines in terms of predictive +performance, achieving an accuracy of 99.8\%. Our study highlights the +potential of sparse coding for the analysis of TCR protein sequences in cancer +research and other related fields. + +
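The k-mer plus sparse coding pipeline can be sketched generically (illustrative only; the toy sequences and dictionary size are placeholders, not the paper's benchmark or its exact embedding method):

```python
# Sketch: count k-mers in toy TCR-like sequences, then learn a sparse code
# over the k-mer vectors (illustrative; not the paper's exact pipeline).
from itertools import product
import numpy as np
from sklearn.decomposition import DictionaryLearning

AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
K = 2
KMERS = ["".join(p) for p in product(AMINO_ACIDS, repeat=K)]
KMER_INDEX = {kmer: i for i, kmer in enumerate(KMERS)}

def kmer_counts(seq, k=K):
    vec = np.zeros(len(KMERS))
    for i in range(len(seq) - k + 1):
        vec[KMER_INDEX[seq[i:i + k]]] += 1
    return vec

sequences = ["CASSLGQAYEQYF", "CASSPGTGGTDTQYF", "CASRRGSSYEQYF", "CASSQETQYF"]  # toy examples
X = np.stack([kmer_counts(s) for s in sequences])

coder = DictionaryLearning(n_components=3, alpha=1.0, random_state=0)
embeddings = coder.fit_transform(X)        # sparse codes used as sequence features
print(embeddings.shape)
```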
+
+ comment: Accepted at ICONIP 2023 +
+
+
+
+
+ + ♻ ☆ Verification against in-situ observations for Data-Driven Weather + Prediction NeurIPS + + +
+ Data-driven weather prediction models (DDWPs) have made rapid strides in +recent years, demonstrating an ability to approximate Numerical Weather +Prediction (NWP) models to a high degree of accuracy. The fast, accurate, and +low-cost DDWP forecasts make their use in operational forecasting an attractive +proposition, however, there remains work to be done in rigorously evaluating +DDWPs in a true operational setting. Typically trained and evaluated using ERA5 +reanalysis data, DDWPs have been tested only in a simulation, which cannot +represent the real world with complete accuracy even if it is of a very high +quality. The safe use of DDWPs in operational forecasting requires more +thorough "real-world" verification, as well as a careful examination of how +DDWPs are currently trained and evaluated. It is worth asking, for instance, +how well do the reanalysis datasets, used for training, simulate the real +world? With an eye towards climate justice and the uneven availability of +weather data: is the simulation equally good for all regions of the world, and +would DDWPs exacerbate biases present in the training data? Does a good +performance in simulation correspond to good performance in operational +settings? In addition to approximating the physics of NWP models, how can ML be +uniquely deployed to provide more accurate weather forecasts? As a first step +towards answering such questions, we present a robust dataset of in-situ +observations derived from the NOAA MADIS program to serve as a benchmark to +validate DDWPs in an operational setting. By providing a large corpus of +quality-controlled, in-situ observations, this dataset provides a meaningful +real-world task that all NWPs and DDWPs can be tested against. We hope that +this data can be used not only to rigorously and fairly compare operational +weather models but also to spur future research in new directions. + +
+
+ comment: 10 pages, 6 figures, under review at NeurIPS main conference +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Prototype-based Dataset Comparison ICCV 2023 + + +
+ Dataset summarisation is a fruitful approach to dataset inspection. However, +when applied to a single dataset the discovery of visual concepts is restricted +to those most prominent. We argue that a comparative approach can expand upon +this paradigm to enable richer forms of dataset inspection that go beyond the +most prominent concepts. To enable dataset comparison we present a module that +learns concept-level prototypes across datasets. We leverage self-supervised +learning to discover these prototypes without supervision, and we demonstrate +the benefits of our approach in two case-studies. Our findings show that +dataset comparison extends dataset inspection and we hope to encourage more +works in this direction. Code and usage instructions available at +https://github.com/Nanne/ProtoSim + +
+
+ comment: To be presented at ICCV 2023 +
+
+
+
+
+ + ☆ Hybrid Design of Multiplicative Watermarking for Defense Against + Malicious Parameter Identification + + +
+ Watermarking is a promising active diagnosis technique for detection of +highly sophisticated attacks, but is vulnerable to malicious agents that use +eavesdropped data to identify and then remove or replicate the watermark. In +this work, we propose a hybrid multiplicative watermarking (HMWM) scheme, where +the watermark parameters are periodically updated, following the dynamics of +the unobservable states of specifically designed piecewise affine (PWA) hybrid +systems. We provide a theoretical analysis of the effects of this scheme on the +closed-loop performance, and prove that stability properties are preserved. +Additionally, we show that the proposed approach makes it difficult for an +eavesdropper to reconstruct the watermarking parameters, both in terms of the +associated computational complexity and from a systems theoretic perspective. + +
+
+ comment: 8 pages, first submission to the 62nd IEEE Conference on Decision and + Control +
+
+
+
+
+ + ☆ Exploring the Intersection of Complex Aesthetics and Generative AI for + Promoting Cultural Creativity in Rural China after the Post-Pandemic Era + + +
+ This paper explores using generative AI and aesthetics to promote cultural +creativity in rural China amidst COVID-19's impact. Through literature reviews, +case studies, surveys, and text analysis, it examines art and technology +applications in rural contexts and identifies key challenges. The study finds +artworks often fail to resonate locally, while reliance on external artists +limits sustainability. Hence, nurturing grassroots "artist villagers" through +AI is proposed. Our approach involves training machine learning on subjective +aesthetics to generate culturally relevant content. Interactive AI media can +also boost tourism while preserving heritage. This pioneering research puts +forth original perspectives on the intersection of AI and aesthetics to +invigorate rural culture. It advocates holistic integration of technology and +emphasizes AI's potential as a creative enabler versus replacement. Ultimately, +it lays the groundwork for further exploration of leveraging AI innovations to +empower rural communities. This timely study contributes to growing interest in +emerging technologies to address critical issues facing rural China. + +
+
+ comment: Accepted by 2023 the 1st International Conference on AI-generated + Content (AIGC2023) +
+
+
+
+
+ + ☆ Towards Diverse and Consistent Typography Generation + + +
+ In this work, we consider the typography generation task that aims at +producing diverse typographic styling for the given graphic document. We +formulate typography generation as a fine-grained attribute generation for +multiple text elements and build an autoregressive model to generate diverse +typography that matches the input design context. We further propose a simple +yet effective sampling approach that respects the consistency and distinction +principle of typography so that generated examples share consistent typographic +styling across text elements. Our empirical study shows that our model +successfully generates diverse typographic designs while preserving a +consistent typographic structure. + +
+
+
+
+
+ + ☆ A Survey on Interpretable Cross-modal Reasoning + + +
+ In recent years, cross-modal reasoning (CMR), the process of understanding +and reasoning across different modalities, has emerged as a pivotal area with +applications spanning from multimedia analysis to healthcare diagnostics. As +the deployment of AI systems becomes more ubiquitous, the demand for +transparency and comprehensibility in these systems' decision-making processes +has intensified. This survey delves into the realm of interpretable cross-modal +reasoning (I-CMR), where the objective is not only to achieve high predictive +performance but also to provide human-understandable explanations for the +results. This survey presents a comprehensive overview of the typical methods +with a three-level taxonomy for I-CMR. Furthermore, this survey reviews the +existing CMR datasets with annotations for explanations. Finally, this survey +summarizes the challenges for I-CMR and discusses potential future directions. +In conclusion, this survey aims to catalyze the progress of this emerging +research area by providing researchers with a panoramic and comprehensive +perspective, illuminating the state of the art and discerning the +opportunities. + +
+
+
+
+
+ + ☆ Gradient Domain Diffusion Models for Image Synthesis + + +
+ Diffusion models are getting popular in generative image and video synthesis. +However, due to the diffusion process, they require a large number of steps to +converge. To tackle this issue, in this paper, we propose to perform the +diffusion process in the gradient domain, where the convergence becomes faster. +There are two reasons. First, thanks to the Poisson equation, the gradient +domain is mathematically equivalent to the original image domain. Therefore, +each diffusion step in the image domain has a unique corresponding gradient +domain representation. Second, the gradient domain is much sparser than the +image domain. As a result, gradient domain diffusion models converge faster. +Several numerical experiments confirm that the gradient domain diffusion models +are more efficient than the original diffusion models. The proposed method can +be applied in a wide range of applications such as image processing, computer +vision and machine learning tasks. + +
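The gradient-domain representation this abstract relies on can be illustrated independently of any diffusion model (a sketch assuming a grayscale image array; sparsity is shown simply by counting near-zero entries):

```python
# Sketch: forward-difference gradients of an image are much sparser than the
# image itself, which is the property the gradient-domain idea exploits.
# (Illustrative only; this is not the paper's model.)
import numpy as np

def image_gradients(img):
    gx = np.zeros_like(img)
    gy = np.zeros_like(img)
    gx[:, :-1] = img[:, 1:] - img[:, :-1]   # horizontal forward differences
    gy[:-1, :] = img[1:, :] - img[:-1, :]   # vertical forward differences
    return gx, gy

img = np.full((64, 64), 0.5)
img[16:48, 16:48] = 1.0                      # a bright square on a gray background
gx, gy = image_gradients(img)
near_zero = lambda a: np.mean(np.abs(a) < 1e-3)
print(f"near-zero fraction: image {near_zero(img):.2f}, gradients {near_zero(np.hypot(gx, gy)):.2f}")
```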
+
+
+
+
+ + ☆ Symbolic Music Representations for Classification Tasks: A Systematic + Evaluation + + +
+ Music Information Retrieval (MIR) has seen a recent surge in deep +learning-based approaches, which often involve encoding symbolic music (i.e., +music represented in terms of discrete note events) in an image-like or +language-like fashion. However, symbolic music is neither an image nor a +sentence, and research in the symbolic domain lacks a comprehensive overview of +the different available representations. In this paper, we investigate matrix +(piano roll), sequence, and graph representations and their corresponding +neural architectures, in combination with symbolic scores and performances on +three piece-level classification tasks. We also introduce a novel graph +representation for symbolic performances and explore the capability of graph +representations in global classification tasks. Our systematic evaluation shows +advantages and limitations of each input representation. Our results suggest +that the graph representation, as the newest and least explored among the three +approaches, exhibits promising performance, while being more lightweight in +training. + 
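As a concrete reference point for the matrix (piano-roll) representation compared above, here is a minimal sketch assuming note events are already available as hypothetical (onset_sec, duration_sec, midi_pitch) tuples; real symbolic formats such as MIDI or MusicXML would first be parsed into this form.

    import numpy as np

    def to_piano_roll(notes, time_step=0.05, n_pitches=128):
        """Encode symbolic note events as a binary piano-roll matrix.

        `notes` is assumed to be a list of (onset_sec, duration_sec, midi_pitch)
        tuples; the time axis is discretized into fixed-size steps.
        """
        total = max(onset + dur for onset, dur, _ in notes)
        n_steps = int(np.ceil(total / time_step))
        roll = np.zeros((n_pitches, n_steps), dtype=np.float32)
        for onset, dur, pitch in notes:
            start = int(round(onset / time_step))
            end = max(start + 1, int(round((onset + dur) / time_step)))
            roll[pitch, start:end] = 1.0
        return roll

    # Toy example: a C major triad followed by a single higher note.
    notes = [(0.0, 0.5, 60), (0.0, 0.5, 64), (0.0, 0.5, 67), (0.5, 0.25, 72)]
    roll = to_piano_roll(notes)
    print(roll.shape, int(roll.sum()))   # (128, 15) 35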
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 42 + +
+
+
+ + ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for the addition of multi-modal +information with an existing pipeline for continuous sign language recognition +and translation. In our procedure, we have incorporated optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we have used is very lightweight and doesn't +need to include a separate feature extractor for the new modality in an +end-to-end manner. We have applied the changes in both sign language +recognition and translation, improving the result in each case. We have +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. + +
+
+
+
+
+ + ☆ One Wide Feedforward is All You Need + + +
+ The Transformer architecture has two main non-embedding components: Attention +and the Feed Forward Network (FFN). Attention captures interdependencies +between words regardless of their position, while the FFN non-linearly +transforms each input token independently. In this work we explore the role of +the FFN, and find that despite taking up a significant fraction of the model's +parameters, it is highly redundant. Concretely, we are able to substantially +reduce the number of parameters with only a modest drop in accuracy by removing +the FFN on the decoder layers and sharing a single FFN across the encoder. +Finally we scale this architecture back to its original size by increasing the +hidden dimension of the shared FFN, achieving substantial gains in both +accuracy and latency with respect to the original Transformer Big. + +
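A minimal PyTorch sketch of the parameter-sharing idea described above (not the paper's code; all dimensions are illustrative): every encoder layer keeps its own attention block but reuses a single, wider feed-forward network.

    import torch
    import torch.nn as nn

    class SharedFFNEncoder(nn.Module):
        """Transformer encoder in which every layer reuses one wide FFN."""

        def __init__(self, d_model=512, n_heads=8, n_layers=6, d_ff_shared=8192):
            super().__init__()
            self.attn_layers = nn.ModuleList(
                nn.MultiheadAttention(d_model, n_heads, batch_first=True)
                for _ in range(n_layers)
            )
            self.norms1 = nn.ModuleList(nn.LayerNorm(d_model) for _ in range(n_layers))
            self.norms2 = nn.ModuleList(nn.LayerNorm(d_model) for _ in range(n_layers))
            # One wide feed-forward block shared by all layers.
            self.shared_ffn = nn.Sequential(
                nn.Linear(d_model, d_ff_shared), nn.ReLU(), nn.Linear(d_ff_shared, d_model)
            )

        def forward(self, x):
            for attn, norm1, norm2 in zip(self.attn_layers, self.norms1, self.norms2):
                a, _ = attn(x, x, x)
                x = norm1(x + a)
                x = norm2(x + self.shared_ffn(x))   # same FFN parameters at every layer
            return x

    tokens = torch.randn(2, 16, 512)                # (batch, sequence, d_model)
    print(SharedFFNEncoder()(tokens).shape)         # torch.Size([2, 16, 512])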
+
+
+
+
+ + ☆ Into the Single Cell Multiverse: an End-to-End Dataset for Procedural + Knowledge Extraction in Biomedical Texts NeurIPS 2023 + + +
+ Many of the most commonly explored natural language processing (NLP) +information extraction tasks can be thought of as evaluations of declarative +knowledge, or fact-based information extraction. Procedural knowledge +extraction, i.e., breaking down a described process into a series of steps, has +received much less attention, perhaps in part due to the lack of structured +datasets that capture the knowledge extraction process from end-to-end. To +address this unmet need, we present FlaMBé (Flow annotations for Multiverse +Biological entities), a collection of expert-curated datasets across a series +of complementary tasks that capture procedural knowledge in biomedical texts. +This dataset is inspired by the observation that one ubiquitous source of +procedural knowledge that is described as unstructured text is within academic +papers describing their methodology. The workflows annotated in FlaMBé are +from texts in the burgeoning field of single cell research, a research area +that has become notorious for the number of software tools and complexity of +workflows used. Additionally, FlaMBé provides, to our knowledge, the largest +manually curated named entity recognition (NER) and disambiguation (NED) +datasets for tissue/cell type, a fundamental biological entity that is critical +for knowledge extraction in the biomedical research domain. Beyond providing a +valuable dataset to enable further development of NLP models for procedural +knowledge extraction, automating the process of workflow mining also has +important implications for advancing reproducibility in biomedical research. + 
+
+ comment: Submitted to NeurIPS 2023 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Are Emergent Abilities in Large Language Models just In-Context + Learning? + + +
+ Large language models have exhibited emergent abilities, demonstrating +exceptional performance across diverse tasks for which they were not explicitly +trained, including those that require complex reasoning abilities. The +emergence of such abilities carries profound implications for the future +direction of research in NLP, especially as the deployment of such models +becomes more prevalent. However, one key challenge is that the evaluation of +these abilities is often confounded by competencies that arise in models +through alternative prompting techniques, such as in-context learning and +instruction following, which also emerge as the models are scaled up. In this +study, we provide the first comprehensive examination of these emergent +abilities while accounting for various potentially biasing factors that can +influence the evaluation of models. We conduct rigorous tests on a set of 18 +models, encompassing a parameter range from 60 million to 175 billion +parameters, across a comprehensive set of 22 tasks. Through an extensive series +of over 1,000 experiments, we provide compelling evidence that emergent +abilities can primarily be ascribed to in-context learning. We find no evidence +for the emergence of reasoning abilities, thus providing valuable insights into +the underlying mechanisms driving the observed abilities and thus alleviating +safety concerns regarding their use. + +
+
+ comment: Code available at https://github.com/UKPLab/on-emergence and data + available at https://tudatalib.ulb.tu-darmstadt.de/handle/tudatalib/3931 +
+
+
+
+
+ + ☆ An Empirical Analysis for Zero-Shot Multi-Label Classification on + COVID-19 CT Scans and Uncurated Reports ICCV + + +
+ The pandemic resulted in vast repositories of unstructured data, including +radiology reports, due to increased medical examinations. Previous research on +automated diagnosis of COVID-19 primarily focuses on X-ray images, despite +their lower precision compared to computed tomography (CT) scans. In this work, +we leverage unstructured data from a hospital and harness the fine-grained +details offered by CT scans to perform zero-shot multi-label classification +based on contrastive visual language learning. In collaboration with human +experts, we investigate the effectiveness of multiple zero-shot models that aid +radiologists in detecting pulmonary embolisms and identifying intricate lung +details like ground glass opacities and consolidations. Our empirical analysis +provides an overview of the possible solutions to target such fine-grained +tasks, so far overlooked in the medical multimodal pretraining literature. Our +investigation promises future advancements in the medical image analysis +community by addressing some challenges associated with unstructured data and +fine-grained multi-label classification. + +
+
+ comment: 10 pages, 3 figures, Proceedings of the IEEE/CVF International + Conference on Computer Vision (ICCV) Workshops 2023 +
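A minimal sketch of the zero-shot multi-label scoring pattern described above, using random placeholder embeddings; in practice the vectors would come from the image and text towers of a contrastive vision-language model adapted to CT data. The essential difference from single-label zero-shot classification is the independent per-finding sigmoid instead of a softmax over classes.

    import torch
    import torch.nn.functional as F

    # Placeholder embeddings standing in for the outputs of a CLIP-style model.
    image_emb = F.normalize(torch.randn(1, 512), dim=-1)            # one CT study
    findings = ["pulmonary embolism", "ground glass opacity", "consolidation"]
    prompt_embs = F.normalize(torch.randn(len(findings), 512), dim=-1)

    # Multi-label zero-shot scoring: one independent similarity per finding.
    logit_scale = 100.0                                              # CLIP-like temperature
    scores = torch.sigmoid(logit_scale * image_emb @ prompt_embs.T).squeeze(0)
    predicted = [f for f, s in zip(findings, scores) if s > 0.5]
    print(dict(zip(findings, scores.tolist())), predicted)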
+
+
+
+
+ + ☆ Interdisciplinary Fairness in Imbalanced Research Proposal Topic + Inference: A Hierarchical Transformer-based Method with Selective + Interpolation + + +
+ Topic inference for research proposals aims to obtain the most suitable +disciplinary division from the discipline system defined by a funding agency. +The agency will subsequently find appropriate peer review experts from their +database based on this division. Automated topic inference can reduce human +errors caused by manual topic filling, bridge the knowledge gap between funding +agencies and project applicants, and improve system efficiency. Existing +methods focus on modeling this as a hierarchical multi-label classification +problem, using generative models to iteratively infer the most appropriate +topic information. However, these methods overlook the gap in scale between +interdisciplinary research proposals and non-interdisciplinary ones, leading to +an unjust phenomenon where the automated inference system categorizes +interdisciplinary proposals as non-interdisciplinary, causing unfairness during +the expert assignment. How can we address this data imbalance issue under a +complex discipline system and hence resolve this unfairness? In this paper, we +implement a topic label inference system based on a Transformer encoder-decoder +architecture. Furthermore, we utilize interpolation techniques to create a +series of pseudo-interdisciplinary proposals from non-interdisciplinary ones +during training, based on non-parametric indicators such as cross-topic +probabilities and topic occurrence probabilities. This approach aims to reduce +the bias of the system during model training. Finally, we conduct extensive +experiments on a real-world dataset to verify the effectiveness of the proposed +method. The experimental results demonstrate that our training strategy can +significantly mitigate the unfairness generated in the topic inference task. + 
+
+ comment: 19 pages, Under review. arXiv admin note: text overlap with + arXiv:2209.13912 +
+
+
+
+
+ + ☆ Prompting or Fine-tuning? A Comparative Study of Large Language Models + for Taxonomy Construction + + +
+ Taxonomies represent hierarchical relations between entities, frequently +applied in various software modeling and natural language processing (NLP) +activities. They are typically subject to a set of structural constraints +restricting their content. However, manual taxonomy construction can be +time-consuming, incomplete, and costly to maintain. Recent studies of large +language models (LLMs) have demonstrated that appropriate user inputs (called +prompting) can effectively guide LLMs, such as GPT-3, in diverse NLP tasks +without explicit (re-)training. However, existing approaches for automated +taxonomy construction typically involve fine-tuning a language model by +adjusting model parameters. In this paper, we present a general framework for +taxonomy construction that takes into account structural constraints. We +subsequently conduct a systematic comparison between the prompting and +fine-tuning approaches performed on a hypernym taxonomy and a novel computer +science taxonomy dataset. Our result reveals the following: (1) Even without +explicit training on the dataset, the prompting approach outperforms +fine-tuning-based approaches. Moreover, the performance gap between prompting +and fine-tuning widens when the training dataset is small. However, (2) +taxonomies generated by the fine-tuning approach can be easily post-processed +to satisfy all the constraints, whereas handling violations of the taxonomies +produced by the prompting approach can be challenging. These evaluation +findings provide guidance on selecting the appropriate method for taxonomy +construction and highlight potential enhancements for both approaches. + +
+
+ comment: Accepted by MDE Intelligence 2023 +
+
+
+
+
+ + ☆ MathAttack: Attacking Large Language Models Towards Math Solving Ability + + +
+ With the boom of Large Language Models (LLMs), research on solving Math Word +Problems (MWPs) has recently made great progress. However, there are few +studies examining the security of LLMs in math solving ability. Instead of +attacking prompts in the use of LLMs, we propose a MathAttack model to attack +MWP samples, which is closer to the essence of security in solving math +problems. Compared to traditional text adversarial attacks, it is essential to +preserve the mathematical logic of the original MWPs during the attack. To this +end, we propose logical entity recognition to identify logical entities, which +are then frozen. Subsequently, the remaining text is attacked by adopting a +word-level attacker. Furthermore, we propose a new dataset, RobustMath, to +evaluate the robustness of LLMs in math solving ability. Extensive experiments +on our RobustMath and two other math benchmark datasets, GSM8K and MultiArith, +show that MathAttack can effectively attack the math solving ability of LLMs. +In the experiments, we observe that (1) our adversarial samples from +higher-accuracy LLMs are also effective for attacking LLMs with lower accuracy +(e.g., transfer from larger to smaller-size LLMs, or from few-shot to zero-shot +prompts); (2) complex MWPs (such as more solving steps, longer text, more +numbers) are more vulnerable to attack; (3) we can improve the robustness of +LLMs by using our adversarial samples in few-shot prompts. Finally, we hope our +practice and observations can serve as an important attempt towards enhancing +the robustness of LLMs in math solving ability. We will release our code and +dataset. + 
+
+ comment: 11 pages, 6 figures +
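A toy sketch of the freeze-then-attack recipe described above: numerals stand in for the paper's logical entities, and a hard-coded paraphrase table stands in for a real word-level attacker that would search for substitutions degrading the victim LLM's answer. All names here are illustrative.

    import re

    NUMBER = re.compile(r"\d+(?:\.\d+)?")

    def freeze_logical_entities(mwp):
        """Replace numerals (a stand-in for logical entities) with placeholders
        so that a word-level attacker cannot alter them."""
        frozen = {}
        def _store(match):
            key = f"<NUM{len(frozen)}>"
            frozen[key] = match.group(0)
            return key
        return NUMBER.sub(_store, mwp), frozen

    def restore_entities(text, frozen):
        for key, value in frozen.items():
            text = text.replace(key, value)
        return text

    # Toy word-level "attack": a fixed paraphrase table; a real attacker would
    # search for substitutions that change the victim model's solution.
    PARAPHRASE = {"bought": "purchased", "gave": "handed"}

    mwp = "Tom bought 12 apples and gave 5 to Anna. How many are left?"
    masked, frozen = freeze_logical_entities(mwp)
    attacked = " ".join(PARAPHRASE.get(w, w) for w in masked.split())
    print(restore_entities(attacked, frozen))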
+
+
+
+
+ + ☆ CRUISE-Screening: Living Literature Reviews Toolbox CIKM 2023 + + +
+ Keeping up with research and finding related work is still a time-consuming +task for academics. Researchers sift through thousands of studies to identify a +few relevant ones. Automation techniques can help by increasing the efficiency +and effectiveness of this task. To this end, we developed CRUISE-Screening, a +web-based application for conducting living literature reviews - a type of +literature review that is continuously updated to reflect the latest research +in a particular field. CRUISE-Screening is connected to several search engines +via an API, which allows for updating the search results periodically. +Moreover, it can facilitate the process of screening for relevant publications +by using text classification and question answering models. CRUISE-Screening +can be used both by researchers conducting literature reviews and by those +working on automating the citation screening process to validate their +algorithms. The application is open-source: +https://github.com/ProjectDoSSIER/cruise-screening, and a demo is available +under this URL: https://citation-screening.ec.tuwien.ac.at. We discuss the +limitations of our tool in Appendix A. + +
+
+ comment: Paper accepted at CIKM 2023. The arXiv version has an extra section + about limitations in the Appendix that is not present in the ACM version +
+
+
+
+
+ + ☆ Donkii: Can Annotation Error Detection Methods Find Errors in + Instruction-Tuning Datasets? + + +
+ Instruction-tuning has become an integral part of training pipelines for +Large Language Models (LLMs) and has been shown to yield strong performance +gains. In an orthogonal line of research, Annotation Error Detection (AED) has +emerged as a tool for detecting quality issues of gold-standard labels. But so +far, the application of AED methods is limited to discriminative settings. It +is an open question how well AED methods generalize to generative settings +which are becoming widespread via generative LLMs. In this work, we present a +first and new benchmark for AED on instruction-tuning data: Donkii. It +encompasses three instruction-tuning datasets enriched with annotations by +experts and semi-automatic methods. We find that all three datasets contain +clear-cut errors that sometimes directly propagate into instruction-tuned LLMs. +We propose four AED baselines for the generative setting and evaluate them +comprehensively on the newly introduced dataset. Our results demonstrate that +choosing the right AED method and model size is indeed crucial, thereby +deriving practical recommendations. To gain insights, we provide a first +case-study to examine how the quality of the instruction-tuning datasets +influences downstream performance. + +
+
+
+
+
+ + ☆ Fine-grained Affective Processing Capabilities Emerging from Large + Language Models + + +
+ Large language models, in particular generative pre-trained transformers +(GPTs), show impressive results on a wide variety of language-related tasks. In +this paper, we explore ChatGPT's zero-shot ability to perform affective +computing tasks using prompting alone. We show that ChatGPT a) performs +meaningful sentiment analysis in the Valence, Arousal and Dominance dimensions, +b) has meaningful emotion representations in terms of emotion categories and +these affective dimensions, and c) can perform basic appraisal-based emotion +elicitation of situations based on a prompt-based computational implementation +of the OCC appraisal model. These findings are highly relevant: First, they +show that the ability to solve complex affect processing tasks emerges from +language-based token prediction trained on extensive data sets. Second, they +show the potential of large language models for simulating, processing and +analyzing human emotions, which has important implications for various +applications such as sentiment analysis, socially interactive agents, and +social robotics. + +
+
+
+
+
+ + ☆ Unveiling Theory of Mind in Large Language Models: A Parallel to Single + Neurons in the Human Brain + + +
+ With their recent development, large language models (LLMs) have been found +to exhibit a certain level of Theory of Mind (ToM), a complex cognitive +capacity that is related to our conscious mind and that allows us to infer +another's beliefs and perspective. While human ToM capabilities are believed to +derive from the neural activity of a broadly interconnected brain network, +including that of dorsal medial prefrontal cortex (dmPFC) neurons, the precise +processes underlying LLM's capacity for ToM or their similarities with that of +humans remains largely unknown. In this study, we drew inspiration from the +dmPFC neurons subserving human ToM and employed a similar methodology to +examine whether LLMs exhibit comparable characteristics. Surprisingly, our +analysis revealed a striking resemblance between the two, as hidden embeddings +(artificial neurons) within LLMs started to exhibit significant responsiveness +to either true- or false-belief trials, suggesting their ability to represent +another's perspective. These artificial embedding responses were closely +correlated with the LLMs' performance during the ToM tasks, a property that was +dependent on the size of the models. Further, the other's beliefs could be +accurately decoded using the entire embeddings, indicating the presence of the +embeddings' ToM capability at the population level. Together, our findings +revealed an emergent property of LLMs' embeddings that modified their +activities in response to ToM features, offering initial evidence of a parallel +between the artificial model and neurons in the human brain. + +
+
+
+
+
+ + ☆ Evolving linguistic divergence on polarizing social media + + +
+ Language change is influenced by many factors, but often starts from +synchronic variation, where multiple linguistic patterns or forms coexist, or +where different speech communities use language in increasingly different ways. +Besides regional or economic reasons, communities may form and segregate based +on political alignment. The latter, referred to as political polarization, is +of growing societal concern across the world. Here we map and quantify +linguistic divergence across the partisan left-right divide in the United +States, using social media data. We develop a general methodology to delineate +(social) media users by their political preference, based on which (potentially +biased) news media accounts they do and do not follow on a given platform. Our +data consists of 1.5M short posts by 10k users (about 20M words) from the +social media platform Twitter (now "X"). Delineating this sample involved +mining the platform for the lists of followers (n=422M) of 72 large news media +accounts. We quantify divergence in topics of conversation and word +frequencies, messaging sentiment, and lexical semantics of words and emoji. We +find signs of linguistic divergence across all these aspects, especially in +topics and themes of conversation, in line with previous research. While US +American English remains largely intelligible within its large speech +community, our findings point at areas where miscommunication may eventually +arise given ongoing polarization and therefore potential linguistic divergence. +Our methodology - combining data mining, lexicostatistics, machine learning, +large language models and a systematic human annotation approach - is largely +language and platform agnostic. In other words, while we focus here on US +political divides and US English, the same approach is applicable to other +countries, languages, and social media platforms. + +
+
+
+
+
+ + ☆ Exploring the effectiveness of ChatGPT-based feedback compared with + teacher feedback and self-feedback: Evidence from Chinese to English + translation + + +
+ ChatGPT, a cutting-edge AI-powered chatbot, can quickly generate responses to +given commands. While it has been reported that ChatGPT has the capacity to +deliver useful feedback, its effectiveness compared with conventional feedback +approaches, such as teacher feedback (TF) and self-feedback (SF), remains +unclear. To address this issue, this study compared the revised Chinese to +English translation texts produced by Chinese Master of Translation and +Interpretation (MTI) students, who learned English as a Second/Foreign Language +(ESL/EFL), based on three feedback types (i.e., ChatGPT-based feedback, TF and +SF). The data was analyzed using the BLEU score to gauge the overall +translation quality as well as Coh-Metrix to examine linguistic features across +three dimensions: lexicon, syntax, and cohesion. The findings revealed that TF- +and SF-guided translation texts surpassed those with ChatGPT-based feedback, as +indicated by the BLEU score. In terms of linguistic features, ChatGPT-based +feedback demonstrated superiority, particularly in enhancing lexical capability +and referential cohesion in the translation texts. However, TF and SF proved +more effective in developing syntax-related skills, as they addressed instances +of incorrect usage of the passive voice. These diverse outcomes indicate +ChatGPT's potential as a supplementary resource, complementing traditional +teacher-led methods in translation practice. + 
+
+
+
+
+ + ☆ Critical Behavioral Traits Foster Peer Engagement in Online Mental + Health Communities + + +
+ Online Mental Health Communities (OMHCs), such as Reddit, have witnessed a +surge in popularity as go-to platforms for seeking information and support in +managing mental health needs. Platforms like Reddit offer immediate +interactions with peers, granting users a vital space for seeking mental health +assistance. However, the largely unregulated nature of these platforms +introduces intricate challenges for both users and society at large. This study +explores the factors that drive peer engagement within counseling threads, +aiming to enhance our understanding of this critical phenomenon. We introduce +BeCOPE, a novel behavior encoded Peer counseling dataset comprising over 10,118 +posts and 58,279 comments sourced from 21 mental health-specific subreddits. +The dataset is annotated using three major fine-grained behavior labels: (a) +intent, (b) criticism, and (c) readability, along with the emotion labels. Our +analysis indicates the prominence of ``self-criticism'' as the most prevalent +form of criticism expressed by help-seekers, accounting for a significant 43% +of interactions. Intriguingly, we observe that individuals who explicitly +express their need for help are 18.01% more likely to receive assistance +compared to those who present ``surveys'' or engage in ``rants.'' Furthermore, +we highlight the pivotal role of well-articulated problem descriptions, showing +that superior readability effectively doubles the likelihood of receiving the +sought-after support. Our study emphasizes the essential role of OMHCs in +offering personalized guidance and unveils behavior-driven engagement patterns. + +
+
+
+
+
+ + ☆ Geo-Encoder: A Chunk-Argument Bi-Encoder Framework for Chinese + Geographic Re-Ranking + + +
+ The Chinese geographic re-ranking task aims to find the most relevant +addresses among retrieved candidates, which is crucial for location-related +services such as navigation maps. Unlike general sentences, geographic contexts +are closely intertwined with geographical concepts, from general spans (e.g., +province) to specific spans (e.g., road). Given this feature, we propose an +innovative framework, namely Geo-Encoder, to more effectively integrate Chinese +geographical semantics into re-ranking pipelines. Our methodology begins by +employing off-the-shelf tools to associate text with geographical spans, +treating them as chunking units. Then, we present a multi-task learning module +to simultaneously acquire an effective attention matrix that determines chunk +contributions to extra semantic representations. Furthermore, we put forth an +asynchronous update mechanism for the proposed addition task, aiming to guide +the model to focus effectively on specific chunks. Experiments on two distinct +Chinese geographic re-ranking datasets show that the Geo-Encoder achieves +significant improvements when compared to state-of-the-art baselines. Notably, +it leads to a substantial improvement in the Hit@1 score of MGEO-BERT, +increasing it by 6.22% from 62.76 to 68.98 on the GeoTES dataset. + 
+
+
+
+
+ + ☆ A Comparative Analysis of Pretrained Language Models for Text-to-Speech ISCA + + +
+ State-of-the-art text-to-speech (TTS) systems have utilized pretrained +language models (PLMs) to enhance prosody and create more natural-sounding +speech. However, while PLMs have been extensively researched for natural +language understanding (NLU), their impact on TTS has been overlooked. In this +study, we aim to address this gap by conducting a comparative analysis of +different PLMs for two TTS tasks: prosody prediction and pause prediction. +Firstly, we trained a prosody prediction model using 15 different PLMs. Our +findings revealed a logarithmic relationship between model size and quality, as +well as significant performance differences between neutral and expressive +prosody. Secondly, we employed PLMs for pause prediction and found that the +task was less sensitive to small models. We also identified a strong +correlation between our empirical results and the GLUE scores obtained for +these language models. To the best of our knowledge, this is the first study of +its kind to investigate the impact of different PLMs on TTS. + +
+
+ comment: Accepted for presentation at the 12th ISCA Speech Synthesis Workshop + (SSW) in Grenoble, France, from 26th to 28th August 2023 +
+
+
+
+
+ + ☆ ChatRule: Mining Logical Rules with Large Language Models for Knowledge + Graph Reasoning + + +
+ Logical rules are essential for uncovering the logical connections between +relations, which could improve the reasoning performance and provide +interpretable results on knowledge graphs (KGs). Although there have been many +efforts to mine meaningful logical rules over KGs, existing methods suffer from +the computationally intensive searches over the rule space and a lack of +scalability for large-scale KGs. Besides, they often ignore the semantics of +relations which is crucial for uncovering logical connections. Recently, large +language models (LLMs) have shown impressive performance in the field of +natural language processing and various applications, owing to their emergent +ability and generalizability. In this paper, we propose a novel framework, +ChatRule, unleashing the power of large language models for mining logical +rules over knowledge graphs. Specifically, the framework is initiated with an +LLM-based rule generator, leveraging both the semantic and structural +information of KGs to prompt LLMs to generate logical rules. To refine the +generated rules, a rule ranking module estimates the rule quality by +incorporating facts from existing KGs. Last, a rule validator harnesses the +reasoning ability of LLMs to validate the logical correctness of ranked rules +through chain-of-thought reasoning. ChatRule is evaluated on four large-scale +KGs, w.r.t. different rule quality metrics and downstream tasks, showing the +effectiveness and scalability of our method. + +
+
+ comment: 11 pages, 4 figures +
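A toy sketch of the rule-ranking step described above, assuming a hypothetical two-hop rule format over (head, relation, tail) triples; the actual module would score LLM-proposed rules against a full KG rather than this hand-made example.

    from collections import defaultdict

    # Toy KG as (head_entity, relation, tail_entity) triples.
    triples = [
        ("alice", "born_in", "paris"), ("paris", "city_of", "france"),
        ("alice", "nationality", "france"),
        ("bob", "born_in", "berlin"), ("berlin", "city_of", "germany"),
        ("bob", "nationality", "germany"),
        ("carol", "born_in", "lyon"), ("lyon", "city_of", "france"),
    ]

    by_rel = defaultdict(set)
    for h, r, t in triples:
        by_rel[r].add((h, t))

    def rank_rule(body, head):
        """Support and confidence of the rule head(x,z) <- body[0](x,y), body[1](y,z)."""
        derived = {(x, z) for (x, y1) in by_rel[body[0]]
                          for (y2, z) in by_rel[body[1]] if y1 == y2}
        support = len(derived & by_rel[head])
        confidence = support / len(derived) if derived else 0.0
        return support, confidence

    # A rule an LLM might propose: nationality(x,z) <- born_in(x,y), city_of(y,z)
    print(rank_rule(("born_in", "city_of"), "nationality"))   # (2, 0.666...)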
+
+
+
+
+ + ☆ What are Public Concerns about ChatGPT? A Novel Self-Supervised Neural + Topic Model Tells You + + +
+ The recently released artificial intelligence conversational agent, ChatGPT, +has gained significant attention in academia and real life. A multitude of +early ChatGPT users eagerly explore its capabilities and share their opinions +on it via social media. Both user queries and social media posts express public +concerns regarding this advanced dialogue system. To mine public concerns about +ChatGPT, a novel Self-Supervised neural Topic Model (SSTM), which formalizes +topic modeling as a representation learning procedure, is proposed in this +paper. Extensive experiments have been conducted on Twitter posts about ChatGPT +and queries asked by ChatGPT users. And experimental results demonstrate that +the proposed approach could extract higher quality public concerns with +improved interpretability and diversity, surpassing the performance of +state-of-the-art approaches. + +
+
+
+
+
+ + ☆ LLM and Infrastructure as a Code use case + + +
+ Cloud computing and the evolution of management methodologies such as Lean +Management or Agile entail a profound transformation in both system +construction and maintenance approaches. These practices are encompassed within +the term "DevOps." This descriptive approach to an information system or +application, alongside the configuration of its constituent components, has +necessitated the development of descriptive languages paired with specialized +engines for automating systems administration tasks. Among these, the tandem of +Ansible (engine) and YAML (descriptive language) stands out as the two most +prevalent tools in the market, facing notable competition mainly from +Terraform. The current document presents an inquiry into a solution for +generating and managing Ansible YAML roles and playbooks, utilizing Generative +LLMs (Language Models) to translate human descriptions into code. Our efforts +are focused on identifying plausible directions and outlining the potential +industrial applications. + Note: For the purpose of this experiment, we have opted against the use of +Ansible Lightspeed. This is due to its reliance on an IBM Watson model, for +which we have not found any publicly available references. Comprehensive +information regarding this remarkable technology can be found directly on our +partner RedHat's website, +https://www.redhat.com/en/about/press-releases/red-hat-introduces-ansible-lightspeed-ai-driven-it-automation + +
+
+ comment: in French language +
+
+
+
+
+ + ☆ NumHG: A Dataset for Number-Focused Headline Generation SemEval-2024 + + +
+ Headline generation, a key task in abstractive summarization, strives to +condense a full-length article into a succinct, single line of text. Notably, +while contemporary encoder-decoder models excel based on the ROUGE metric, they +often falter when it comes to the precise generation of numerals in headlines. +We identify the lack of datasets providing fine-grained annotations for +accurate numeral generation as a major roadblock. To address this, we introduce +a new dataset, the NumHG, and provide over 27,000 annotated numeral-rich news +articles for detailed investigation. Further, we evaluate five well-performing +models from previous headline generation tasks using human evaluation in terms +of numerical accuracy, reasonableness, and readability. Our study reveals a +need for improvement in numerical accuracy, demonstrating the potential of the +NumHG dataset to drive progress in number-focused headline generation and +stimulate further discussions in numeral-focused text generation. + +
+
+ comment: NumEval@SemEval-2024 Dataset +
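One simple way to score the numeral fidelity highlighted above is sketched below; this is an illustrative metric, not necessarily the exact definition used in the NumHG evaluation.

    import re

    NUM = re.compile(r"\d+(?:\.\d+)?")

    def numeral_accuracy(generated, reference):
        """Fraction of numerals in the reference headline that also appear,
        verbatim, in the generated headline."""
        gold = NUM.findall(reference)
        if not gold:
            return 1.0
        produced = NUM.findall(generated)
        return sum(g in produced for g in gold) / len(gold)

    print(numeral_accuracy("Profits rise 12% to $3.4B",
                           "Profits up 12% to $3.4 billion"))   # 1.0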
+
+
+
+
+ + ☆ Open Sesame! Universal Black Box Jailbreaking of Large Language Models + + +
+ Large language models (LLMs), designed to provide helpful and safe responses, +often rely on alignment techniques to align with user intent and social +guidelines. Unfortunately, this alignment can be exploited by malicious actors +seeking to manipulate an LLM's outputs for unintended purposes. In this paper +we introduce a novel approach that employs a genetic algorithm (GA) to +manipulate LLMs when model architecture and parameters are inaccessible. The GA +attack works by optimizing a universal adversarial prompt that -- when combined +with a user's query -- disrupts the attacked model's alignment, resulting in +unintended and potentially harmful outputs. Our novel approach systematically +reveals a model's limitations and vulnerabilities by uncovering instances where +its responses deviate from expected behavior. Through extensive experiments we +demonstrate the efficacy of our technique, thus contributing to the ongoing +discussion on responsible AI development by providing a diagnostic tool for +evaluating and enhancing alignment of LLMs with human intent. To our knowledge +this is the first automated universal black box jailbreak attack. + +
+
+
+
+
+ + ☆ SememeASR: Boosting Performance of End-to-End Speech Recognition against + Domain and Long-Tailed Data Shift with Sememe Semantic Knowledge INTERSPEECH 2023 + + +
+ Recently, excellent progress has been made in speech recognition. However, +pure data-driven approaches have struggled with domain mismatch and long-tailed +data. Considering that knowledge-driven approaches can help data-driven +approaches alleviate their flaws, we introduce sememe-based semantic knowledge +into speech recognition (SememeASR). A sememe, according to the linguistic +definition, is the minimum semantic unit in a language and is able to represent +the implicit semantic information behind each word very well. Our experiments +show that the introduction of sememe information can improve the effectiveness +of speech recognition. In addition, our further experiments show that sememe +knowledge can improve the model's recognition of long-tailed data and enhance +the model's domain generalization ability. + 
+
+ comment: Accepted by INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Benchmarking Large Language Models in Retrieval-Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) is a promising approach for mitigating +the hallucination of large language models (LLMs). However, existing research +lacks rigorous evaluation of the impact of retrieval-augmented generation on +different large language models, which makes it challenging to identify the +potential bottlenecks in the capabilities of RAG for different LLMs. In this +paper, we systematically investigate the impact of Retrieval-Augmented +Generation on large language models. We analyze the performance of different +large language models in 4 fundamental abilities required for RAG, including +noise robustness, negative rejection, information integration, and +counterfactual robustness. To this end, we establish the Retrieval-Augmented +Generation Benchmark (RGB), a new corpus for RAG evaluation in both English and +Chinese. RGB divides the instances within the benchmark into 4 separate +testbeds based on the aforementioned fundamental abilities required to resolve +the case. Then we evaluate 6 representative LLMs on RGB to diagnose the +challenges of current LLMs when applying RAG. Evaluation reveals that while +LLMs exhibit a certain degree of noise robustness, they still struggle +significantly in terms of negative rejection, information integration, and +dealing with false information. The aforementioned assessment outcomes indicate +that there is still a considerable journey ahead to effectively apply RAG to +LLMs. + 
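A sketch of how a single noise-robustness test case might be assembled; query_llm is a hypothetical stand-in for whatever model is under evaluation, and the documents, noise ratio, and substring check are illustrative rather than the benchmark's actual protocol.

    import random

    def query_llm(prompt):
        """Hypothetical stand-in for the LLM under evaluation."""
        return "The 2022 FIFA World Cup was won by Argentina."

    def noise_robustness_case(question, answer, relevant_docs, noise_docs, noise_ratio=0.5):
        """Build a retrieval-augmented prompt with a controlled share of noisy
        documents and check whether the gold answer still comes through."""
        k = len(relevant_docs) + len(noise_docs)
        n_noise = min(int(round(noise_ratio * k)), len(noise_docs))
        docs = relevant_docs[: k - n_noise] + random.sample(noise_docs, n_noise)
        random.shuffle(docs)
        context = "\n".join(f"[{i + 1}] {d}" for i, d in enumerate(docs))
        prompt = f"Answer using only the documents below.\n{context}\n\nQuestion: {question}"
        return answer.lower() in query_llm(prompt).lower()

    print(noise_robustness_case(
        "Who won the 2022 FIFA World Cup?", "Argentina",
        ["Argentina beat France on penalties in the 2022 FIFA World Cup final."],
        ["France won the 2018 FIFA World Cup.",
         "The 2022 Winter Olympics were held in Beijing."]))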
+
+
+
+
+ + ☆ Hateful Messages: A Conversational Data Set of Hate Speech produced by + Adolescents on Discord + + +
+ With the rise of social media, a rise in hateful content can be observed. +Even though the understanding and definitions of hate speech vary, platforms, +communities, and legislators all acknowledge the problem. Meanwhile, +adolescents are a new and active group of social media users. The majority of +adolescents experience or witness online hate speech. Research in the field of +automated hate speech classification has been on the rise and focuses on +aspects such as bias, generalizability, and performance. To increase +generalizability and performance, it is important to understand biases within +the data. This research addresses the bias of youth language within hate speech +classification and contributes by providing a modern and anonymized hate speech +youth language data set consisting of 88,395 annotated chat messages. The data +set consists of publicly available online messages from the chat platform +Discord. ~6.42% of the messages were classified by a self-developed annotation +schema as hate speech. For 35,553 messages, the user profiles provided age +annotations setting the average author age to under 20 years old. + 
+
+
+
+
+ + ☆ Zero-shot information extraction from radiological reports using ChatGPT + + +
+ Electronic health records contain an enormous amount of valuable information, +but many are recorded in free text. Information extraction is the strategy to +transform the sequence of characters into structured data, which can be +employed for secondary analysis. However, the traditional information +extraction components, such as named entity recognition and relation +extraction, require annotated data to optimize the model parameters, which has +become one of the major bottlenecks in building information extraction systems. +With large language models achieving good performance on various downstream NLP +tasks without parameter tuning, it becomes possible to use large language +models for zero-shot information extraction. In this study, we aim to explore +whether the most popular large language model, ChatGPT, can extract useful +information from radiological reports. We first design prompt templates for the +information of interest in the CT reports. Then, we generate the prompts by +combining the prompt templates with the CT reports as the inputs of ChatGPT to +obtain the responses. A post-processing module is developed to transform the +responses into structured extraction results. We conducted the experiments with +847 CT reports collected from Peking University Cancer Hospital. The +experimental results indicate that ChatGPT can achieve competitive performances +for some extraction tasks compared with the baseline information extraction +system, although some limitations remain to be addressed. + 
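A minimal sketch of the prompt-template-plus-post-processing pipeline described above; chat is a hypothetical stand-in for the ChatGPT API call, and the extracted fields are illustrative rather than the study's actual schema.

    import json

    PROMPT_TEMPLATE = """Extract the following fields from the CT report and reply
    with a JSON object only: tumor_location, tumor_size_mm, lymph_node_metastasis (yes/no).

    Report:
    {report}
    """

    def chat(prompt):
        """Hypothetical stand-in for a ChatGPT API call."""
        return ('{"tumor_location": "left upper lobe", "tumor_size_mm": 23, '
                '"lymph_node_metastasis": "no"}')

    def extract(report):
        """Zero-shot extraction: fill the template, query the model, then
        post-process the free-text response into structured fields."""
        response = chat(PROMPT_TEMPLATE.format(report=report))
        try:
            start, end = response.index("{"), response.rindex("}") + 1
            return json.loads(response[start:end])
        except (ValueError, json.JSONDecodeError):
            return {}   # fall back to an empty record when parsing fails

    report = ("A 23 mm nodule is seen in the left upper lobe. "
              "No enlarged mediastinal lymph nodes.")
    print(extract(report))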
+
+
+
+
+ + ☆ ReOnto: A Neuro-Symbolic Approach for Biomedical Relation Extraction ECML 2023 + + +
+ Relation Extraction (RE) is the task of extracting semantic relationships +between entities in a sentence and aligning them to relations defined in a +vocabulary, which is generally in the form of a Knowledge Graph (KG) or an +ontology. Various approaches have been proposed so far to address this task. +However, applying these techniques to biomedical text often yields +unsatisfactory results because it is hard to infer relations directly from +sentences due to the nature of the biomedical relations. To address these +issues, we present a novel technique called ReOnto that makes use of +neuro-symbolic knowledge for the RE task. ReOnto employs a graph neural network +to acquire the sentence representation and leverages publicly accessible +ontologies as prior knowledge to identify the sentential relation between two +entities. The approach involves extracting the relation path between the two +entities from the ontology. We evaluate the effect of using symbolic knowledge +from ontologies with graph neural networks. Experimental results on two public +biomedical datasets, BioRel and ADE, show that our method outperforms all the +baselines (by approximately 3%). + 
+
+ comment: Accepted in ECML 2023 +
+
+
+
+
+ + ☆ Self-driven Grounding: Large Language Model Agents with Automatical + Language-aligned Skill Learning + + +
+ Large language models (LLMs) show their powerful automatic reasoning and +planning capability with a wealth of semantic knowledge about the human world. +However, the grounding problem still hinders the applications of LLMs in the +real-world environment. Existing studies try to fine-tune the LLM or utilize +pre-defined behavior APIs to bridge the LLMs and the environment, which not +only costs huge human efforts to customize for every single task but also +weakens the generality strengths of LLMs. To autonomously ground the LLM onto +the environment, we proposed the Self-Driven Grounding (SDG) framework to +automatically and progressively ground the LLM with self-driven skill learning. +SDG first employs the LLM to propose the hypothesis of sub-goals to achieve +tasks and then verify the feasibility of the hypothesis via interacting with +the underlying environment. Once verified, SDG can then learn generalized +skills with the guidance of these successfully grounded subgoals. These skills +can be further utilized to accomplish more complex tasks which fail to pass the +verification phase. Verified in the famous instruction following task +set-BabyAI, SDG achieves comparable performance in the most challenging tasks +compared with imitation learning methods that cost millions of demonstrations, +proving the effectiveness of learned skills and showing the feasibility and +efficiency of our framework. + +
+
+
+
+
+ + ☆ UniSA: Unified Generative Framework for Sentiment Analysis ACM MM 2023 + + +
+ Sentiment analysis is a crucial task that aims to understand people's +emotional states and predict emotional categories based on multimodal +information. It consists of several subtasks, such as emotion recognition in +conversation (ERC), aspect-based sentiment analysis (ABSA), and multimodal +sentiment analysis (MSA). However, unifying all subtasks in sentiment analysis +presents numerous challenges, including modality alignment, unified +input/output forms, and dataset bias. To address these challenges, we propose a +Task-Specific Prompt method to jointly model subtasks and introduce a +multimodal generative framework called UniSA. Additionally, we organize the +benchmark datasets of main subtasks into a new Sentiment Analysis Evaluation +benchmark, SAEval. We design novel pre-training tasks and training methods to +enable the model to learn generic sentiment knowledge among subtasks to improve +the model's multimodal sentiment perception ability. Our experimental results +show that UniSA performs comparably to the state-of-the-art on all subtasks and +generalizes well to various subtasks in sentiment analysis. + +
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ☆ Minimal Effective Theory for Phonotactic Memory: Capturing Local + Correlations due to Errors in Speech + + +
+ Spoken language evolves constrained by the economy of speech, which depends +on factors such as the structure of the human mouth. This gives rise to local +phonetic correlations in spoken words. Here we demonstrate that these local +correlations facilitate the learning of spoken words by reducing their +information content. We do this by constructing a locally-connected +tensor-network model, inspired by similar variational models used for many-body +physics, which exploits these local phonetic correlations to facilitate the +learning of spoken words. The model is therefore a minimal model of phonetic +memory, where "learning to pronounce" and "learning a word" are one and the +same. One consequence is the learned ability to produce new words that are +phonetically reasonable for the target language, as well as a hierarchy of the +most likely errors that could be produced during the action of speech. We test +our model against Latin and Turkish words. (The code is available on GitHub.) + 
+
+ comment: 16 pages; 7 figs +
+
+
+
+
+ + ☆ Towards Foundational AI Models for Additive Manufacturing: Language + Models for G-Code Debugging, Manipulation, and Comprehension + + +
+ 3D printing or additive manufacturing is a revolutionary technology that +enables the creation of physical objects from digital models. However, the +quality and accuracy of 3D printing depend on the correctness and efficiency of +the G-code, a low-level numerical control programming language that instructs +3D printers how to move and extrude material. Debugging G-code is a challenging +task that requires a syntactic and semantic understanding of the G-code format +and the geometry of the part to be printed. In this paper, we present the first +extensive evaluation of six state-of-the-art foundational large language models +(LLMs) for comprehending and debugging G-code files for 3D printing. We design +effective prompts to enable pre-trained LLMs to understand and manipulate +G-code and test their performance on various aspects of G-code debugging and +manipulation, including detection and correction of common errors and the +ability to perform geometric transformations. We analyze their strengths and +weaknesses for understanding complete G-code files. We also discuss the +implications and limitations of using LLMs for G-code comprehension. + +
+
+
+
+
+ + ☆ Text-Only Domain Adaptation for End-to-End Speech Recognition through + Down-Sampling Acoustic Representation INTERSPEECH 2023 + + +
+ Mapping two modalities, speech and text, into a shared representation space +is a research direction that uses text-only data to improve end-to-end +automatic speech recognition (ASR) performance in new domains. However, the +lengths of the speech and text representations are inconsistent. Although the +previous method up-samples the text representation to align with the acoustic +modality, it may not match the expected actual duration. In this paper, we +propose a novel representation-matching strategy that down-samples the acoustic +representation to align with the text modality. By introducing a continuous +integrate-and-fire (CIF) module that generates acoustic representations +consistent with the token length, our ASR model can better learn unified +representations from both modalities, allowing for domain adaptation using +text-only data of the target domain. Experimental results on new-domain data +demonstrate the effectiveness of the proposed method. + 
+
+ comment: Accepted by INTERSPEECH 2023. arXiv admin note: text overlap with + arXiv:2309.01437 +
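A minimal sketch of the continuous integrate-and-fire (CIF) down-sampling idea the abstract builds on (not the paper's implementation): per-frame weights are accumulated and a token-level vector is emitted each time the accumulator crosses a threshold, so the acoustic sequence shrinks to roughly token length.

    import torch

    def cif_downsample(frames, weights, threshold=1.0):
        """Continuous integrate-and-fire down-sampling.

        frames:  (T, D) acoustic representations
        weights: (T,)   non-negative per-frame weights (predicted by the model
                        in a real system; fixed here for illustration)
        """
        tokens, acc_weight, acc_state = [], 0.0, torch.zeros(frames.size(1))
        for h, a in zip(frames, weights.tolist()):
            if acc_weight + a < threshold:          # keep integrating
                acc_weight += a
                acc_state = acc_state + a * h
            else:                                   # fire: split this frame's weight
                spill = acc_weight + a - threshold
                tokens.append(acc_state + (a - spill) * h)
                acc_weight, acc_state = spill, spill * h
        return torch.stack(tokens) if tokens else frames.new_zeros(0, frames.size(1))

    frames = torch.randn(50, 256)                   # 50 acoustic frames
    weights = torch.full((50,), 0.25)               # fires roughly every 4 frames
    print(cif_downsample(frames, weights).shape)    # torch.Size([12, 256])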
+
+
+
+
+ + ♻ ☆ Construction Grammar and Language Models + + +
+ Recent progress in deep learning and natural language processing has given +rise to powerful models that are primarily trained on a cloze-like task and +show some evidence of having access to substantial linguistic information, +including some constructional knowledge. This groundbreaking discovery presents +an exciting opportunity for a synergistic relationship between computational +methods and Construction Grammar research. In this chapter, we explore three +distinct approaches to the interplay between computational methods and +Construction Grammar: (i) computational methods for text analysis, (ii) +computational Construction Grammar, and (iii) deep learning models, with a +particular focus on language models. We touch upon the first two approaches as +a contextual foundation for the use of computational methods before providing +an accessible, yet comprehensive overview of deep learning models, which also +addresses reservations construction grammarians may have. Additionally, we +delve into experiments that explore the emergence of constructionally relevant +information within these models while also examining the aspects of +Construction Grammar that may pose challenges for these models. This chapter +aims to foster collaboration between researchers in the fields of natural +language processing and Construction Grammar. By doing so, we hope to pave the +way for new insights and advancements in both these fields. + +
+
+ comment: Accepted for publication in The Cambridge Handbook of Construction + Grammar, edited by Mirjam Fried and Kiki Nikiforidou. To appear in 2024 +
+
+
+
+
+ + ♻ ☆ ParaGuide: Guided Diffusion Paraphrasers for Plug-and-Play Textual Style + Transfer + + +
+ Textual style transfer is the task of transforming stylistic properties of +text while preserving meaning. Target "styles" can be defined in numerous ways, +ranging from single attributes (e.g., formality) to authorship (e.g., +Shakespeare). Previous unsupervised style-transfer approaches generally rely on +significant amounts of labeled data for only a fixed set of styles or require +large language models. In contrast, we introduce a novel diffusion-based +framework for general-purpose style transfer that can be flexibly adapted to +arbitrary target styles at inference time. Our parameter-efficient approach, +ParaGuide, leverages paraphrase-conditioned diffusion models alongside +gradient-based guidance from both off-the-shelf classifiers and strong existing +style embedders to transform the style of text while preserving semantic +information. We validate the method on the Enron Email Corpus, with both human +and automatic evaluations, and find that it outperforms strong baselines on +formality, sentiment, and even authorship style transfer. + 
+
+
+
+
+ + ♻ ☆ Baseline Defenses for Adversarial Attacks Against Aligned Language + Models + + +
+ As Large Language Models quickly become ubiquitous, it becomes critical to +understand their security vulnerabilities. Recent work shows that text +optimizers can produce jailbreaking prompts that bypass moderation and +alignment. Drawing from the rich body of work on adversarial machine learning, +we approach these attacks with three questions: What threat models are +practically useful in this domain? How do baseline defense techniques perform +in this new domain? How does LLM security differ from computer vision? + We evaluate several baseline defense strategies against leading adversarial +attacks on LLMs, discussing the various settings in which each is feasible and +effective. Particularly, we look at three types of defenses: detection +(perplexity based), input preprocessing (paraphrase and retokenization), and +adversarial training. We discuss white-box and gray-box settings and discuss +the robustness-performance trade-off for each of the defenses considered. We +find that the weakness of existing discrete optimizers for text, combined with +the relatively high costs of optimization, makes standard adaptive attacks more +challenging for LLMs. Future research will be needed to uncover whether more +powerful optimizers can be developed, or whether the strength of filtering and +preprocessing defenses is greater in the LLMs domain than it has been in +computer vision. + +
+
+ comment: 12 pages +
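A minimal sketch of the perplexity-based detection baseline mentioned above, using GPT-2 from Hugging Face transformers as an arbitrary choice of scoring model; the threshold is illustrative and would be calibrated on benign prompts in practice.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

    @torch.no_grad()
    def perplexity(text):
        """Perplexity of the text under a small reference LM; optimized
        adversarial suffixes tend to look like gibberish and score far higher
        than ordinary prompts."""
        ids = tokenizer(text, return_tensors="pt").input_ids
        loss = model(ids, labels=ids).loss          # mean token-level cross-entropy
        return torch.exp(loss).item()

    def perplexity_filter(prompt, threshold=1000.0):
        """Return True when the prompt should be flagged for review."""
        return perplexity(prompt) > threshold

    print(perplexity_filter("Please summarize the attached report in two sentences."))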
+
+
+
+
+ + ♻ ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid +advancements, exhibiting a remarkable capacity for perceiving, understanding, +and processing visual information by connecting visual receptor with large +language models (LLMs). However, current assessments mainly focus on +recognizing and reasoning abilities, lacking direct evaluation of +conversational skills and neglecting visual storytelling abilities. In this +paper, we propose an evaluation method that uses strong LLMs as judges to +comprehensively evaluate the various abilities of LVLMs. Firstly, we construct +a comprehensive visual dialogue dataset TouchStone, consisting of open-world +images and questions, covering five major categories of abilities and 27 +subtasks. This dataset not only covers fundamental recognition and +comprehension but also extends to literary creation. Secondly, by integrating +detailed image annotations we effectively transform the multimodal input +content into a form understandable by LLMs. This enables us to employ advanced +LLMs for directly evaluating the quality of the multimodal dialogue without +requiring human intervention. Through validation, we demonstrate that powerful +LVLMs, such as GPT-4, can effectively score dialogue quality by leveraging +their textual capabilities alone, aligning with human preferences. We hope our +work can serve as a touchstone for LVLMs' evaluation and pave the way for +building stronger LVLMs. The evaluation code is available at +https://github.com/OFA-Sys/TouchStone. + +
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ♻ ☆ How trial-to-trial learning shapes mappings in the mental lexicon: + Modelling Lexical Decision with Linear Discriminative Learning + + +
+ Trial-to-trial effects have been found in a number of studies, indicating +that processing a stimulus influences responses in subsequent trials. A special +case are priming effects which have been modelled successfully with +error-driven learning (Marsolek, 2008), implying that participants are +continuously learning during experiments. This study investigates whether +trial-to-trial learning can be detected in an unprimed lexical decision +experiment. We used the Discriminative Lexicon Model (DLM; Baayen et al., +2019), a model of the mental lexicon with meaning representations from +distributional semantics, which models error-driven incremental learning with +the Widrow-Hoff rule. We used data from the British Lexicon Project (BLP; +Keuleers et al., 2012) and simulated the lexical decision experiment with the +DLM on a trial-by-trial basis for each subject individually. Then, reaction +times were predicted with Generalised Additive Models (GAMs), using measures +derived from the DLM simulations as predictors. We extracted measures from two +simulations per subject (one with learning updates between trials and one +without), and used them as input to two GAMs. Learning-based models showed +better model fit than the non-learning ones for the majority of subjects. Our +measures also provide insights into lexical processing and individual +differences. This demonstrates the potential of the DLM to model behavioural +data and leads to the conclusion that trial-to-trial learning can indeed be +detected in unprimed lexical decision. Our results support the possibility that +our lexical knowledge is subject to continuous changes. + +
+
+ comment: 48 pages, 13 figures; revised version +
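The incremental error-driven learning at the core of these simulations is the Widrow-Hoff (delta) rule; the toy sketch below, with made-up dimensions, shows the general shape of such a cue-to-meaning mapping rather than the DLM's actual implementation.

    import numpy as np

    def widrow_hoff_update(W, cue, target, lr=0.01):
        """One incremental delta-rule step: nudge the cue-to-outcome mapping W
        so that cue @ W moves toward the target (semantic) vector."""
        prediction = cue @ W
        return W + lr * np.outer(cue, target - prediction)

    rng = np.random.default_rng(0)
    n_cues, n_dims = 100, 20                          # e.g. form cues -> semantic dimensions
    W = np.zeros((n_cues, n_dims))
    cue = rng.integers(0, 2, n_cues).astype(float)    # binary form cues of one word
    target = rng.normal(size=n_dims)                  # its meaning vector

    # Repeated exposure to the same trial drives the mapping toward the target.
    for _ in range(500):
        W = widrow_hoff_update(W, cue, target, lr=0.01)
    print(np.allclose(cue @ W, target, atol=1e-2))    # True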
+
+
+
+
+ + ♻ ☆ OUTFOX: LLM-generated Essay Detection through In-context Learning with + Adversarially Generated Examples + + +
+ Large Language Models (LLMs) have achieved human-level fluency in text +generation, making it difficult to distinguish between human-written and +LLM-generated texts. This poses a growing risk of misuse of LLMs and demands +the development of detectors to identify LLM-generated texts. However, existing +detectors lack robustness against attacks: they degrade detection accuracy by +simply paraphrasing LLM-generated texts. Furthermore, a malicious user might +attempt to deliberately evade the detectors based on detection results, but +this has not been assumed in previous studies. In this paper, we propose +OUTFOX, a framework that improves the robustness of LLM-generated-text +detectors by allowing both the detector and the attacker to consider each +other's output. In this framework, the attacker uses the detector's prediction +labels as examples for in-context learning and adversarially generates essays +that are harder to detect, while the detector uses the adversarially generated +essays as examples for in-context learning to learn to detect essays from a +strong attacker. Experiments in the domain of student essays show that the +proposed detector improves the detection performance on the attacker-generated +texts by up to +41.3 points in F1-score. Furthermore, the proposed detector +shows a state-of-the-art detection performance: up to 96.9 points in F1-score, +beating existing detectors on non-attacked texts. Finally, the proposed +attacker drastically degrades the performance of detectors by up to -57.0 +points F1-score, massively outperforming the baseline paraphrasing method for +evading detection. + +
+
+
+
+
+ + ♻ ☆ Evidence of Human-Like Visual-Linguistic Integration in Multimodal Large + Language Models During Predictive Language Processing + + +
+ The advanced language processing abilities of large language models (LLMs)
+have stimulated debate over their capacity to replicate human-like cognitive
+processes. One differentiating factor between language processing in LLMs and
+humans is that language input is often grounded in several perceptual
+modalities, whereas most LLMs process solely text-based information. Multimodal
+grounding allows humans to integrate, for example, visual context with
+linguistic information and thereby place constraints on the space of upcoming
+words, reducing cognitive load and improving comprehension. Recent multimodal
+LLMs (mLLMs) combine a visual-linguistic embedding space with a transformer-type
+attention mechanism for next-word prediction. Here we ask whether predictive
+language processing based on multimodal input in mLLMs aligns with humans.
+Two hundred participants watched short audio-visual clips and estimated the
+predictability of an upcoming verb or noun. The same clips were processed by
+the mLLM CLIP, with predictability scores based on comparing image and text
+feature vectors. Eye-tracking was used to estimate which visual features
+participants attended to, and CLIP's visual attention weights were recorded. We
+find that alignment of predictability scores was driven by the multimodality of
+CLIP (no alignment for a unimodal state-of-the-art LLM) and by the attention
+mechanism (no alignment when attention weights were perturbed or when the
+same input was fed to a multimodal model without attention). We further find a
+significant spatial overlap between CLIP's visual attention weights and human
+eye-tracking data. The results suggest that comparable processes of integrating
+multimodal information, guided by attention to relevant visual features,
+support predictive language processing in mLLMs and humans.
+
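+ As a rough illustration of how image-text similarity can yield a predictability
+score for candidate upcoming words, the sketch below uses the open-source CLIP
+package. The candidate phrasing and the softmax over similarities are assumptions
+for illustration; they are not the study's exact scoring procedure.
+
+    import torch
+    import clip
+    from PIL import Image
+
+    model, preprocess = clip.load("ViT-B/32", device="cpu")
+    image = preprocess(Image.open("clip_frame.png")).unsqueeze(0)   # frame from a video clip
+    candidates = ["eat", "read", "throw", "open"]                   # candidate upcoming verbs
+    text = clip.tokenize([f"a person about to {w} something" for w in candidates])
+
+    with torch.no_grad():
+        img_feat = model.encode_image(image)
+        txt_feat = model.encode_text(text)
+        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+        txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+        sims = (img_feat @ txt_feat.T).squeeze(0)      # cosine similarity per candidate
+        predictability = sims.softmax(dim=-1)          # higher = more expected given the scene
+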
+
+ comment: 13 pages, 4 figures, submitted to journal +
+
+
+
+
+ + ♻ ☆ BiasTestGPT: Using ChatGPT for Social Bias Testing of Language Models + + +
+ Pretrained Language Models (PLMs) harbor inherent social biases that can +result in harmful real-world implications. Such social biases are measured +through the probability values that PLMs output for different social groups and +attributes appearing in a set of test sentences. However, bias testing is +currently cumbersome since the test sentences are generated either from a +limited set of manual templates or need expensive crowd-sourcing. We instead +propose using ChatGPT for controllable generation of test sentences, given any +arbitrary user-specified combination of social groups and attributes appearing +in the test sentences. When compared to template-based methods, our approach +using ChatGPT for test sentence generation is superior in detecting social +bias, especially in challenging settings such as intersectional biases. We +present an open-source comprehensive bias testing framework (BiasTestGPT), +hosted on HuggingFace, that can be plugged into any open-source PLM for bias +testing. We provide a large diverse dataset of test sentences generated by +ChatGPT that satisfies the specified social group and attribute requirements +and matches the quality of human-generated sentences. We thus enable seamless +open-ended social bias testing of PLMs through an automatic large-scale +generation of diverse test sentences for any combination of social categories +and attributes. + +
+
+
+
+
+ + ♻ ☆ From Instructions to Intrinsic Human Values -- A Survey of Alignment + Goals for Big Models + + +
+ Big models, exemplified by Large Language Models (LLMs), are models typically +pre-trained on massive data and comprised of enormous parameters, which not +only obtain significantly improved performance across diverse tasks but also +present emergent capabilities absent in smaller models. However, the growing +intertwining of big models with everyday human lives poses potential risks and +might cause serious social harm. Therefore, many efforts have been made to +align LLMs with humans to make them better follow user instructions and satisfy +human preferences. Nevertheless, `what to align with' has not been fully +discussed, and inappropriate alignment goals might even backfire. In this +paper, we conduct a comprehensive survey of different alignment goals in +existing work and trace their evolution paths to help identify the most +essential goal. Particularly, we investigate related works from two +perspectives: the definition of alignment goals and alignment evaluation. Our +analysis encompasses three distinct levels of alignment goals and reveals a +goal transformation from fundamental abilities to value orientation, indicating +the potential of intrinsic human values as the alignment goal for enhanced +LLMs. Based on such results, we further discuss the challenges of achieving +such intrinsic value alignment and provide a collection of available resources +for future research on the alignment of big models. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Do-Not-Answer: A Dataset for Evaluating Safeguards in LLMs + + +
+ With the rapid evolution of large language models (LLMs), new and +hard-to-predict harmful capabilities are emerging. This requires developers to +be able to identify risks through the evaluation of "dangerous capabilities" in +order to responsibly deploy LLMs. In this work, we collect the first +open-source dataset to evaluate safeguards in LLMs, and deploy safer +open-source LLMs at a low cost. Our dataset is curated and filtered to consist +only of instructions that responsible language models should not follow. We +annotate and assess the responses of six popular LLMs to these instructions. +Based on our annotation, we proceed to train several BERT-like classifiers, and +find that these small classifiers can achieve results that are comparable with +GPT-4 on automatic safety evaluation. Warning: this paper contains example data +that may be offensive, harmful, or biased. + +
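+ As a sketch of the "small BERT-like classifiers" step, the following uses the
+Hugging Face transformers Trainer. The CSV file name, column names, number of
+labels, and hyperparameters are placeholders, not the authors' settings.
+
+    from datasets import load_dataset
+    from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
+                              Trainer, TrainingArguments)
+
+    # assumed columns: "response" (model output text) and "label" (annotated category)
+    dataset = load_dataset("csv", data_files={"train": "annotated_responses.csv"})
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+
+    def tokenize(batch):
+        return tokenizer(batch["response"], truncation=True, padding="max_length")
+
+    dataset = dataset.map(tokenize, batched=True)
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "bert-base-uncased", num_labels=6)   # number of response categories is assumed
+
+    trainer = Trainer(
+        model=model,
+        args=TrainingArguments(output_dir="safety-clf", num_train_epochs=3),
+        train_dataset=dataset["train"],
+    )
+    trainer.train()
+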
+
+ comment: 18 pages, 9 figures, 11 tables +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 45 + +
+
+
+ + ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for the addition of multi-modal +information with an existing pipeline for continuous sign language recognition +and translation. In our procedure, we have incorporated optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we have used is very lightweight and doesn't +need to include a separate feature extractor for the new modality in an +end-to-end manner. We have applied the changes in both sign language +recognition and translation, improving the result in each case. We have +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. + +
+
+
+
+
+ + ☆ NLLB-CLIP -- train performant multilingual image retrieval model on a + budget + + +
+ Today, the exponential rise of large models developed by academic and
+industrial institutions with the help of massive computing resources raises the
+question of whether someone without access to such resources can make a
+valuable scientific contribution. To explore this, we tried to solve the
+challenging task of multilingual image retrieval with a limited budget of
+$1,000. As a result, we present NLLB-CLIP, a CLIP model with a text encoder from
+the NLLB model. To train the model, we used an automatically created dataset of
+106,246 good-quality images with captions in 201 languages derived from the
+LAION COCO dataset. We trained multiple models using image and text encoders of
+various sizes and kept different parts of the model frozen during the training.
+We thoroughly analyzed the trained models using existing evaluation datasets
+and newly created XTD200 and Flickr30k-200 datasets. We show that NLLB-CLIP is
+comparable in quality to state-of-the-art models and significantly outperforms
+them on low-resource languages.
+
+
+
+
+
+ + ☆ Towards Universal Image Embeddings: A Large-Scale Dataset and Challenge + for Generic Image Representations ICCV 2023 + + +
+ Fine-grained and instance-level recognition methods are commonly trained and +evaluated on specific domains, in a model per domain scenario. Such an +approach, however, is impractical in real large-scale applications. In this +work, we address the problem of universal image embedding, where a single +universal model is trained and used in multiple domains. First, we leverage +existing domain-specific datasets to carefully construct a new large-scale +public benchmark for the evaluation of universal image embeddings, with 241k +query images, 1.4M index images and 2.8M training images across 8 different +domains and 349k classes. We define suitable metrics, training and evaluation +protocols to foster future research in this area. Second, we provide a +comprehensive experimental evaluation on the new dataset, demonstrating that +existing approaches and simplistic extensions lead to worse performance than an +assembly of models trained for each domain separately. Finally, we conducted a +public research competition on this topic, leveraging industrial datasets, +which attracted the participation of more than 1k teams worldwide. This +exercise generated many interesting research ideas and findings which we +present in detail. Project webpage: https://cmp.felk.cvut.cz/univ_emb/ + +
+
+ comment: ICCV 2023 Accepted +
+
+
+
+
+ + ☆ SMPLitex: A Generative Model and Dataset for 3D Human Texture Estimation + from Single Image BMVC 2023 + + +
+ We propose SMPLitex, a method for estimating and manipulating the complete 3D
+appearance of humans captured from a single image. SMPLitex builds upon
+recently proposed generative models for 2D images, and extends their use to the
+3D domain through pixel-to-surface correspondences computed on the input image.
+To this end, we first train a generative model for complete 3D human
+appearance, and then fit it to the input image by conditioning the generative
+model on the visible parts of the subject. Furthermore, we propose a new
+dataset of high-quality human textures built by sampling SMPLitex conditioned
+on subject descriptions and images. We quantitatively and qualitatively
+evaluate our method on 3 publicly available datasets, demonstrating that
+SMPLitex significantly outperforms existing methods for human texture
+estimation while allowing for a wider variety of tasks such as editing,
+synthesis, and manipulation.
+
+
+ comment: Accepted at BMVC 2023. Project website: + https://dancasas.github.io/projects/SMPLitex +
+
+
+
+
+ + ☆ Uncertainty in AI: Evaluating Deep Neural Networks on + Out-of-Distribution Images + + +
+ As AI models are increasingly deployed in critical applications, ensuring
+consistent model performance when exposed to unusual situations, such as
+out-of-distribution (OOD) or perturbed data, is important. Therefore, this
+paper investigates the uncertainty of various deep neural networks, including
+ResNet-50, VGG16, DenseNet121, AlexNet, and GoogleNet, when dealing with such
+data. Our approach includes three experiments. First, we used the pretrained
+models to classify OOD images generated via DALL-E to assess their performance.
+Second, we built an ensemble from the models' predictions using probabilistic
+averaging for consensus due to its advantages over plurality or majority
+voting. The ensemble's uncertainty was quantified using average probabilities,
+variance, and entropy metrics. Our results showed that while ResNet-50 was the
+most accurate single model for OOD images, the ensemble performed even better,
+correctly classifying all images. Third, we tested model robustness by adding
+perturbations (filters, rotations, etc.) to new epistemic images from DALL-E or
+real-world captures. ResNet-50 was chosen for this experiment as it was the
+best-performing model. While it classified 4 out of 5 unperturbed images
+correctly, it misclassified all of them post-perturbation, indicating a
+significant vulnerability. These misclassifications, which are clear to human
+observers, highlight AI models' limitations. Using saliency maps, we identified
+regions of the images that the model considered important for its decisions.
+
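+ The ensemble and uncertainty metrics described above reduce to a few lines of
+array arithmetic. This is an illustrative sketch with assumed shapes, not the
+authors' code.
+
+    import numpy as np
+
+    def ensemble_uncertainty(prob_stack):
+        """prob_stack: (n_models, n_classes) softmax outputs for one image."""
+        mean_probs = prob_stack.mean(axis=0)           # probabilistic averaging for consensus
+        prediction = int(mean_probs.argmax())          # ensemble decision
+        variance = prob_stack.var(axis=0).mean()       # disagreement across models
+        entropy = -np.sum(mean_probs * np.log(mean_probs + 1e-12))   # predictive entropy
+        return prediction, mean_probs, variance, entropy
+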
+
+
+
+
+ + ☆ StereoFlowGAN: Co-training for Stereo and Flow with Unsupervised Domain + Adaptation BMVC 2023 + + +
+ We introduce a novel training strategy for stereo matching and optical flow +estimation that utilizes image-to-image translation between synthetic and real +image domains. Our approach enables the training of models that excel in real +image scenarios while relying solely on ground-truth information from synthetic +images. To facilitate task-agnostic domain adaptation and the training of +task-specific components, we introduce a bidirectional feature warping module +that handles both left-right and forward-backward directions. Experimental +results show competitive performance over previous domain translation-based +methods, which substantiate the efficacy of our proposed framework, effectively +leveraging the benefits of unsupervised domain adaptation, stereo matching, and +optical flow estimation. + +
+
+ comment: Accepted by BMVC 2023 +
+
+
+
+
+ + ☆ On the fly Deep Neural Network Optimization Control for Low-Power + Computer Vision + + +
+ Processing visual data on mobile devices has many applications, e.g.,
+emergency response and tracking. State-of-the-art computer vision techniques
+rely on large Deep Neural Networks (DNNs) that are usually too power-hungry to
+be deployed on resource-constrained edge devices. Many techniques improve the
+efficiency of DNNs by using sparsity or quantization. However, the accuracy and
+efficiency of these techniques cannot be adapted for diverse edge applications
+with different hardware constraints and accuracy requirements. This paper
+presents a novel technique to allow DNNs to adapt their accuracy and energy
+consumption during run-time, without the need for any re-training. Our
+technique, called AdaptiveActivation, introduces a hyper-parameter that controls
+the output range of the DNNs' activation function to dynamically adjust the
+sparsity and precision in the DNN. AdaptiveActivation can be applied to any
+existing pre-trained DNN to improve its deployability in diverse edge
+environments. We conduct experiments on popular edge devices and show that the
+accuracy is within 1.5% of the baseline. We also show that our approach
+requires 10%--38% less memory than the baseline techniques, leading to more
+accuracy-efficiency tradeoff options.
+
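+ One plausible way to realise such a run-time-adjustable activation (an
+assumption for illustration, not the paper's exact definition) is a ReLU whose
+lower threshold and upper clip are plain attributes that can be changed after
+deployment: raising the lower threshold zeroes more activations (more sparsity),
+while lowering the upper clip shrinks the range a fixed-bit quantizer must cover.
+
+    import torch
+    import torch.nn as nn
+
+    class RangeControlledReLU(nn.Module):
+        def __init__(self, lower=0.0, upper=6.0):
+            super().__init__()
+            self.lower = lower   # activations below this are zeroed (sparsity knob)
+            self.upper = upper   # activations above this saturate (range/precision knob)
+
+        def forward(self, x):
+            x = torch.clamp(x, max=self.upper)
+            return torch.where(x > self.lower, x, torch.zeros_like(x))
+
+ Because the two knobs only change the forward pass, they can be retuned per
+device without re-training the network's weights.
+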
+
+
+
+
+ + ☆ Multi-dimension unified Swin Transformer for 3D Lesion Segmentation in + Multiple Anatomical Locations + + +
+ In oncology research, accurate 3D segmentation of lesions from CT scans is
+essential for the modeling of lesion growth kinetics. However, following the
+RECIST criteria, radiologists routinely only delineate each lesion on the axial
+slice showing the largest transverse area, and delineate a small number of
+lesions in 3D for research purposes. As a result, we have plenty of unlabeled
+3D volumes and labeled 2D images, and scarce labeled 3D volumes, which makes
+training a deep-learning 3D segmentation model a challenging task. In this
+work, we propose a novel model, denoted as the multi-dimension unified Swin
+transformer (MDU-ST), for 3D lesion segmentation. The MDU-ST consists of a
+Shifted-window transformer (Swin-transformer) encoder and a convolutional
+neural network (CNN) decoder, allowing it to adapt to 2D and 3D inputs and
+learn the corresponding semantic information in the same encoder. Based on this
+model, we introduce a three-stage framework: 1) leveraging a large amount of
+unlabeled 3D lesion volumes through self-supervised pretext tasks to learn the
+underlying pattern of lesion anatomy in the Swin-transformer encoder; 2)
+fine-tuning the Swin-transformer encoder to perform 2D lesion segmentation with
+2D RECIST slices to learn slice-level segmentation information; 3) further
+fine-tuning the Swin-transformer encoder to perform 3D lesion segmentation with
+labeled 3D volumes. The network's performance is evaluated by the Dice
+similarity coefficient (DSC) and Hausdorff distance (HD) using an internal 3D
+lesion dataset with 593 lesions extracted from multiple anatomical locations.
+The proposed MDU-ST demonstrates significant improvement over the competing
+models. The proposed method can be used to conduct automated 3D lesion
+segmentation to assist radiomics and tumor growth modeling studies. This paper
+has been accepted by the IEEE International Symposium on Biomedical Imaging
+(ISBI) 2023.
+
+
+
+
+
+ + ☆ Instant Continual Learning of Neural Radiance Fields + + +
+ Neural radiance fields (NeRFs) have emerged as an effective method for +novel-view synthesis and 3D scene reconstruction. However, conventional +training methods require access to all training views during scene +optimization. This assumption may be prohibitive in continual learning +scenarios, where new data is acquired in a sequential manner and a continuous +update of the NeRF is desired, as in automotive or remote sensing applications. +When naively trained in such a continual setting, traditional scene +representation frameworks suffer from catastrophic forgetting, where previously +learned knowledge is corrupted after training on new data. Prior works in +alleviating forgetting with NeRFs suffer from low reconstruction quality and +high latency, making them impractical for real-world application. We propose a +continual learning framework for training NeRFs that leverages replay-based +methods combined with a hybrid explicit--implicit scene representation. Our +method outperforms previous methods in reconstruction quality when trained in a +continual setting, while having the additional benefit of being an order of +magnitude faster. + +
+
+ comment: For project page +
+
+
+
+
+ + ☆ Accuracy and Consistency of Space-based Vegetation Height Maps for + Forest Dynamics in Alpine Terrain + + +
+ Monitoring and understanding forest dynamics is essential for environmental
+conservation and management. This is why the Swiss National Forest Inventory
+(NFI) provides countrywide vegetation height maps at a spatial resolution of
+0.5 m. Its long update time of 6 years, however, limits the temporal analysis
+of forest dynamics. This can be improved by using spaceborne remote sensing and
+deep learning to generate large-scale vegetation height maps in a
+cost-effective way. In this paper, we present an in-depth analysis of these
+methods for operational application in Switzerland. We generate annual,
+countrywide vegetation height maps at a 10-meter ground sampling distance for
+the years 2017 to 2020 based on Sentinel-2 satellite imagery. In comparison to
+previous works, we conduct a large-scale and detailed stratified analysis
+against a precise Airborne Laser Scanning reference dataset. This stratified
+analysis reveals a close relationship between the model accuracy and the
+topography, especially slope and aspect. We assess the potential of deep
+learning-derived height maps for change detection and find that these maps can
+indicate changes as small as 250 $m^2$. Larger-scale changes caused by a winter
+storm are detected with an F1-score of 0.77. Our results demonstrate that
+vegetation height maps computed from satellite imagery with deep learning are a
+valuable, complementary, cost-effective source of evidence to increase the
+temporal resolution for national forest assessments.
+
+
+
+
+
+ + ☆ Neural-Singular-Hessian: Implicit Neural Representation of Unoriented + Point Clouds by Enforcing Singular Hessian + + +
+ Neural implicit representation is a promising approach for reconstructing +surfaces from point clouds. Existing methods combine various regularization +terms, such as the Eikonal and Laplacian energy terms, to enforce the learned +neural function to possess the properties of a Signed Distance Function (SDF). +However, inferring the actual topology and geometry of the underlying surface +from poor-quality unoriented point clouds remains challenging. In accordance +with Differential Geometry, the Hessian of the SDF is singular for points +within the differential thin-shell space surrounding the surface. Our approach +enforces the Hessian of the neural implicit function to have a zero determinant +for points near the surface. This technique aligns the gradients for a +near-surface point and its on-surface projection point, producing a rough but +faithful shape within just a few iterations. By annealing the weight of the +singular-Hessian term, our approach ultimately produces a high-fidelity +reconstruction result. Extensive experimental results demonstrate that our +approach effectively suppresses ghost geometry and recovers details from +unoriented point clouds with better expressiveness than existing fitting-based +methods. + +
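+ A minimal sketch of the singular-Hessian term described above (assumed shapes
+and a per-point loop for clarity; this is not the released implementation):
+
+    import torch
+
+    def singular_hessian_loss(sdf, shell_points):
+        """sdf: callable mapping a (3,) tensor to a scalar SDF value.
+        shell_points: (N, 3) samples inside the thin shell around the surface."""
+        penalties = []
+        for p in shell_points:
+            # (3, 3) Hessian of the SDF at p; create_graph keeps it differentiable
+            H = torch.autograd.functional.hessian(sdf, p, create_graph=True)
+            penalties.append(torch.det(H) ** 2)   # push the determinant towards zero
+        return torch.stack(penalties).mean()
+
+    # total loss (schematically): eikonal_term + w_singular * singular_hessian_loss(...),
+    # with w_singular annealed over training, as described above.
+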
+
+
+
+
+ + ☆ Safe and Robust Watermark Injection with a Single OoD Image + + +
+ Training a high-performance deep neural network requires large amounts of +data and computational resources. Protecting the intellectual property (IP) and +commercial ownership of a deep model is challenging yet increasingly crucial. A +major stream of watermarking strategies implants verifiable backdoor triggers +by poisoning training samples, but these are often unrealistic due to data +privacy and safety concerns and are vulnerable to minor model changes such as +fine-tuning. To overcome these challenges, we propose a safe and robust +backdoor-based watermark injection technique that leverages the diverse +knowledge from a single out-of-distribution (OoD) image, which serves as a +secret key for IP verification. The independence of training data makes it +agnostic to third-party promises of IP security. We induce robustness via +random perturbation of model parameters during watermark injection to defend +against common watermark removal attacks, including fine-tuning, pruning, and +model extraction. Our experimental results demonstrate that the proposed +watermarking approach is not only time- and sample-efficient without training +data, but also robust against the watermark removal attacks above. + +
+
+
+
+
+ + ☆ 3D View Prediction Models of the Dorsal Visual Stream + + +
+ Deep neural network representations align well with brain activity in the +ventral visual stream. However, the primate visual system has a distinct dorsal +processing stream with different functional properties. To test if a model +trained to perceive 3D scene geometry aligns better with neural responses in +dorsal visual areas, we trained a self-supervised geometry-aware recurrent +neural network (GRNN) to predict novel camera views using a 3D feature memory. +We compared GRNN to self-supervised baseline models that have been shown to +align well with ventral regions using the large-scale fMRI Natural Scenes +Dataset (NSD). We found that while the baseline models accounted better for +ventral brain regions, GRNN accounted for a greater proportion of variance in +dorsal brain regions. Our findings demonstrate the potential for using +task-relevant models to probe representational differences across visual +streams. + +
+
+ comment: 2023 Conference on Cognitive Computational Neuroscience +
+
+
+
+
+ + ☆ StyleAdapter: A Single-Pass LoRA-Free Model for Stylized Image + Generation + + +
+ This paper presents a LoRA-free method for stylized image generation that +takes a text prompt and style reference images as inputs and produces an output +image in a single pass. Unlike existing methods that rely on training a +separate LoRA for each style, our method can adapt to various styles with a +unified model. However, this poses two challenges: 1) the prompt loses +controllability over the generated content, and 2) the output image inherits +both the semantic and style features of the style reference image, compromising +its content fidelity. To address these challenges, we introduce StyleAdapter, a +model that comprises two components: a two-path cross-attention module (TPCA) +and three decoupling strategies. These components enable our model to process +the prompt and style reference features separately and reduce the strong +coupling between the semantic and style information in the style references. +StyleAdapter can generate high-quality images that match the content of the +prompts and adopt the style of the references (even for unseen styles) in a +single pass, which is more flexible and efficient than previous methods. +Experiments have been conducted to demonstrate the superiority of our method +over previous works. + +
+
+ comment: AIGC +
+
+
+
+
+ + ☆ BLiSS: Bootstrapped Linear Shape Space + + +
+ Morphable models are fundamental to numerous human-centered processes as they +offer a simple yet expressive shape space. Creating such morphable models, +however, is both tedious and expensive. The main challenge is establishing +dense correspondences across raw scans that capture sufficient shape variation. +This is often addressed using a mix of significant manual intervention and +non-rigid registration. We observe that creating a shape space and solving for +dense correspondence are tightly coupled -- while dense correspondence is +needed to build shape spaces, an expressive shape space provides a reduced +dimensional space to regularize the search. We introduce BLiSS, a method to +solve both progressively. Starting from a small set of manually registered +scans to bootstrap the process, we enrich the shape space and then use that to +get new unregistered scans into correspondence automatically. The critical +component of BLiSS is a non-linear deformation model that captures details +missed by the low-dimensional shape space, thus allowing progressive enrichment +of the space. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ Multispectral Indices for Wildfire Management + + +
+ This paper highlights and summarizes the most important multispectral indices +and associated methodologies for fire management. Various fields of study are +examined where multispectral indices align with wildfire prevention and +management, including vegetation and soil attribute extraction, water feature +mapping, artificial structure identification, and post-fire burnt area +estimation. The versatility and effectiveness of multispectral indices in +addressing specific issues in wildfire management are emphasized. Fundamental +insights for optimizing data extraction are presented. Concrete indices for +each task, including the NDVI and the NDWI, are suggested. Moreover, to enhance +accuracy and address inherent limitations of individual index applications, the +integration of complementary processing solutions and additional data sources +like high-resolution imagery and ground-based measurements is recommended. This +paper aims to be an immediate and comprehensive reference for researchers and +stakeholders working on multispectral indices related to the prevention and +management of fires. + +
+
+
+
+
+ + ☆ An Empirical Analysis for Zero-Shot Multi-Label Classification on + COVID-19 CT Scans and Uncurated Reports ICCV + + +
+ The pandemic resulted in vast repositories of unstructured data, including +radiology reports, due to increased medical examinations. Previous research on +automated diagnosis of COVID-19 primarily focuses on X-ray images, despite +their lower precision compared to computed tomography (CT) scans. In this work, +we leverage unstructured data from a hospital and harness the fine-grained +details offered by CT scans to perform zero-shot multi-label classification +based on contrastive visual language learning. In collaboration with human +experts, we investigate the effectiveness of multiple zero-shot models that aid +radiologists in detecting pulmonary embolisms and identifying intricate lung +details like ground glass opacities and consolidations. Our empirical analysis +provides an overview of the possible solutions to target such fine-grained +tasks, so far overlooked in the medical multimodal pretraining literature. Our +investigation promises future advancements in the medical image analysis +community by addressing some challenges associated with unstructured data and +fine-grained multi-label classification. + +
+
+ comment: 10 pages, 3 figures, Proceedings of the IEEE/CVF International + Conference on Computer Vision (ICCV) Workshops 2023 +
+
+
+
+
+ + ☆ Softmax Bias Correction for Quantized Generative Models + + +
+ Post-training quantization (PTQ) is the go-to compression technique for large
+generative models, such as stable diffusion or large language models. PTQ
+methods commonly keep the softmax activation in higher precision as it has been
+shown to be very sensitive to quantization noise. However, this can lead to a
+significant runtime and power overhead during inference on resource-constrained
+edge devices. In this work, we investigate the source of the softmax
+sensitivity to quantization and show that the quantization operation leads to a
+large bias in the softmax output, causing accuracy degradation. To overcome
+this issue, we propose an offline bias correction technique that improves the
+quantizability of softmax without additional compute during deployment, as it
+can be readily absorbed into the quantization parameters. We demonstrate the
+effectiveness of our method on stable diffusion v1.5 and the 125M-parameter OPT
+language model, achieving significant accuracy improvement for 8-bit quantized
+softmax.
+
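+ To illustrate the idea of an offline softmax bias correction, here is a rough
+sketch: the average gap between the full-precision softmax and its quantized
+version is estimated on calibration data and later subtracted (or folded into
+the quantization offset). The quantizer and shapes are simplifying assumptions,
+not the paper's exact procedure.
+
+    import numpy as np
+
+    def softmax(x, axis=-1):
+        e = np.exp(x - x.max(axis=axis, keepdims=True))
+        return e / e.sum(axis=axis, keepdims=True)
+
+    def fake_quant(x, n_bits=8):
+        # simple uniform affine quantizer over the observed range
+        lo, hi = x.min(), x.max()
+        scale = (hi - lo) / (2 ** n_bits - 1)
+        return np.round((x - lo) / scale) * scale + lo
+
+    def estimate_softmax_bias(calib_logits, n_bits=8):
+        """calib_logits: (n_samples, n_positions) attention logits from calibration data."""
+        fp = softmax(calib_logits)
+        q = fake_quant(fp, n_bits)
+        return (q - fp).mean(axis=0)    # per-position bias, subtracted at deployment
+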
+
+
+
+
+ + ☆ Generative-based Fusion Mechanism for Multi-Modal Tracking + + +
+ Generative models (GMs) have received increasing research interest for their
+remarkable capacity to achieve comprehensive understanding. However, their
+potential application in the domain of multi-modal tracking has remained
+relatively unexplored. In this context, we seek to uncover the potential of
+harnessing generative techniques to address the critical challenge of
+information fusion in multi-modal tracking. In this paper, we delve into two
+prominent GM techniques, namely, Conditional Generative Adversarial Networks
+(CGANs) and Diffusion Models (DMs). Different from the standard fusion process,
+where the features from each modality are directly fed into the fusion block, we
+condition these multi-modal features with random noise in the GM framework,
+effectively transforming the original training samples into harder instances.
+This design excels at extracting discriminative clues from the features,
+enhancing the ultimate tracking performance. To quantitatively gauge the
+effectiveness of our approach, we conduct extensive experiments across two
+multi-modal tracking tasks, three baseline methods, and three challenging
+benchmarks. The experimental results demonstrate that the proposed
+generative-based fusion mechanism achieves state-of-the-art performance,
+setting new records on LasHeR and RGBD1K.
+
+
+ comment: 10 figures, 8 tables +
+
+
+
+
+ + ☆ SAF-IS: a Spatial Annotation Free Framework for Instance Segmentation of + Surgical Tools + + +
+ Instance segmentation of surgical instruments is a long-standing research +problem, crucial for the development of many applications for computer-assisted +surgery. This problem is commonly tackled via fully-supervised training of deep +learning models, requiring expensive pixel-level annotations to train. In this +work, we develop a framework for instance segmentation not relying on spatial +annotations for training. Instead, our solution only requires binary tool +masks, obtainable using recent unsupervised approaches, and binary tool +presence labels, freely obtainable in robot-assisted surgery. Based on the +binary mask information, our solution learns to extract individual tool +instances from single frames, and to encode each instance into a compact vector +representation, capturing its semantic features. Such representations guide the +automatic selection of a tiny number of instances (8 only in our experiments), +displayed to a human operator for tool-type labelling. The gathered information +is finally used to match each training instance with a binary tool presence +label, providing an effective supervision signal to train a tool instance +classifier. We validate our framework on the EndoVis 2017 and 2018 segmentation +datasets. We provide results using binary masks obtained either by manual +annotation or as predictions of an unsupervised binary segmentation model. The +latter solution yields an instance segmentation approach completely free from +spatial annotations, outperforming several state-of-the-art fully-supervised +segmentation approaches. + +
+
+
+
+
+ + ☆ ControlMat: A Controlled Generative Approach to Material Capture + + +
+ Material reconstruction from a photograph is a key component of 3D content +creation democratization. We propose to formulate this ill-posed problem as a +controlled synthesis one, leveraging the recent progress in generative deep +networks. We present ControlMat, a method which, given a single photograph with +uncontrolled illumination as input, conditions a diffusion model to generate +plausible, tileable, high-resolution physically-based digital materials. We +carefully analyze the behavior of diffusion models for multi-channel outputs, +adapt the sampling process to fuse multi-scale information and introduce rolled +diffusion to enable both tileability and patched diffusion for high-resolution +outputs. Our generative approach further permits exploration of a variety of +materials which could correspond to the input image, mitigating the unknown +lighting conditions. We show that our approach outperforms recent inference and +latent-space-optimization methods, and carefully validate our diffusion process +design choices. Supplemental materials and additional details are available at: +https://gvecchio.com/controlmat/. + +
+
+
+
+
+ + ☆ No Data Augmentation? Alternative Regularizations for Effective Training + on Small Datasets ICCV + + +
+ Solving image classification tasks given small training datasets remains an
+open challenge for modern computer vision. Aggressive data augmentation and
+generative models are among the most straightforward approaches to overcoming
+the lack of data. However, the former fails to be agnostic to varying image
+domains, while the latter requires additional compute and careful design. In
+this work, we study alternative regularization strategies to push the limits of
+supervised learning on small image classification datasets. In particular,
+along with scaling the model size and training schedule, we employ a heuristic
+to select (semi-)optimal learning rate and weight decay couples via the norm of
+model parameters. By training on only 1% of the original CIFAR-10 training set
+(i.e., 50 images per class) and testing on ciFAIR-10, a variant of the original
+CIFAR without duplicated images, we reach a test accuracy of 66.5%, on par with
+the best state-of-the-art methods.
+
+
+ comment: 4th Visual Inductive Priors for Data-Efficient Deep Learning + Workshop, ICCVW 2023 +
+
+
+
+
+ + ☆ Mask-Attention-Free Transformer for 3D Instance Segmentation ICCV 2023 + + +
+ Recently, transformer-based methods have dominated 3D instance segmentation, +where mask attention is commonly involved. Specifically, object queries are +guided by the initial instance masks in the first cross-attention, and then +iteratively refine themselves in a similar manner. However, we observe that the +mask-attention pipeline usually leads to slow convergence due to low-recall +initial instance masks. Therefore, we abandon the mask attention design and +resort to an auxiliary center regression task instead. Through center +regression, we effectively overcome the low-recall issue and perform +cross-attention by imposing positional prior. To reach this goal, we develop a +series of position-aware designs. First, we learn a spatial distribution of 3D +locations as the initial position queries. They spread over the 3D space +densely, and thus can easily capture the objects in a scene with a high recall. +Moreover, we present relative position encoding for the cross-attention and +iterative refinement for more accurate position queries. Experiments show that +our approach converges 4x faster than existing work, sets a new state of the +art on ScanNetv2 3D instance segmentation benchmark, and also demonstrates +superior performance across various datasets. Code and models are available at +https://github.com/dvlab-research/Mask-Attention-Free-Transformer. + +
+
+ comment: Accepted to ICCV 2023. Code and models are available at + https://github.com/dvlab-research/Mask-Attention-Free-Transformer +
+
+
+
+
+ + ☆ Prior Knowledge Guided Network for Video Anomaly Detection + + +
+ Video Anomaly Detection (VAD) involves detecting anomalous events in videos,
+presenting a significant and intricate task within intelligent video
+surveillance. Existing studies often concentrate solely on features acquired
+from limited normal data, disregarding the latent prior knowledge present in
+extensive natural image datasets. To address this constraint, we propose a
+Prior Knowledge Guided Network (PKG-Net) for the VAD task. First, an
+auto-encoder network is incorporated into a teacher-student architecture to
+learn two designated proxy tasks: future frame prediction and teacher network
+imitation, which can provide better generalization ability on unknown samples.
+Second, knowledge distillation on proper feature blocks is also proposed to
+increase the multi-scale detection ability of the model. In addition,
+prediction error and teacher-student feature inconsistency are combined to
+evaluate anomaly scores of inference samples more comprehensively. Experimental
+results on three public benchmarks validate the effectiveness and accuracy of
+our method, which surpasses recent state-of-the-art methods.
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Prompt me a Dataset: An investigation of text-image prompting for + historical image dataset creation using foundation models + + +
+ In this paper, we present a pipeline for image extraction from historical +documents using foundation models, and evaluate text-image prompts and their +effectiveness on humanities datasets of varying levels of complexity. The +motivation for this approach stems from the high interest of historians in +visual elements printed alongside historical texts on the one hand, and from +the relative lack of well-annotated datasets within the humanities when +compared to other domains. We propose a sequential approach that relies on +GroundDINO and Meta's Segment-Anything-Model (SAM) to retrieve a significant +portion of visual data from historical documents that can then be used for +downstream development tasks and dataset creation, as well as evaluate the +effect of different linguistic prompts on the resulting detections. + +
+
+ comment: 12 pages, 3 figures, Accepted in ICIAP2023, AI4DH workshop +
+
+
+
+
+ + ☆ Building Footprint Extraction in Dense Areas using Super Resolution and + Frame Field Learning + + +
+ Despite notable results on standard aerial datasets, current
+state-of-the-art methods fail to produce accurate building footprints in dense
+areas due to challenging properties posed by these areas and limited data
+availability. In this paper, we propose a framework to address such issues in
+polygonal building extraction. First, super resolution is employed to enhance
+the spatial resolution of the aerial image, allowing for finer details to be
+captured. This enhanced imagery serves as input to a multitask learning module,
+which consists of a segmentation head and a frame field learning head to
+effectively handle the irregular building structures. Our model is supervised
+by adaptive loss weighting, enabling extraction of sharp edges and fine-grained
+polygons, which is difficult due to overlapping buildings and low data quality.
+Extensive experiments on a slum area in India that mimics a dense area
+demonstrate that our proposed approach significantly outperforms the current
+state-of-the-art methods.
+
+
+ comment: Accepted at The 12th International Conference on Awareness Science + and Technology +
+
+
+
+
+ + ☆ ReLoc-PDR: Visual Relocalization Enhanced Pedestrian Dead Reckoning via + Graph Optimization + + +
+ Accurately and reliably positioning pedestrians in satellite-denied
+conditions remains a significant challenge. Pedestrian dead reckoning (PDR) is
+commonly employed to estimate pedestrian location using a low-cost inertial
+sensor. However, PDR is susceptible to drift due to sensor noise, incorrect
+step detection, and inaccurate stride length estimation. This work proposes
+ReLoc-PDR, a fusion framework combining PDR and visual relocalization using
+graph optimization. ReLoc-PDR leverages time-correlated visual observations and
+learned descriptors to achieve robust positioning in visually-degraded
+environments. A graph optimization-based fusion mechanism with the Tukey kernel
+effectively corrects cumulative errors and mitigates the impact of abnormal
+visual observations. Real-world experiments demonstrate that our ReLoc-PDR
+surpasses representative methods in accuracy and robustness, achieving accurate
+and robust pedestrian positioning results using only a smartphone in
+challenging environments such as less-textured corridors and dark nighttime
+scenarios.
+
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+ + ☆ Cross-Consistent Deep Unfolding Network for Adaptive All-In-One Video + Restoration + + +
+ Existing Video Restoration (VR) methods always necessitate deploying an
+individual model for each type of adverse weather to remove the corresponding
+degradations, lacking the capability for adaptive processing of degradations.
+Such a limitation amplifies the complexity and deployment costs in practical
+applications. To overcome this deficiency, in this paper, we propose a
+Cross-consistent Deep Unfolding Network (CDUN) for All-In-One VR, which enables
+the employment of a single model to remove diverse degradations for the first
+time. Specifically, the proposed CDUN establishes a novel iterative
+optimization framework, capable of restoring frames corrupted by corresponding
+degradations according to the degradation features given in advance. To empower
+the framework for eliminating diverse degradations, we devise a Sequence-wise
+Adaptive Degradation Estimator (SADE) to estimate degradation features for the
+input corrupted video. By orchestrating these two cascading procedures, CDUN
+achieves adaptive processing for diverse degradations. In addition, we introduce
+a window-based inter-frame fusion strategy to utilize information from more
+adjacent frames. This strategy involves the progressive stacking of temporal
+windows in multiple iterations, effectively enlarging the temporal receptive
+field and enabling each frame's restoration to leverage information from
+distant frames. Extensive experiments demonstrate that the proposed method
+achieves state-of-the-art performance in All-In-One VR.
+
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ☆ AGG-Net: Attention Guided Gated-convolutional Network for Depth Image + Completion ICCV2023 + + +
+ Recently, stereo vision based on lightweight RGBD cameras has been widely +used in various fields. However, limited by the imaging principles, the +commonly used RGB-D cameras based on TOF, structured light, or binocular vision +acquire some invalid data inevitably, such as weak reflection, boundary +shadows, and artifacts, which may bring adverse impacts to the follow-up work. +In this paper, we propose a new model for depth image completion based on the +Attention Guided Gated-convolutional Network (AGG-Net), through which more +accurate and reliable depth images can be obtained from the raw depth maps and +the corresponding RGB images. Our model employs a UNet-like architecture which +consists of two parallel branches of depth and color features. In the encoding +stage, an Attention Guided Gated-Convolution (AG-GConv) module is proposed to +realize the fusion of depth and color features at different scales, which can +effectively reduce the negative impacts of invalid depth data on the +reconstruction. In the decoding stage, an Attention Guided Skip Connection +(AG-SC) module is presented to avoid introducing too many depth-irrelevant +features to the reconstruction. The experimental results demonstrate that our +method outperforms the state-of-the-art methods on the popular benchmarks +NYU-Depth V2, DIML, and SUN RGB-D. + +
+
+ comment: 9 pages, 7 figures, ICCV2023 +
+
+
+
+
+ + ♻ ☆ Ray Conditioning: Trading Photo-consistency for Photo-realism in + Multi-view Image Generation ICCV 2023 + + +
+ Multi-view image generation attracts particular attention these days due to +its promising 3D-related applications, e.g., image viewpoint editing. Most +existing methods follow a paradigm where a 3D representation is first +synthesized, and then rendered into 2D images to ensure photo-consistency +across viewpoints. However, such explicit bias for photo-consistency sacrifices +photo-realism, causing geometry artifacts and loss of fine-scale details when +these methods are applied to edit real images. To address this issue, we +propose ray conditioning, a geometry-free alternative that relaxes the +photo-consistency constraint. Our method generates multi-view images by +conditioning a 2D GAN on a light field prior. With explicit viewpoint control, +state-of-the-art photo-realism and identity consistency, our method is +particularly suited for the viewpoint editing task. + +
+
+ comment: ICCV 2023 paper. Project page at https://ray-cond.github.io/ +
+
+
+
+
+ + ♻ ☆ GLFF: Global and Local Feature Fusion for AI-synthesized Image Detection + + +
+ With the rapid development of deep generative models (such as Generative
+Adversarial Networks and Diffusion models), AI-synthesized images are now of
+such high quality that humans can hardly distinguish them from pristine ones.
+Although existing detection methods have shown high performance in specific
+evaluation settings, e.g., on images from seen models or on images without
+real-world post-processing, they tend to suffer serious performance degradation
+in real-world scenarios where testing images can be generated by more powerful
+generation models or combined with various post-processing operations. To
+address this issue, we propose a Global and Local Feature Fusion (GLFF)
+framework to learn rich and discriminative representations by combining
+multi-scale global features from the whole image with refined local features
+from informative patches for AI synthesized image detection. GLFF fuses
+information from two branches: the global branch to extract multi-scale
+semantic features and the local branch to select informative patches for
+detailed local artifacts extraction. Due to the lack of a synthesized image
+dataset simulating real-world applications for evaluation, we further create a
+challenging fake image dataset, named DeepFakeFaceForensics (DF^3), which
+contains 6 state-of-the-art generation models and a variety of post-processing
+techniques to approach the real-world scenarios. Experimental results
+demonstrate the superiority of our method to the state-of-the-art methods on
+the proposed DF^3 dataset and three other open-source datasets.
+
+
+ comment: 13 pages, 6 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Flow-based Spatio-Temporal Structured Prediction of Motion Dynamics + + +
+ Conditional Normalizing Flows (CNFs) are flexible generative models capable
+of representing complicated distributions with high dimensionality and large
+interdimensional correlations, making them appealing for structured output
+learning. Their effectiveness in modelling multivariate spatio-temporal
+structured data has yet to be completely investigated. We propose MotionFlow as
+a novel normalizing flows approach that autoregressively conditions the output
+distributions on the spatio-temporal input features. It combines deterministic
+and stochastic representations with CNFs to create a probabilistic neural
+generative approach that can model the variability seen in high-dimensional
+structured spatio-temporal data. We specifically propose to use conditional
+priors to factorize the latent space for the time-dependent modeling. We also
+exploit the use of masked convolutions as autoregressive conditionals in CNFs.
+As a result, our method is able to define arbitrarily expressive output
+probability distributions under temporal dynamics in multivariate prediction
+tasks. We apply our method to different tasks, including trajectory prediction,
+motion prediction, time series forecasting, and binary segmentation, and
+demonstrate that our model is able to leverage normalizing flows to learn
+complicated time-dependent conditional distributions.
+
+
+ comment: 13 pages, LaTeX; typos corrected, updated, in IEEE Transactions on + Pattern Analysis and Machine Intelligence +
+
+
+
+
+ + ♻ ☆ LatentSwap3D: Semantic Edits on 3D Image GANs ICCV'23 + + +
+ 3D GANs have the ability to generate latent codes for entire 3D volumes +rather than only 2D images. These models offer desirable features like +high-quality geometry and multi-view consistency, but, unlike their 2D +counterparts, complex semantic image editing tasks for 3D GANs have only been +partially explored. To address this problem, we propose LatentSwap3D, a +semantic edit approach based on latent space discovery that can be used with +any off-the-shelf 3D or 2D GAN model and on any dataset. LatentSwap3D relies on +identifying the latent code dimensions corresponding to specific attributes by +feature ranking using a random forest classifier. It then performs the edit by +swapping the selected dimensions of the image being edited with the ones from +an automatically selected reference image. Compared to other latent space +control-based edit methods, which were mainly designed for 2D GANs, our method +on 3D GANs provides remarkably consistent semantic edits in a disentangled +manner and outperforms others both qualitatively and quantitatively. We show +results on seven 3D GANs (pi-GAN, GIRAFFE, StyleSDF, MVCGAN, EG3D, StyleNeRF, +and VolumeGAN) and on five datasets (FFHQ, AFHQ, Cats, MetFaces, and CompCars). + +
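+ The two steps described above map naturally onto a few lines of scikit-learn
+and NumPy. This is an illustrative sketch with assumed shapes and an assumed
+binary attribute; it is not the authors' released code.
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestClassifier
+
+    def rank_latent_dims(latents, attribute_labels):
+        """latents: (N, d) latent codes; attribute_labels: (N,) binary attribute labels."""
+        forest = RandomForestClassifier(n_estimators=200).fit(latents, attribute_labels)
+        return np.argsort(forest.feature_importances_)[::-1]   # most attribute-relevant first
+
+    def latent_swap(edit_code, reference_code, ranked_dims, k=20):
+        """Swap the k most relevant dimensions with those of a reference latent code."""
+        edited = edit_code.copy()
+        edited[ranked_dims[:k]] = reference_code[ranked_dims[:k]]
+        return edited
+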
+
+ comment: The paper has been accepted by ICCV'23 AI3DCC +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Self-Supervised Pre-Training of Point Cloud + Segmentation Networks With Image Data + + +
+ Reducing the quantity of annotations required for supervised training is +vital when labels are scarce and costly. This reduction is particularly +important for semantic segmentation tasks involving 3D datasets, which are +often significantly smaller and more challenging to annotate than their +image-based counterparts. Self-supervised pre-training on unlabelled data is +one way to reduce the amount of manual annotations needed. Previous work has +focused on pre-training with point clouds exclusively. While useful, this +approach often requires two or more registered views. In the present work, we +combine image and point cloud modalities by first learning self-supervised +image features and then using these features to train a 3D model. By +incorporating image data, which is often included in many 3D datasets, our +pre-training method only requires a single scan of a scene and can be applied +to cases where localization information is unavailable. We demonstrate that our +pre-training approach, despite using single scans, achieves comparable +performance to other multi-scan, point cloud-only methods. + +
+
+ comment: In Proceedings of the Conference on Robots and Vision (CRV'23), + Montreal, Canada, Jun. 6-8, 2023. arXiv admin note: substantial text overlap + with arXiv:2211.11801 +
+
+
+
+
+ + ♻ ☆ Living in a Material World: Learning Material Properties from + Full-Waveform Flash Lidar Data for Semantic Segmentation + + +
+ Advances in lidar technology have made the collection of 3D point clouds fast +and easy. While most lidar sensors return per-point intensity (or reflectance) +values along with range measurements, flash lidar sensors are able to provide +information about the shape of the return pulse. The shape of the return +waveform is affected by many factors, including the distance that the light +pulse travels and the angle of incidence with a surface. Importantly, the shape +of the return waveform also depends on the material properties of the +reflecting surface. In this paper, we investigate whether the material type or +class can be determined from the full-waveform response. First, as a proof of +concept, we demonstrate that the extra information about material class, if +known accurately, can improve performance on scene understanding tasks such as +semantic segmentation. Next, we learn two different full-waveform material +classifiers: a random forest classifier and a temporal convolutional neural +network (TCN) classifier. We find that, in some cases, material types can be +distinguished, and that the TCN generally performs better across a wider range +of materials. However, factors such as angle of incidence, material colour, and +material similarity may hinder overall performance. + +
+
+ comment: In Proceedings of the Conference on Robots and Vision (CRV'23), + Montreal, Canada, Jun. 6-8, 2023 +
+
+
+
+
+ + ♻ ☆ aUToLights: A Robust Multi-Camera Traffic Light Detection and Tracking + System + + +
+ Following four successful years in the SAE AutoDrive Challenge Series I, the +University of Toronto is participating in the Series II competition to develop +a Level 4 autonomous passenger vehicle capable of handling various urban +driving scenarios by 2025. Accurate detection of traffic lights and correct +identification of their states is essential for safe autonomous operation in +cities. Herein, we describe our recently-redesigned traffic light perception +system for autonomous vehicles like the University of Toronto's self-driving +car, Artemis. Similar to most traffic light perception systems, we rely +primarily on camera-based object detectors. We deploy the YOLOv5 detector for +bounding box regression and traffic light classification across multiple +cameras and fuse the observations. To improve robustness, we incorporate priors +from high-definition semantic maps and perform state filtering using hidden +Markov models. We demonstrate a multi-camera, real time-capable traffic light +perception pipeline that handles complex situations including multiple visible +intersections, traffic light variations, temporary occlusion, and flashing +light states. To validate our system, we collected and annotated a varied +dataset incorporating flashing states and a range of occlusion types. Our +results show superior performance in challenging real-world scenarios compared +to single-frame, single-camera object detection. + +
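+ State filtering with a hidden Markov model, as mentioned above, can be sketched
+as a simple forward-filtering update over per-frame detector scores. The state
+set, transition matrix, and scores below are illustrative assumptions, not the
+team's calibrated values.
+
+    import numpy as np
+
+    STATES = ["red", "yellow", "green", "off"]    # an "off" state helps model flashing lights
+    transition = np.array([[0.90, 0.02, 0.03, 0.05],
+                           [0.05, 0.85, 0.05, 0.05],
+                           [0.03, 0.04, 0.88, 0.05],
+                           [0.30, 0.05, 0.30, 0.35]])   # rows: P(next state | current state)
+
+    def hmm_filter(belief, detector_probs):
+        """belief: (4,) previous posterior; detector_probs: (4,) per-frame classifier scores."""
+        predicted = transition.T @ belief          # propagate belief through the state model
+        posterior = predicted * detector_probs     # weight by the observation likelihood
+        return posterior / posterior.sum()
+
+    belief = np.full(len(STATES), 0.25)
+    for frame_scores in ([0.7, 0.1, 0.1, 0.1], [0.2, 0.1, 0.1, 0.6]):   # e.g. a flashing light
+        belief = hmm_filter(belief, np.array(frame_scores))
+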
+
+ comment: In Proceedings of the Conference on Robots and Vision (CRV'23), + Montreal, Canada, Jun. 6-8, 2023 +
+
+
+
+
+ + ♻ ☆ 360BEV: Panoramic Semantic Mapping for Indoor Bird's-Eye View WACV 2024 + + +
+ Seeing only a tiny part of the whole is not knowing the full circumstance.
+Bird's-eye-view (BEV) perception, a process of obtaining allocentric maps from
+egocentric views, is restricted when using a narrow Field of View (FoV) alone.
+In this work, mapping from 360° panoramas to BEV semantics, the 360BEV
+task, is established for the first time to achieve holistic representations of
+indoor scenes in a top-down view. Instead of relying on narrow-FoV image
+sequences, a panoramic image with depth information is sufficient to generate a
+holistic BEV semantic map. To benchmark 360BEV, we present two indoor datasets,
+360BEV-Matterport and 360BEV-Stanford, both of which include egocentric
+panoramic images and semantic segmentation labels, as well as allocentric
+semantic maps. Besides delving deep into different mapping paradigms, we
+propose a dedicated solution for panoramic semantic mapping, namely 360Mapper.
+Through extensive experiments, our methods achieve 44.32% and 45.78% in mIoU on
+both datasets respectively, surpassing previous counterparts with gains of
++7.60% and +9.70% in mIoU. Code and datasets are available at the project page:
+https://jamycheung.github.io/360BEV.html.
+
+
+ comment: Code and datasets are available at the project page: + https://jamycheung.github.io/360BEV.html. Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation + + +
+ Current 3D open-vocabulary scene understanding methods mostly utilize +well-aligned 2D images as the bridge to learn 3D features with language. +However, applying these approaches becomes challenging in scenarios where 2D +images are absent. In this work, we introduce a completely new pipeline, +namely, OpenIns3D, which requires no 2D image inputs, for 3D open-vocabulary +scene understanding at the instance level. The OpenIns3D framework employs a +"Mask-Snap-Lookup" scheme. The "Mask" module learns class-agnostic mask +proposals in 3D point clouds. The "Snap" module generates synthetic scene-level +images at multiple scales and leverages 2D vision language models to extract +interesting objects. The "Lookup" module searches through the outcomes of +"Snap" with the help of Mask2Pixel maps, which contain the precise +correspondence between 3D masks and synthetic images, to assign category names +to the proposed masks. This 2D input-free, easy-to-train, and flexible approach +achieved state-of-the-art results on a wide range of indoor and outdoor +datasets with a large margin. Furthermore, OpenIns3D allows for effortless +switching of 2D detectors without re-training. When integrated with +state-of-the-art 2D open-world models such as ODISE and GroundingDINO, superb +results are observed on open-vocabulary instance segmentation. When integrated +with LLM-powered 2D models like LISA, it demonstrates a remarkable capacity to +process highly complex text queries, including those that require intricate +reasoning and world knowledge. Project page: +https://zheninghuang.github.io/OpenIns3D/ + +
+
+ comment: 24 pages, 16 figures, 13 tables. Project page: + https://zheninghuang.github.io/OpenIns3D/ +
+
+
+
+
+ + ♻ ☆ Improving NeRF Quality by Progressive Camera Placement for Unrestricted + Navigation in Complex Environments + + +
+ Neural Radiance Fields, or NeRFs, have drastically improved novel view +synthesis and 3D reconstruction for rendering. NeRFs achieve impressive results +on object-centric reconstructions, but the quality of novel view synthesis with +free-viewpoint navigation in complex environments (rooms, houses, etc) is often +problematic. While algorithmic improvements play an important role in the +resulting quality of novel view synthesis, in this work, we show that because +optimizing a NeRF is inherently a data-driven process, good quality data play a +fundamental role in the final quality of the reconstruction. As a consequence, +it is critical to choose the data samples -- in this case the cameras -- in a +way that will eventually allow the optimization to converge to a solution that +allows free-viewpoint navigation with good quality. Our main contribution is an +algorithm that efficiently proposes new camera placements that improve visual +quality with minimal assumptions. Our solution can be used with any NeRF model +and outperforms baselines and similar work. + +
+
+
+
+
+ + ♻ ☆ Efficient HDR Reconstruction From Real-World Raw Images + + +
+ High dynamic range (HDR) imaging is a significant yet challenging problem due +to the limited dynamic range of generic image sensors. Most existing +learning-based HDR reconstruction methods take a set of bracketed exposure sRGB +images to extend the dynamic range. However, they overlook the computational +and memory inefficiencies of Image Signal Processors (ISPs) when processing a +set of sRGB images with different exposures. Furthermore, the absence of +large-scale raw-based HDR datasets limits the research on HDR imaging. In this +work, we take a new perspective and reconstruct HDR directly from raw images, +investigating novel neural network structures that benefit deployment on mobile +devices. Meanwhile, we construct a new HDR dataset containing raw images +together with the sRGB images processed from them, and design a new model that +reconstructs HDR by utilizing the unique characteristics of long- and +short-exposure images. Our key insights are threefold: (1) a new computational +raw LDR-HDR pair formation pipeline is designed to construct a real-world raw +HDR dataset called RealRaw-HDR; (2) a lightweight-efficient HDR model, RepUNet, +is developed using the structural reparameterization technique; (3) a +plug-and-play alignment-free and motion-aware short-exposure-first selection +loss and a colorfulness loss are proposed to mitigate ghost artifacts and color +cast. Extensive experimental results demonstrate that our approach achieves +state-of-the-art performance in both visual quality and quantitative metrics. + +
+
+
+
+
+ + ♻ ☆ Learnable Differencing Center for Nighttime Depth Perception + + +
+ Depth completion is the task of recovering dense depth maps from sparse ones, +usually with the help of color images. Existing image-guided methods perform +well on daytime depth perception self-driving benchmarks, but struggle in +nighttime scenarios with poor visibility and complex illumination. To address +these challenges, we propose a simple yet effective framework called LDCNet. +Our key idea is to use Recurrent Inter-Convolution Differencing (RICD) and +Illumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the +nighttime color images and reduce the negative effects of the varying +illumination, respectively. RICD explicitly estimates global illumination by +differencing two convolutions with different kernels, treating the +small-kernel-convolution feature as the center of the large-kernel-convolution +feature in a new perspective. IAICD softly alleviates local relative light +intensity by differencing a single convolution, where the center is dynamically +aggregated based on neighboring pixels and the estimated illumination map in +RICD. On both nighttime depth completion and depth estimation tasks, extensive +experiments demonstrate the effectiveness of our LDCNet, reaching the state of +the art. + +
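A minimal sketch of the convolution-differencing idea behind RICD in a PyTorch setting follows; the kernel sizes and layer shapes are illustrative, and the published RICD/IAICD modules are more involved (for instance, the dynamically aggregated center of IAICD is omitted here).

```python
import torch
import torch.nn as nn

class ConvDifferencing(nn.Module):
    """Sketch of convolution differencing: subtract a small-kernel response
    (treated as the local "center") from a large-kernel response to expose
    illumination-related context. Sizes are illustrative, not the published
    LDCNet configuration."""

    def __init__(self, channels, small_k=3, large_k=7):
        super().__init__()
        self.small = nn.Conv2d(channels, channels, small_k, padding=small_k // 2)
        self.large = nn.Conv2d(channels, channels, large_k, padding=large_k // 2)

    def forward(self, x):
        # The difference of the two receptive fields approximates the
        # surrounding context minus the local center.
        return self.large(x) - self.small(x)

feat = torch.randn(1, 16, 64, 64)
print(ConvDifferencing(16)(feat).shape)  # torch.Size([1, 16, 64, 64])
```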
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid +advancements, exhibiting a remarkable capacity for perceiving, understanding, +and processing visual information by connecting visual receptor with large +language models (LLMs). However, current assessments mainly focus on +recognizing and reasoning abilities, lacking direct evaluation of +conversational skills and neglecting visual storytelling abilities. In this +paper, we propose an evaluation method that uses strong LLMs as judges to +comprehensively evaluate the various abilities of LVLMs. Firstly, we construct +a comprehensive visual dialogue dataset TouchStone, consisting of open-world +images and questions, covering five major categories of abilities and 27 +subtasks. This dataset not only covers fundamental recognition and +comprehension but also extends to literary creation. Secondly, by integrating +detailed image annotations we effectively transform the multimodal input +content into a form understandable by LLMs. This enables us to employ advanced +LLMs for directly evaluating the quality of the multimodal dialogue without +requiring human intervention. Through validation, we demonstrate that powerful +LVLMs, such as GPT-4, can effectively score dialogue quality by leveraging +their textual capabilities alone, aligning with human preferences. We hope our +work can serve as a touchstone for LVLMs' evaluation and pave the way for +building stronger LVLMs. The evaluation code is available at +https://github.com/OFA-Sys/TouchStone. + +
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ♻ ☆ Focusing on what to decode and what to train: Efficient Training with + HOI Split Decoders and Specific Target Guided DeNoising + + +
+ Recent one-stage transformer-based methods achieve notable gains in the +Human-object Interaction Detection (HOI) task by leveraging the detection of +DETR. However, the current methods redirect the detection target of the object +decoder, and the box target is not explicitly separated from the query +embeddings, which leads to long and hard training. Furthermore, matching the +predicted HOI instances with the ground-truth is more challenging than in +object detection, so simply adapting training strategies from object detection +makes the training more difficult. To resolve the ambiguity between human and +object detection and share the prediction burden, we propose a novel one-stage +framework (SOV), which consists of a subject decoder, an object decoder, and a +verb decoder. Moreover, we propose a novel Specific Target Guided (STG) +DeNoising training strategy, which leverages learnable object and verb label +embeddings to guide the training and accelerate the training convergence. In +addition, for the inference part, the label-specific information is directly +fed into the decoders by initializing the query embeddings from the learnable +label embeddings. Without additional features or prior language knowledge, our +method (SOV-STG) achieves higher accuracy than the state-of-the-art method in +one-third of the training epochs. The code is available at +https://github.com/cjw2021/SOV-STG. + +
+
+
+
+
+ + ♻ ☆ Group-based Robustness: A General Framework for Customized Robustness in + the Real World + + +
+ Machine-learning models are known to be vulnerable to evasion attacks that +perturb model inputs to induce misclassifications. In this work, we identify +real-world scenarios where the true threat cannot be assessed accurately by +existing attacks. Specifically, we find that conventional metrics measuring +targeted and untargeted robustness do not appropriately reflect a model's +ability to withstand attacks from one set of source classes to another set of +target classes. To address the shortcomings of existing methods, we formally +define a new metric, termed group-based robustness, that complements existing +metrics and is better-suited for evaluating model performance in certain attack +scenarios. We show empirically that group-based robustness allows us to +distinguish between models' vulnerability against specific threat models in +situations where traditional robustness metrics do not apply. Moreover, to +measure group-based robustness efficiently and accurately, we 1) propose two +loss functions and 2) identify three new attack strategies. We show empirically +that with comparable success rates, finding evasive samples using our new loss +functions saves computation by a factor as large as the number of targeted +classes, and finding evasive samples using our new attack strategies saves time +by up to 99\% compared to brute-force search methods. Finally, we propose a +defense method that increases group-based robustness by up to 3.52$\times$. + +
+
+
+
+
+ + ♻ ☆ FOF: Learning Fourier Occupancy Field for Monocular Real-time Human + Reconstruction + + +
+ The advent of deep learning has led to significant progress in monocular +human reconstruction. However, existing representations, such as parametric +models, voxel grids, meshes and implicit neural representations, have +difficulties achieving high-quality results and real-time speed at the same +time. In this paper, we propose Fourier Occupancy Field (FOF), a novel +powerful, efficient and flexible 3D representation, for monocular real-time and +accurate human reconstruction. The FOF represents a 3D object with a 2D field +orthogonal to the view direction where at each 2D position the occupancy field +of the object along the view direction is compactly represented with the first +few terms of Fourier series, which retains the topology and neighborhood +relation in the 2D domain. A FOF can be stored as a multi-channel image, which +is compatible with 2D convolutional neural networks and can bridge the gap +between 3D geometries and 2D images. The FOF is very flexible and extensible, +e.g., parametric models can be easily integrated into a FOF as a prior to +generate more robust results. Based on FOF, we design the first 30+FPS +high-fidelity real-time monocular human reconstruction framework. We +demonstrate the potential of FOF on both public dataset and real captured data. +The code will be released for research purposes. + +
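To make the representation concrete, the sketch below compresses a single pixel's occupancy profile along the viewing ray into the first few Fourier-series coefficients and reconstructs it. The sampling scheme and number of terms are illustrative choices, not the paper's exact configuration.

```python
import numpy as np

def fof_coefficients(occupancy, n_terms=8):
    """Compress a per-pixel occupancy profile along the view direction into the
    first few Fourier-series coefficients (the FOF idea).

    occupancy: (D,) binary occupancy sampled along the viewing ray on [0, 1).
    Returns 2*n_terms + 1 coefficients (DC term plus cosine/sine pairs).
    """
    D = occupancy.shape[0]
    z = np.arange(D) / D
    coeffs = [occupancy.mean()]                       # a_0 / 2 (DC component)
    for k in range(1, n_terms + 1):
        coeffs.append(2 * np.mean(occupancy * np.cos(2 * np.pi * k * z)))  # a_k
        coeffs.append(2 * np.mean(occupancy * np.sin(2 * np.pi * k * z)))  # b_k
    return np.array(coeffs)

def reconstruct(coeffs, D=128):
    """Invert the truncated series back to an approximate occupancy profile."""
    z = np.arange(D) / D
    out = np.full(D, coeffs[0])
    for k in range(1, (len(coeffs) - 1) // 2 + 1):
        out += coeffs[2 * k - 1] * np.cos(2 * np.pi * k * z)
        out += coeffs[2 * k] * np.sin(2 * np.pi * k * z)
    return out > 0.5

occ = np.zeros(128); occ[40:90] = 1          # the object spans part of the ray
print(np.mean(reconstruct(fof_coefficients(occ)) == occ))
```

Stacking these coefficient vectors over all pixels yields the multi-channel image that a 2D CNN can consume.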
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ DiscoverPath: A Knowledge Refinement and Retrieval System for + Interdisciplinarity on Biomedical Research + + +
+ The exponential growth in scholarly publications necessitates advanced tools +for efficient article retrieval, especially in interdisciplinary fields where +diverse terminologies are used to describe similar research. Traditional +keyword-based search engines often fall short in assisting users who may not be +familiar with specific terminologies. To address this, we present a knowledge +graph-based paper search engine for biomedical research to enhance the user +experience in discovering relevant queries and articles. The system, dubbed +DiscoverPath, employs Named Entity Recognition (NER) and part-of-speech (POS) +tagging to extract terminologies and relationships from article abstracts to +create a KG. To reduce information overload, DiscoverPath presents users with a +focused subgraph containing the queried entity and its neighboring nodes and +incorporates a query recommendation system, enabling users to iteratively +refine their queries. The system is equipped with an accessible Graphical User +Interface that provides an intuitive visualization of the KG, query +recommendations, and detailed article information, enabling efficient article +retrieval, thus fostering interdisciplinary knowledge exploration. DiscoverPath +is open-sourced at https://github.com/ynchuang/DiscoverPath. + +
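A toy sketch of the abstract-to-knowledge-graph step using off-the-shelf NER/POS tooling follows; the extraction rule here (noun-phrase co-occurrence within a sentence) is a simplification and is not DiscoverPath's actual pipeline.

```python
import itertools
import spacy                 # pip install spacy && python -m spacy download en_core_web_sm
import networkx as nx

nlp = spacy.load("en_core_web_sm")

def abstract_to_graph(text):
    """Toy version of NER/POS-based KG construction: treat noun phrases as
    entities and connect those that co-occur in a sentence. DiscoverPath's
    extraction rules are richer; this only illustrates the shape of the
    pipeline."""
    graph = nx.Graph()
    doc = nlp(text)
    for sent in doc.sents:
        # Keep noun phrases whose head is a noun/proper noun (POS filter).
        terms = [chunk.text.lower() for chunk in sent.noun_chunks
                 if chunk.root.pos_ in {"NOUN", "PROPN"}]
        for a, b in itertools.combinations(set(terms), 2):
            graph.add_edge(a, b)
    return graph

g = abstract_to_graph("Metformin reduces hepatic glucose production in type 2 diabetes.")
print(sorted(g.nodes))
```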
+
+
+
+
+ + ☆ CRUISE-Screening: Living Literature Reviews Toolbox CIKM 2023 + + +
+ Keeping up with research and finding related work is still a time-consuming +task for academics. Researchers sift through thousands of studies to identify a +few relevant ones. Automation techniques can help by increasing the efficiency +and effectiveness of this task. To this end, we developed CRUISE-Screening, a +web-based application for conducting living literature reviews - a type of +literature review that is continuously updated to reflect the latest research +in a particular field. CRUISE-Screening is connected to several search engines +via an API, which allows for updating the search results periodically. +Moreover, it can facilitate the process of screening for relevant publications +by using text classification and question answering models. CRUISE-Screening +can be used both by researchers conducting literature reviews and by those +working on automating the citation screening process to validate their +algorithms. The application is open-source: +https://github.com/ProjectDoSSIER/cruise-screening, and a demo is available +under this URL: https://citation-screening.ec.tuwien.ac.at. We discuss the +limitations of our tool in Appendix A. + +
+
+ comment: Paper accepted at CIKM 2023. The arXiv version has an extra section + about limitations in the Appendix that is not present in the ACM version +
+
+
+
+
+ + ☆ Fair Ranking under Disparate Uncertainty UAI + + +
+ Ranking is a ubiquitous method for focusing the attention of human evaluators +on a manageable subset of options. Its use ranges from surfacing potentially +relevant products on an e-commerce site to prioritizing college applications +for human review. While ranking can make human evaluation far more effective by +focusing attention on the most promising options, we argue that it can +introduce unfairness if the uncertainty of the underlying relevance model +differs between groups of options. Unfortunately, such disparity in uncertainty +appears widespread, since the relevance estimates for minority groups tend to +have higher uncertainty due to a lack of data or appropriate features. To +overcome this fairness issue, we propose Equal-Opportunity Ranking (EOR) as a +new fairness criterion for ranking that provably corrects for the disparity in +uncertainty between groups. Furthermore, we present a practical algorithm for +computing EOR rankings in time $O(n \log(n))$ and prove its close approximation +guarantee to the globally optimal solution. In a comprehensive empirical +evaluation on synthetic data, a US Census dataset, and a real-world case study +of Amazon search queries, we find that the algorithm reliably guarantees EOR +fairness while providing effective rankings. + +
+
+ comment: A version of this paper was accepted as Spotlight (Oral) at UAI + workshop on Epistemic in AI, 2023 +
+
+
+
+
+ + ☆ OutRank: Speeding up AutoML-based Model Search for Large Sparse Data + sets with Cardinality-aware Feature Ranking RecSys2023 + + +
+ The design of modern recommender systems relies on understanding which parts +of the feature space are relevant for solving a given recommendation task. +However, real-world data sets in this domain are often characterized by their +large size, sparsity, and noise, making it challenging to identify meaningful +signals. Feature ranking represents an efficient branch of algorithms that can +help address these challenges by identifying the most informative features and +facilitating the automated search for more compact and better-performing models +(AutoML). We introduce OutRank, a system for versatile feature ranking and data +quality-related anomaly detection. OutRank was built with categorical data in +mind, utilizing a variant of mutual information that is normalized with regard +to the noise produced by features of the same cardinality. We further extend +the similarity measure by incorporating information on feature similarity and +combined relevance. The proposed approach's feasibility is demonstrated by +speeding up the state-of-the-art AutoML system on a synthetic data set with no +performance loss. Furthermore, we considered a real-life click-through-rate +prediction data set where it outperformed strong baselines such as random +forest-based approaches. The proposed approach enables exploration of up to +300% larger feature spaces compared to AutoML-only approaches, enabling faster +search for better models on off-the-shelf hardware. + +
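One plausible reading of the cardinality-aware normalization is sketched below: compare a feature's mutual information with the target against the mutual information obtained from a randomly permuted feature of the same cardinality, so that high-cardinality noise is not mistaken for signal. OutRank's exact estimator may differ.

```python
import numpy as np
from sklearn.metrics import mutual_info_score

def cardinality_aware_mi(feature, target, n_shuffles=30, seed=0):
    """Divide observed MI by the average MI of a permuted copy of the feature,
    i.e. noise of the same cardinality. An interpretation of the idea, not
    OutRank's published formula."""
    rng = np.random.default_rng(seed)
    observed = mutual_info_score(feature, target)
    noise = np.mean([mutual_info_score(rng.permutation(feature), target)
                     for _ in range(n_shuffles)])
    return observed / max(noise, 1e-12)

y = np.random.default_rng(1).integers(0, 2, 5000)
informative = y ^ (np.random.default_rng(2).random(5000) < 0.1)   # correlated with y
random_ids = np.random.default_rng(3).integers(0, 1000, 5000)     # high-cardinality noise
print(cardinality_aware_mi(informative, y), cardinality_aware_mi(random_ids, y))
```

The informative feature scores far above 1, while the high-cardinality identifier stays near 1 despite its raw MI being inflated by cardinality.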
+
+ comment: accepted to RecSys2023 +
+
+
+
+
+ + ☆ Interactive Graph Convolutional Filtering + + +
+ Interactive Recommender Systems (IRS) have been increasingly used in various +domains, including personalized article recommendation, social media, and +online advertising. However, IRS faces significant challenges in providing +accurate recommendations under limited observations, especially in the context +of interactive collaborative filtering. These problems are exacerbated by the +cold start problem and data sparsity problem. Existing Multi-Armed Bandit +methods, despite their carefully designed exploration strategies, often +struggle to provide satisfactory results in the early stages due to the lack of +interaction data. Furthermore, these methods are computationally intractable +when applied to non-linear models, limiting their applicability. To address +these challenges, we propose a novel method, the Interactive Graph +Convolutional Filtering model. Our proposed method extends interactive +collaborative filtering into the graph model to enhance the performance of +collaborative filtering between users and items. We incorporate variational +inference techniques to overcome the computational hurdles posed by non-linear +models. Furthermore, we employ Bayesian meta-learning methods to effectively +address the cold-start problem and derive theoretical regret bounds for our +proposed method, ensuring a robust performance guarantee. Extensive +experimental results on three real-world datasets validate our method and +demonstrate its superiority over existing baselines. + +
+
+
+
+
+ + ☆ This Is a Local Domain: On Amassing Country-Code Top-Level Domains from + Public Data + + +
+ Domain lists are a key ingredient for representative censuses of the Web. +Unfortunately, such censuses typically lack a view on domains under +country-code top-level domains (ccTLDs). This introduces unwanted bias: many +countries have a rich local Web that remains hidden if their ccTLDs are not +considered. The reason ccTLDs are rarely considered is that gaining access -- +if possible at all -- is often laborious. To tackle this, we ask: what can we +learn about ccTLDs from public sources? We extract domain names under ccTLDs +from 6 years of public data from Certificate Transparency logs and Common +Crawl. We compare this against ground truth for 19 ccTLDs for which we have the +full DNS zone. We find that public data covers 43%-80% of these ccTLDs, and +that coverage grows over time. By also comparing port scan data we then show +that these public sources reveal a significant part of the Web presence under a +ccTLD. We conclude that in the absence of full access to ccTLDs, domain names +learned from public sources can be a good proxy when performing Web censuses. + +
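A minimal sketch of the domain-extraction step: filter hostnames harvested from public sources down to registrable domains under selected ccTLDs. The two-label heuristic is an assumption for illustration; a real pipeline would use the Public Suffix List to handle registries such as .co.uk.

```python
# Filter hostnames (e.g. from CT logs or Common Crawl host lists) down to
# registered domains under chosen ccTLDs. The two-label rule below is only a
# heuristic; production code should consult the Public Suffix List.
CCTLDS = {"nl", "se", "fr"}          # example ccTLDs of interest

def registered_domains(hostnames, cctlds=CCTLDS):
    found = set()
    for host in hostnames:
        labels = host.lower().rstrip(".").split(".")
        if len(labels) >= 2 and labels[-1] in cctlds:
            found.add(".".join(labels[-2:]))   # registrable domain (heuristic)
    return found

hosts = ["www.example.nl", "mail.foo.example.se", "cdn.example.com"]
print(sorted(registered_domains(hosts)))       # ['example.nl', 'example.se']
```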
+
+ comment: 6 pages double-column, 4 figures; submitted to ACM SIGCOMM CCR +
+
+
+
+
+ + ☆ AVATAR: Robust Voice Search Engine Leveraging Autoregressive Document + Retrieval and Contrastive Learning + + +
+ Voice input has progressively become popular on mobile devices and seems +poised to largely supplant text input. Through voice, a voice search (VS) +system can provide a more natural way to meet users' information needs. +However, errors from the automatic speech recognition (ASR) system can be +catastrophic to the VS system. Building on recent advanced lightweight +autoregressive retrieval models, which have the potential to be deployed on +mobiles and thus enable a more secure and personal VS assistant, this paper +presents a novel study of VS leveraging autoregressive retrieval and tackles +the crucial problems facing VS, viz. the performance drop caused by ASR noise, +via data augmentation and contrastive learning, showing how explicitly and +implicitly modeling the noise patterns can alleviate the problems. A series of +experiments conducted on the Open-Domain Question Answering (ODSQA) dataset +confirms our approach's effectiveness and robustness in relation to some strong +baseline systems. + +
+
+
+
+
+ + ☆ PreprintResolver: Improving Citation Quality by Resolving Published + Versions of ArXiv Preprints using Literature Databases + + +
+ The growing impact of preprint servers enables the rapid sharing of +time-sensitive research. Likewise, it is becoming increasingly difficult to +distinguish high-quality, peer-reviewed research from preprints. Although +preprints are often later published in peer-reviewed journals, this information +is often missing from preprint servers. To overcome this problem, the +PreprintResolver was developed, which uses four literature databases (DBLP, +SemanticScholar, OpenAlex, and CrossRef / CrossCite) to identify +preprint-publication pairs for the arXiv preprint server. The target audience +focuses on, but is not limited to inexperienced researchers and students, +especially from the field of computer science. The tool is based on a fuzzy +matching of author surnames, titles, and DOIs. Experiments were performed on a +sample of 1,000 arXiv-preprints from the research field of computer science and +without any publication information. With 77.94 %, computer science is highly +affected by missing publication information in arXiv. The results show that the +PreprintResolver was able to resolve 603 out of 1,000 (60.3 %) arXiv-preprints +from the research field of computer science and without any publication +information. All four literature databases contributed to the final result. In +a manual validation, a random sample of 100 resolved preprints was checked. For +all preprints, at least one result is plausible. For nine preprints, more than +one result was identified, three of which are partially invalid. In conclusion +the PreprintResolver is suitable for individual, manually reviewed requests, +but less suitable for bulk requests. The PreprintResolver tool +(https://preprintresolver.eu, Available from 2023-08-01) and source code +(https://gitlab.com/ippolis_wp3/preprint-resolver, Accessed: 2023-07-19) is +available online. + +
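A small sketch of the fuzzy title-matching component using Python's standard difflib follows; the normalization and threshold are illustrative, and the actual tool additionally checks author surnames and DOIs.

```python
from difflib import SequenceMatcher

def titles_match(preprint_title, candidate_title, threshold=0.9):
    """Fuzzy title comparison in the spirit of the PreprintResolver matching
    step (surname/DOI checks omitted). Normalization and threshold are
    illustrative, not the tool's exact settings."""
    def norm(s):
        kept = "".join(c for c in s.lower() if c.isalnum() or c.isspace())
        return " ".join(kept.split())
    return SequenceMatcher(None, norm(preprint_title), norm(candidate_title)).ratio() >= threshold

print(titles_match("Attention Is All You Need",
                   "Attention is all you need."))        # True
print(titles_match("Attention Is All You Need",
                   "Image Transformers at Scale"))       # False
```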
+
+ comment: Accepted for International Conference on Theory and Practice of + Digital Libraries (TPDL 2023) +
+
+
+
+
+ + ☆ ReOnto: A Neuro-Symbolic Approach for Biomedical Relation Extraction ECML 2023 + + +
+ Relation Extraction (RE) is the task of extracting semantic relationships +between entities in a sentence and aligning them to relations defined in a +vocabulary, which is generally in the form of a Knowledge Graph (KG) or an +ontology. Various approaches have been proposed so far to address this task. +However, applying these techniques to biomedical text often yields +unsatisfactory results because it is hard to infer relations directly from +sentences due to the nature of the biomedical relations. To address these +issues, we present a novel technique called ReOnto, that makes use of neuro +symbolic knowledge for the RE task. ReOnto employs a graph neural network to +acquire the sentence representation and leverages publicly accessible +ontologies as prior knowledge to identify the sentential relation between two +entities. The approach involves extracting the relation path between the two +entities from the ontology. We evaluate the effect of using symbolic knowledge +from ontologies with graph neural networks. Experimental results on two public +biomedical datasets, BioRel and ADE, show that our method outperforms all the +baselines (approximately by 3\%). + +
+
+ comment: Accepted in ECML 2023 +
+
+
+
+
+ + ☆ Distributional Domain-Invariant Preference Matching for Cross-Domain + Recommendation ICDM 2023 + + +
+ Learning accurate cross-domain preference mappings in the absence of +overlapped users/items has presented a persistent challenge in Non-overlapping +Cross-domain Recommendation (NOCDR). Despite the efforts made in previous +studies to address NOCDR, several limitations still exist. Specifically, 1) +while some approaches substitute overlapping users/items with overlapping +behaviors, they cannot handle NOCDR scenarios where such auxiliary information +is unavailable; 2) often, cross-domain preference mapping is modeled by +learning deterministic explicit representation matchings between sampled users +in two domains. However, this can be biased due to individual preferences and +thus fails to incorporate preference continuity and universality of the general +population. In light of this, we assume that despite the scattered nature of +user behaviors, there exists a consistent latent preference distribution shared +among common people. Modeling such distributions further allows us to capture +the continuity in user behaviors within each domain and discover preference +invariance across domains. To this end, we propose a Distributional +domain-invariant Preference Matching method for non-overlapping Cross-Domain +Recommendation (DPMCDR). For each domain, we hierarchically approximate a +posterior of domain-level preference distribution with empirical evidence +derived from user-item interactions. Next, we aim to build distributional +implicit matchings between the domain-level preferences of two domains. This +process involves mapping them to a shared latent space and seeking a consensus +on domain-invariant preference by minimizing the distance between their +distributional representations therein. In this way, we can identify the +alignment of two non-overlapping domains if they exhibit similar patterns of +domain-invariant preference. + +
+
+ comment: 9 pages, 5 figures, full research paper accepted by ICDM 2023 +
+
+
+
+
+ + ☆ In-processing User Constrained Dominant Sets for User-Oriented Fairness + in Recommender Systems + + +
+ Recommender systems are typically biased toward a small group of users, +leading to severe unfairness in recommendation performance, i.e., User-Oriented +Fairness (UOF) issue. The existing research on UOF is limited and fails to deal +with the root cause of the UOF issue: the learning process between advantaged +and disadvantaged users is unfair. To tackle this issue, we propose an +In-processing User Constrained Dominant Sets (In-UCDS) framework, which is a +general framework that can be applied to any backbone recommendation model to +achieve user-oriented fairness. We split In-UCDS into two stages, i.e., the +UCDS modeling stage and the in-processing training stage. In the UCDS modeling +stage, for each disadvantaged user, we extract a constrained dominant set (a +user cluster) containing some advantaged users that are similar to it. In the +in-processing training stage, we move the representations of disadvantaged +users closer to their corresponding cluster by calculating a fairness loss. By +combining the fairness loss with the original backbone model loss, we address +the UOF issue and maintain the overall recommendation performance +simultaneously. Comprehensive experiments on three real-world datasets +demonstrate that In-UCDS outperforms the state-of-the-art methods, leading to a +fairer model with better overall recommendation performance. + +
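A hedged sketch of the in-processing fairness term: pull each disadvantaged user's embedding toward the centroid of its constrained dominant set of similar advantaged users, and add this term to the backbone loss. The squared-distance form and the fixed weight are assumptions, not the paper's exact loss.

```python
import torch

def fairness_loss(user_emb, disadvantaged_ids, cluster_map):
    """Sketch of the In-UCDS idea: move disadvantaged users' representations
    toward the centroid of their constrained dominant set (a cluster of similar
    advantaged users). The exact loss in the paper may differ.

    cluster_map: dict mapping a disadvantaged user id -> list of advantaged ids.
    """
    losses = []
    for uid in disadvantaged_ids:
        centroid = user_emb[cluster_map[uid]].mean(dim=0).detach()
        losses.append(((user_emb[uid] - centroid) ** 2).sum())
    return torch.stack(losses).mean()

emb = torch.randn(10, 16, requires_grad=True)
# Combined objective would be backbone_loss + weight * fairness term.
loss = 1.0 * fairness_loss(emb, [7, 8], {7: [0, 1, 2], 8: [3, 4]})
loss.backward()
print(loss.item())
```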
+
+
+
+
+ + ♻ ☆ Iteratively Learning Representations for Unseen Entities with Inter-Rule + Correlations CIKM 2023 + + +
+ Recent work on knowledge graph completion (KGC) focused on learning +embeddings of entities and relations in knowledge graphs. These embedding +methods require that all test entities are observed at training time, resulting +in a time-consuming retraining process for out-of-knowledge-graph (OOKG) +entities. To address this issue, current inductive knowledge embedding methods +employ graph neural networks (GNNs) to represent unseen entities by aggregating +information of known neighbors. They face three important challenges: (i) data +sparsity, (ii) the presence of complex patterns in knowledge graphs (e.g., +inter-rule correlations), and (iii) the presence of interactions among rule +mining, rule inference, and embedding. In this paper, we propose a virtual +neighbor network with inter-rule correlations (VNC) that consists of three +stages: (i) rule mining, (ii) rule inference, and (iii) embedding. In the rule +mining process, to identify complex patterns in knowledge graphs, both logic +rules and inter-rule correlations are extracted from knowledge graphs based on +operations over relation embeddings. To reduce data sparsity, virtual neighbors +for OOKG entities are predicted and assigned soft labels by optimizing a +rule-constrained problem. We also devise an iterative framework to capture the +underlying relations between rule learning and embedding learning. In our +experiments, results on both link prediction and triple classification tasks +show that the proposed VNC framework achieves state-of-the-art performance on +four widely-used knowledge graphs. Further analysis reveals that VNC is robust +to the proportion of unseen entities and effectively mitigates data sparsity. + +
+
+ comment: Accepted at CIKM 2023 +
+
+
+
+
+ + ♻ ☆ MemoNet: Memorizing All Cross Features' Representations Efficiently via + Multi-Hash Codebook Network for CTR Prediction + + +
+ New findings in natural language processing (NLP) demonstrate that strong +memorization capability contributes substantially to the success of Large +Language Models (LLMs). This inspires us to explicitly bring an independent +memory mechanism into the CTR ranking model to learn and memorize cross +features' representations. In this paper, we propose the multi-Hash Codebook +NETwork (HCNet) as the memory mechanism for efficiently learning and memorizing +representations of cross features in CTR tasks. HCNet uses a multi-hash +codebook as the main memory place, and the whole memory procedure consists of +three phases: multi-hash addressing, memory restoring, and feature shrinking. +We also propose a new CTR model named MemoNet which combines HCNet with a DNN +backbone. Extensive experimental results on three public datasets and an online +test show that MemoNet reaches superior performance over state-of-the-art +approaches. Besides, MemoNet exhibits the scaling law observed in large +language models in NLP, which means we can enlarge the size of the codebook in +HCNet to sustainably obtain performance gains. Our work demonstrates the +importance and feasibility of learning and memorizing representations of cross +features, which sheds light on a new promising research direction. + +
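A simplified sketch of the multi-hash addressing idea: hash each cross-feature id with several hash functions into one shared codebook and combine the retrieved codewords. The modular hash functions and the linear "restoring" layer below are stand-ins for HCNet's actual design.

```python
import torch
import torch.nn as nn

class MultiHashCodebook(nn.Module):
    """Sketch of multi-hash addressing for cross features. The hash scheme and
    the restoring/shrinking layers are simplified stand-ins, not HCNet."""

    def __init__(self, codebook_size=10000, dim=16, n_hashes=4):
        super().__init__()
        self.codebook = nn.Embedding(codebook_size, dim)
        self.seeds = [1000003 * (i + 1) + 17 for i in range(n_hashes)]
        self.restore = nn.Linear(n_hashes * dim, dim)   # memory-restoring step

    def forward(self, cross_feature_ids):
        # Multi-hash addressing: several slots per cross-feature id.
        slots = [(cross_feature_ids * seed) % self.codebook.num_embeddings
                 for seed in self.seeds]
        codes = torch.cat([self.codebook(s) for s in slots], dim=-1)
        return self.restore(codes)                      # shrunk representation

ids = torch.randint(0, 10**9, (8,))
print(MultiHashCodebook()(ids).shape)                   # torch.Size([8, 16])
```

Using several hashes into one shared table keeps collisions rare while the memory cost stays fixed by the codebook size, which is also the knob one would enlarge to trade memory for accuracy.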
+
+
+
+
+ + ♻ ☆ Retrieval-augmented GPT-3.5-based Text-to-SQL Framework with + Sample-aware Prompting and Dynamic Revision Chain + + +
+ Text-to-SQL aims at generating SQL queries for the given natural language +questions and thus helping users to query databases. Prompt learning with large +language models (LLMs) has emerged as a recent approach, which designs prompts +to lead LLMs to understand the input question and generate the corresponding +SQL. However, it faces challenges with strict SQL syntax requirements. Existing +work prompts the LLMs with a list of demonstration examples (i.e. question-SQL +pairs) to generate SQL, but the fixed prompts can hardly handle the scenario +where the semantic gap between the retrieved demonstration and the input +question is large. In this paper, we propose a retrieval-augmented prompting +method for a LLM-based Text-to-SQL framework, involving sample-aware prompting +and a dynamic revision chain. Our approach incorporates sample-aware +demonstrations, which include the composition of SQL operators and fine-grained +information related to the given question. To retrieve questions sharing +similar intents with input questions, we propose two strategies for assisting +retrieval. Firstly, we leverage LLMs to simplify the original questions, +unifying the syntax and thereby clarifying the users' intentions. To generate +executable and accurate SQLs without human intervention, we design a dynamic +revision chain which iteratively adapts fine-grained feedback from the +previously generated SQL. Experimental results on three Text-to-SQL benchmarks +demonstrate the superiority of our method over strong baseline models. + +
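A schematic of the prompting loop with a dynamic revision chain follows; `llm` and `execute` are hypothetical callables standing in for an LLM API and a database executor, and the prompt wording is illustrative rather than the paper's template.

```python
def generate_sql(question, demonstrations, llm, execute, max_revisions=3):
    """Sketch of retrieval-augmented prompting with a dynamic revision chain.
    `llm(prompt) -> str` and `execute(sql) -> (ok, feedback)` are hypothetical
    callables, not part of any specific library."""
    # Sample-aware demonstrations retrieved for this question.
    prompt = "\n\n".join(f"Q: {q}\nSQL: {s}" for q, s in demonstrations)
    prompt += f"\n\nQ: {question}\nSQL:"
    sql = llm(prompt)
    for _ in range(max_revisions):
        ok, feedback = execute(sql)        # e.g. (False, "no such column: name")
        if ok:
            break
        # Feed fine-grained execution feedback back into the next prompt.
        prompt += f" {sql}\n-- execution feedback: {feedback}\nRevised SQL:"
        sql = llm(prompt)
    return sql
```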
+
+
+
+
+
+
+
+ + Machine Learning 25 + +
+
+
+ + ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for the addition of multi-modal +information with an existing pipeline for continuous sign language recognition +and translation. In our procedure, we have incorporated optical flow +information with RGB images to enrich the features with movement-related +information. This work studies the feasibility of such modality inclusion using +a cross-modal encoder. The plugin we have used is very lightweight and doesn't +need to include a separate feature extractor for the new modality in an +end-to-end manner. We have applied the changes in both sign language +recognition and translation, improving the result in each case. We have +evaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language +recognition and the RWTH-PHOENIX-2014T dataset for translation. On the +recognition task, our approach reduced the WER by 0.9, and on the translation +task, our approach increased most of the BLEU scores by ~0.6 on the test set. + +
+
+
+
+
+ + ☆ Efficient Defense Against Model Stealing Attacks on Convolutional Neural + Networks ICML + + +
+ Model stealing attacks have become a serious concern for deep learning +models, where an attacker can steal a trained model by querying its black-box +API. This can lead to intellectual property theft and other security and +privacy risks. The current state-of-the-art defenses against model stealing +attacks suggest adding perturbations to the prediction probabilities. However, +they suffer from heavy computations and make impracticable assumptions about +the adversary. They often require the training of auxiliary models. This can be +time-consuming and resource-intensive which hinders the deployment of these +defenses in real-world applications. In this paper, we propose a simple yet +effective and efficient defense alternative. We introduce a heuristic approach +to perturb the output probabilities. The proposed defense can be easily +integrated into models without additional training. We show that our defense is +effective in defending against three state-of-the-art stealing attacks. We +evaluate our approach on large and quantized (i.e., compressed) Convolutional +Neural Networks (CNNs) trained on several vision datasets. Our technique +outperforms the state-of-the-art defenses with a $\times37$ faster inference +latency without requiring any additional model and with a low impact on the +model's performance. We validate that our defense is also effective for +quantized CNNs targeting edge devices. + +
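For illustration, here is a generic accuracy-preserving output perturbation in the spirit described above: flatten and add noise to the predicted distribution while keeping the top-1 class fixed. This is not the paper's specific heuristic; the temperature and noise scale are arbitrary.

```python
import numpy as np

def perturb_probs(probs, temperature=3.0, noise_scale=0.1, seed=None):
    """Perturb a classifier's output distribution without changing the top-1
    prediction. A generic heuristic sketch, not the paper's defense."""
    rng = np.random.default_rng(seed)
    top1 = int(np.argmax(probs))
    logits = np.log(np.clip(probs, 1e-12, 1.0)) / temperature   # flatten
    logits += rng.normal(0.0, noise_scale, size=logits.shape)   # add noise
    out = np.exp(logits) / np.exp(logits).sum()
    j = int(np.argmax(out))
    if j != top1:                            # restore the original prediction
        out[top1], out[j] = out[j], out[top1]
    return out

p = np.array([0.7, 0.2, 0.1])
print(perturb_probs(p, seed=0), np.argmax(perturb_probs(p, seed=0)) == np.argmax(p))
```

Because only the probability vector is distorted, benign top-1 accuracy is unaffected while an attacker training on the returned probabilities receives a noisier supervision signal.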
+
+ comment: Accepted for publication at 2023 International Conference on Machine + Learning and Applications (ICMLA) +
+
+
+
+
+ + ☆ Delegating Data Collection in Decentralized Machine Learning + + +
+ Motivated by the emergence of decentralized machine learning ecosystems, we +study the delegation of data collection. Taking the field of contract theory as +our starting point, we design optimal and near-optimal contracts that deal with +two fundamental machine learning challenges: lack of certainty in the +assessment of model quality and lack of knowledge regarding the optimal +performance of any model. We show that lack of certainty can be dealt with via +simple linear contracts that achieve 1-1/e fraction of the first-best utility, +even if the principal has a small test set. Furthermore, we give sufficient +conditions on the size of the principal's test set that achieves a vanishing +additive approximation to the optimal utility. To address the lack of a priori +knowledge regarding the optimal performance, we give a convex program that can +adaptively and efficiently compute the optimal contract. + +
+
+
+
+
+ + ☆ Soft-Dropout: A Practical Approach for Mitigating Overfitting in Quantum + Convolutional Neural Networks + + +
+ Quantum convolutional neural network (QCNN), an early application for quantum +computers in the NISQ era, has been consistently proven successful as a machine +learning (ML) algorithm for several tasks with significant accuracy. Derived +from its classical counterpart, QCNN is prone to overfitting. Overfitting is a +typical shortcoming of ML models that are trained too closely to the available +training dataset and perform relatively poorly on unseen datasets for a similar +problem. In this work we study the adaptation of one of the most successful +overfitting mitigation methods, known as the (post-training) dropout method, to +the quantum setting. We find that a straightforward implementation of this +method in the quantum setting leads to a significant and undesirable +consequence: a substantial decrease in the success probability of the QCNN. We +argue that this effect exposes the crucial role of entanglement in QCNNs and +the vulnerability of QCNNs to entanglement loss. To handle overfitting, we +propose a softer version of the dropout method. We find that the proposed +method allows us to successfully handle overfitting in the test cases. + +
+
+ comment: 9 pages, 14 images, 6 tables +
+
+
+
+
+ + ☆ Secure and Efficient Federated Learning in LEO Constellations using + Decentralized Key Generation and On-Orbit Model Aggregation + + +
+ Satellite technologies have advanced drastically in recent years, leading to +a heated interest in launching small satellites into low Earth orbit (LEOs) to +collect massive data such as satellite imagery. Downloading these data to a +ground station (GS) to perform centralized learning to build an AI model is not +practical due to the limited and expensive bandwidth. Federated learning (FL) +offers a potential solution but will incur a very large convergence delay due +to the highly sporadic and irregular connectivity between LEO satellites and +GS. In addition, there are significant security and privacy risks where +eavesdroppers or curious servers/satellites may infer raw data from satellites' +model parameters transmitted over insecure communication channels. To address +these issues, this paper proposes FedSecure, a secure FL approach designed for +LEO constellations, which consists of two novel components: (1) decentralized +key generation that protects satellite data privacy using a functional +encryption scheme, and (2) on-orbit model forwarding and aggregation that +generates a partial global model per orbit to minimize the idle waiting time +for invisible satellites to enter the visible zone of the GS. Our analysis and +results show that FedSecure preserves the privacy of each satellite's data +against eavesdroppers, a curious server, or curious satellites. It is +lightweight with significantly lower communication and computation overheads +than other privacy-preserving FL aggregation approaches. It also reduces +convergence delay drastically from days to only a few hours, yet achieving high +accuracy of up to 85.35% using realistic satellite images. + +
+
+
+
+
+ + ☆ LoopTune: Optimizing Tensor Computations with Reinforcement Learning + + +
+ Advanced compiler technology is crucial for enabling machine learning +applications to run on novel hardware, but traditional compilers fail to +deliver performance, popular auto-tuners have long search times and +expert-optimized libraries introduce unsustainable costs. To address this, we +developed LoopTune, a deep reinforcement learning compiler that optimizes +tensor computations in deep learning models for the CPU. LoopTune optimizes +tensor traversal order while using the ultra-fast lightweight code generator +LoopNest to perform hardware-specific optimizations. With a novel graph-based +representation and action space, LoopTune speeds up LoopNest by 3.2x, +generating an order of magnitude faster code than TVM, 2.8x faster than +MetaSchedule, and 1.08x faster than AutoTVM, consistently performing at the +level of the hand-tuned library Numpy. Moreover, LoopTune tunes code in order +of seconds. + +
+
+
+
+
+ + ☆ On the fly Deep Neural Network Optimization Control for Low-Power + Computer Vision + + +
+ Processing visual data on mobile devices has many applications, e.g., +emergency response and tracking. State-of-the-art computer vision techniques +rely on large Deep Neural Networks (DNNs) that are usually too power-hungry to +be deployed on resource-constrained edge devices. Many techniques improve the +efficiency of DNNs by using sparsity or quantization. However, the accuracy and +efficiency of these techniques cannot be adapted for diverse edge applications +with different hardware constraints and accuracy requirements. This paper +presents a novel technique to allow DNNs to adapt their accuracy and energy +consumption during run-time, without the need for any re-training. Our +technique called AdaptiveActivation introduces a hyper-parameter that controls +the output range of the DNNs' activation function to dynamically adjust the +sparsity and precision in the DNN. AdaptiveActivation can be applied to any +existing pre-trained DNN to improve their deployability in diverse edge +environments. We conduct experiments on popular edge devices and show that the +accuracy is within 1.5% of the baseline. We also show that our approach +requires 10%--38% less memory than the baseline techniques leading to more +accuracy-efficiency tradeoff options + +
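A plausible sketch of a run-time range-controlled activation follows: one knob raises the zeroing threshold (more sparsity), another caps the dynamic range (friendlier to low precision), and both can be changed without retraining. The exact functional form used by AdaptiveActivation is not specified here; this is an assumption for illustration.

```python
import torch

def adaptive_activation(x, threshold=0.0, ceiling=6.0):
    """Range-controlled ReLU sketch: `threshold` zeroes more small responses
    (higher sparsity), `ceiling` caps the dynamic range so activations fit a
    lower-precision representation. Both knobs can be changed at run time."""
    out = torch.where(x > threshold, x, torch.zeros_like(x))
    return torch.clamp(out, max=ceiling)

x = torch.randn(10000)
for t in (0.0, 0.5, 1.0):
    y = adaptive_activation(x, threshold=t)
    print(f"threshold={t}: sparsity={float((y == 0).float().mean()):.2f}, "
          f"max={float(y.max()):.2f}")
```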
+
+
+
+
+ + ☆ Computation and Communication Efficient Federated Learning over Wireless + Networks + + +
+ Federated learning (FL) allows model training from local data by edge devices +while preserving data privacy. However, the learning accuracy decreases due to +the heterogeneity of devices' data, and the computation and communication +latency increase when updating large-scale learning models on devices with +limited computational capability and wireless resources. To overcome these +challenges, we consider a novel FL framework with partial model pruning and +personalization. This framework splits the learning model into a pruned global +part shared with all devices to learn data representations and a personalized +part to be fine-tuned for a specific device; it adapts the model size during FL +to reduce both computation and communication overhead and minimize the overall +training time, and increases the learning accuracy for devices with +non-independent and identically distributed (non-IID) data. Then, the +computation and communication latency and the convergence of the proposed FL +framework are mathematically analyzed. Based on the convergence analysis, an +optimization problem is formulated to maximize the convergence rate under a +latency threshold by jointly optimizing the pruning ratio and wireless resource +allocation. By decoupling the optimization problem and applying +Karush-Kuhn-Tucker (KKT) conditions, we derive closed-form solutions for the +pruning ratio and wireless resource allocation. Finally, experimental results +demonstrate that the proposed FL framework achieves a remarkable reduction of +approximately 50 percent in computation and communication latency compared with +the scheme with model personalization only. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2305.09042 +
+
+
+
+
+ + ☆ DiscoverPath: A Knowledge Refinement and Retrieval System for + Interdisciplinarity on Biomedical Research + + +
+ The exponential growth in scholarly publications necessitates advanced tools +for efficient article retrieval, especially in interdisciplinary fields where +diverse terminologies are used to describe similar research. Traditional +keyword-based search engines often fall short in assisting users who may not be +familiar with specific terminologies. To address this, we present a knowledge +graph-based paper search engine for biomedical research to enhance the user +experience in discovering relevant queries and articles. The system, dubbed +DiscoverPath, employs Named Entity Recognition (NER) and part-of-speech (POS) +tagging to extract terminologies and relationships from article abstracts to +create a KG. To reduce information overload, DiscoverPath presents users with a +focused subgraph containing the queried entity and its neighboring nodes and +incorporates a query recommendation system, enabling users to iteratively +refine their queries. The system is equipped with an accessible Graphical User +Interface that provides an intuitive visualization of the KG, query +recommendations, and detailed article information, enabling efficient article +retrieval, thus fostering interdisciplinary knowledge exploration. DiscoverPath +is open-sourced at https://github.com/ynchuang/DiscoverPath. + +
+
+
+
+
+ + ☆ Marginalized Importance Sampling for Off-Environment Policy Evaluation + + +
+ Reinforcement Learning (RL) methods are typically sample-inefficient, making +it challenging to train and deploy RL-policies in real world robots. Even a +robust policy trained in simulation, requires a real-world deployment to assess +their performance. This paper proposes a new approach to evaluate the +real-world performance of agent policies without deploying them in the real +world. The proposed approach incorporates a simulator along with real-world +offline data to evaluate the performance of any policy using the framework of +Marginalized Importance Sampling (MIS). Existing MIS methods face two +challenges: (1) large density ratios that deviate from a reasonable range and +(2) indirect supervision, where the ratio needs to be inferred indirectly, thus +exacerbating estimation error. Our approach addresses these challenges by +introducing the target policy's occupancy in the simulator as an intermediate +variable and learning the density ratio as the product of two terms that can be +learned separately. The first term is learned with direct supervision and the +second term has a small magnitude, thus making it easier to run. We analyze the +sample complexity as well as error propagation of our two step-procedure. +Furthermore, we empirically evaluate our approach on Sim2Sim environments such +as Cartpole, Reacher and Half-Cheetah. Our results show that our method +generalizes well across a variety of Sim2Sim gap, target policies and offline +data collection policies. We also demonstrate the performance of our algorithm +on a Sim2Real task of validating the performance of a 7 DOF robotic arm using +offline data along with a gazebo based arm simulator. + +
+
+
+
+
+ + ☆ Asymmetric matrix sensing by gradient descent with small random + initialization + + +
+ We study matrix sensing, which is the problem of reconstructing a low-rank +matrix from a few linear measurements. It can be formulated as an +overparameterized regression problem, which can be solved by factorized +gradient descent when starting from a small random initialization. + Linear neural networks, and in particular matrix sensing by factorized +gradient descent, serve as prototypical models of non-convex problems in modern +machine learning, where complex phenomena can be disentangled and studied in +detail. Much research has been devoted to studying special cases of asymmetric +matrix sensing, such as asymmetric matrix factorization and symmetric positive +semi-definite matrix sensing. + Our key contribution is introducing a continuous differential equation that +we call the $\textit{perturbed gradient flow}$. We prove that the perturbed +gradient flow converges quickly to the true target matrix whenever the +perturbation is sufficiently bounded. The dynamics of gradient descent for +matrix sensing can be reduced to this formulation, yielding a novel proof of +asymmetric matrix sensing with factorized gradient descent. Compared to +directly analyzing the dynamics of gradient descent, the continuous formulation +allows bounding key quantities by considering their derivatives, often +simplifying the proofs. We believe the general proof technique may prove useful +in other settings as well. + +
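The discrete dynamics being analyzed can be reproduced in a few lines: factorized gradient descent on $X = UV^\top$ from a small random initialization against random Gaussian measurements. The problem sizes, step size, and initialization scale below are illustrative choices, not the paper's experimental setup.

```python
import numpy as np

rng = np.random.default_rng(0)
n, r, m = 20, 2, 600                        # matrix size, rank, number of measurements
M = rng.normal(size=(n, r)) @ rng.normal(size=(r, n))
M /= np.linalg.norm(M, 2)                   # normalize scale for a fixed step size
A = rng.normal(size=(m, n, n)) / np.sqrt(m) # random Gaussian sensing operators
y = np.einsum("kij,ij->k", A, M)            # linear measurements y_k = <A_k, M>

# Factorized gradient descent on X = U V^T from a small random initialization,
# the setting analyzed in the paper; hyperparameters here are illustrative.
U = 1e-3 * rng.normal(size=(n, r))
V = 1e-3 * rng.normal(size=(n, r))
eta = 0.2
for _ in range(5000):
    residual = np.einsum("kij,ij->k", A, U @ V.T) - y        # A(UV^T) - y
    G = np.einsum("k,kij->ij", residual, A)                  # gradient w.r.t. UV^T
    U, V = U - eta * G @ V, V - eta * G.T @ U
print(np.linalg.norm(U @ V.T - M) / np.linalg.norm(M))       # relative error, should be small
```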
+
+
+
+
+ + ☆ Composite federated learning with heterogeneous data + + +
+ We propose a novel algorithm for solving the composite Federated Learning +(FL) problem. This algorithm manages non-smooth regularization by strategically +decoupling the proximal operator and communication, and addresses client drift +without any assumptions about data similarity. Moreover, each worker uses local +updates to reduce the communication frequency with the server and transmits +only a $d$-dimensional vector per communication round. We prove that our +algorithm converges linearly to a neighborhood of the optimal solution and +demonstrate the superiority of our algorithm over state-of-the-art methods in +numerical experiments. + +
+
+
+
+
+ + ☆ Hierarchical Grammar-Induced Geometry for Data-Efficient Molecular + Property Prediction ICML 2023 + + +
+ The prediction of molecular properties is a crucial task in the field of +material and drug discovery. The potential benefits of using deep learning +techniques are reflected in the wealth of recent literature. Still, these +techniques are faced with a common challenge in practice: Labeled data are +limited by the cost of manual extraction from literature and laborious +experimentation. In this work, we propose a data-efficient property predictor +by utilizing a learnable hierarchical molecular grammar that can generate +molecules from grammar production rules. Such a grammar induces an explicit +geometry of the space of molecular graphs, which provides an informative prior +on molecular structural similarity. The property prediction is performed using +graph neural diffusion over the grammar-induced geometry. On both small and +large datasets, our evaluation shows that this approach outperforms a wide +spectrum of baselines, including supervised and pre-trained graph neural +networks. We include a detailed ablation study and further analysis of our +solution, showing its effectiveness in cases with extremely limited data. Code +is available at https://github.com/gmh14/Geo-DEG. + +
+
+ comment: 22 pages, 10 figures; ICML 2023 +
+
+
+
+
+ + ☆ ATMS: Algorithmic Trading-Guided Market Simulation + + +
+ The effective construction of an Algorithmic Trading (AT) strategy often +relies on market simulators, which remain challenging to build due to existing +methods' inability to adapt to the sequential and dynamic nature of trading +activities. This work fills this gap by proposing a metric to quantify market +discrepancy. This metric measures the difference between the causal effects +that arise from each market's unique underlying characteristics, and it is +evaluated through the interaction between the AT agent and the market. Most +importantly, we introduce Algorithmic Trading-guided Market Simulation (ATMS) +by optimizing our proposed metric. Inspired by SeqGAN, ATMS formulates the +simulator as a stochastic policy in reinforcement learning (RL) to account for +the sequential nature of trading. Moreover, ATMS utilizes the policy gradient +update to bypass differentiating the proposed metric, which involves +non-differentiable operations such as order deletion from the market. Through +extensive experiments on semi-real market data, we demonstrate the +effectiveness of our metric and show that ATMS generates market data with +improved similarity to reality compared to the state-of-the-art conditional +Wasserstein Generative Adversarial Network (cWGAN) approach. Furthermore, ATMS +produces market data with more balanced BUY and SELL volumes, mitigating the +bias of the cWGAN baseline approach, where a simple strategy can exploit the +BUY/SELL imbalance for profit. + +
+
+
+
+
+ + ☆ Survival Prediction from Imbalance colorectal cancer dataset using + hybrid sampling methods and tree-based classifiers + + +
+ Background and Objective: Colorectal cancer is a high-mortality cancer. +Clinical data analysis plays a crucial role in predicting the survival of +colorectal cancer patients, enabling clinicians to make informed treatment +decisions. However, utilizing clinical data can be challenging, especially when +dealing with imbalanced outcomes. This paper focuses on developing algorithms +to predict 1-, 3-, and 5-year survival of colorectal cancer patients using +clinical datasets, with particular emphasis on the highly imbalanced 1-year +survival prediction task. To address this issue, we propose a method that +creates a pipeline of standard balancing techniques to increase the true +positive rate. Evaluation is conducted on a colorectal cancer dataset from the +SEER database. Methods: The pre-processing step consists of removing records +with missing values and merging categories. The minority class of the 1-year +and 3-year survival tasks consists of 10% and 20% of the data, respectively. +Edited Nearest Neighbor, Repeated Edited Nearest Neighbor (RENN), Synthetic +Minority Over-sampling Technique (SMOTE), and pipelines of SMOTE and RENN were +used and compared for balancing the data with tree-based classifiers. Decision +Trees, Random Forest, Extra Tree, eXtreme Gradient Boosting, and Light Gradient +Boosting (LGBM) are used in this article. Results: The performance evaluation +utilizes a 5-fold cross-validation approach. In the case of the highly +imbalanced dataset (1-year), our proposed method with LGBM outperforms other +sampling methods with a sensitivity of 72.30%. For the imbalanced 3-year +survival task, the combination of RENN and LGBM achieves a sensitivity of +80.81%, indicating that our proposed method works best for highly imbalanced +datasets. Conclusions: Our proposed method significantly improves mortality +prediction for the minority class of colorectal cancer patients. + +
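A compact sketch of the SMOTE-plus-RENN pipeline with an LGBM classifier under 5-fold cross-validation, evaluated by minority-class sensitivity, is shown below. The synthetic data stands in for the SEER-derived features, and the resampler order and settings are assumptions rather than the paper's exact configuration.

```python
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

# Stand-in data with a ~10% minority class, mimicking the 1-year survival task;
# replace with the SEER-derived features to approximate the paper's setting.
X, y = make_classification(n_samples=5000, n_features=20, weights=[0.9, 0.1],
                           random_state=42)

pipeline = Pipeline([
    ("smote", SMOTE(random_state=42)),                 # oversample the minority class
    ("renn", RepeatedEditedNearestNeighbours()),       # clean noisy majority samples
    ("lgbm", LGBMClassifier(random_state=42)),
])

# 5-fold cross-validated sensitivity (recall on the minority class).
scores = cross_val_score(pipeline, X, y, cv=5, scoring="recall")
print(scores.mean())
```

The imblearn pipeline applies the resamplers only during fitting, so the cross-validated sensitivity is measured on untouched, still-imbalanced folds.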
+
+ comment: 19 Pages, 6 Figures, 4 Tables +
+
+
+
+
+ + ☆ 3D View Prediction Models of the Dorsal Visual Stream + + +
+ Deep neural network representations align well with brain activity in the +ventral visual stream. However, the primate visual system has a distinct dorsal +processing stream with different functional properties. To test if a model +trained to perceive 3D scene geometry aligns better with neural responses in +dorsal visual areas, we trained a self-supervised geometry-aware recurrent +neural network (GRNN) to predict novel camera views using a 3D feature memory. +We compared GRNN to self-supervised baseline models that have been shown to +align well with ventral regions using the large-scale fMRI Natural Scenes +Dataset (NSD). We found that while the baseline models accounted better for +ventral brain regions, GRNN accounted for a greater proportion of variance in +dorsal brain regions. Our findings demonstrate the potential for using +task-relevant models to probe representational differences across visual +streams. + +
+
+ comment: 2023 Conference on Cognitive Computational Neuroscience +
+
+
+
+
+ + ☆ Self-concordant Smoothing for Convex Composite Optimization + + +
+ We introduce the notion of self-concordant smoothing for minimizing the sum +of two convex functions: the first is smooth and the second may be nonsmooth. +Our framework results naturally from the smoothing approximation technique +referred to as partial smoothing in which only a part of the nonsmooth function +is smoothed. The key highlight of our approach is in a natural property of the +resulting problem's structure which provides us with a variable-metric +selection method and a step-length selection rule particularly suitable for +proximal Newton-type algorithms. In addition, we efficiently handle specific +structures promoted by the nonsmooth function, such as $\ell_1$-regularization +and group-lasso penalties. We prove local quadratic convergence rates for two +resulting algorithms: Prox-N-SCORE, a proximal Newton algorithm and +Prox-GGN-SCORE, a proximal generalized Gauss-Newton (GGN) algorithm. The +Prox-GGN-SCORE algorithm highlights an important approximation procedure which +helps to significantly reduce most of the computational overhead associated +with the inverse Hessian. This approximation is essentially useful for +overparameterized machine learning models and in the mini-batch settings. +Numerical examples on both synthetic and real datasets demonstrate the +efficiency of our approach and its superiority over existing approaches. + +
+
+ comment: 37 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Measuring, Interpreting, and Improving Fairness of Algorithms using + Causal Inference and Randomized Experiments + + +
+ Algorithm fairness has become a central problem for the broad adoption of
+artificial intelligence. Although the past decade has witnessed an explosion of
+excellent work studying algorithm biases, achieving fairness in real-world AI
+production systems has remained a challenging task. Most existing works fail to
+excel in practical applications since they either rely on conflicting
+measurement techniques and/or heavy assumptions, or require code access to the
+production models, whereas real systems demand an easy-to-implement measurement
+framework and a systematic way to correct the detected sources of bias.
+ In this paper, we leverage recent advances in causal inference and
+interpretable machine learning to present an algorithm-agnostic framework
+(MIIF) to Measure, Interpret, and Improve the Fairness of an algorithmic
+decision. We measure the algorithm bias using randomized experiments, which
+enables the simultaneous measurement of disparate treatment, disparate impact,
+and economic value. Furthermore, using modern interpretability techniques, we
+develop an explainable machine learning model which accurately interprets and
+distills the beliefs of a black-box algorithm. Altogether, these techniques
+create a simple and powerful toolset for studying algorithm fairness,
+especially for understanding the cost of fairness in practical applications
+like e-commerce and targeted advertising, where industry A/B testing is already
+abundant.
+
+
+
+
+
+ + ☆ DRAG: Divergence-based Adaptive Aggregation in Federated learning on + Non-IID Data + + +
+ Local stochastic gradient descent (SGD) is a fundamental approach in +achieving communication efficiency in Federated Learning (FL) by allowing +individual workers to perform local updates. However, the presence of +heterogeneous data distributions across working nodes causes each worker to +update its local model towards a local optimum, leading to the phenomenon known +as ``client-drift" and resulting in slowed convergence. To address this issue, +previous works have explored methods that either introduce communication +overhead or suffer from unsteady performance. In this work, we introduce a +novel metric called ``degree of divergence," quantifying the angle between the +local gradient and the global reference direction. Leveraging this metric, we +propose the divergence-based adaptive aggregation (DRAG) algorithm, which +dynamically ``drags" the received local updates toward the reference direction +in each round without requiring extra communication overhead. Furthermore, we +establish a rigorous convergence analysis for DRAG, proving its ability to +achieve a sublinear convergence rate. Compelling experimental results are +presented to illustrate DRAG's superior performance compared to +state-of-the-art algorithms in effectively managing the client-drift +phenomenon. Additionally, DRAG exhibits remarkable resilience against certain +Byzantine attacks. By securely sharing a small sample of the client's data with +the FL server, DRAG effectively counters these attacks, as demonstrated through +comprehensive experiments. + +
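+ The following is a minimal sketch of the core idea above, under assumptions of
+our own: compute the angle between a client's local update and a global
+reference direction, then "drag" the update toward that reference before
+aggregation. The drag_strength knob and the toy vectors are illustrative; this
+is not the authors' DRAG implementation.
+
+import numpy as np
+
+def drag_update(local_update, reference, drag_strength=0.5):
+    """Blend a local update toward the reference direction.
+
+    drag_strength is a hypothetical knob in [0, 1]: 0 keeps the local update,
+    1 fully aligns it with the reference direction (preserving its norm).
+    """
+    cos_angle = np.dot(local_update, reference) / (
+        np.linalg.norm(local_update) * np.linalg.norm(reference) + 1e-12)
+    divergence_deg = np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))
+    unit_ref = reference / (np.linalg.norm(reference) + 1e-12)
+    dragged = ((1 - drag_strength) * local_update
+               + drag_strength * np.linalg.norm(local_update) * unit_ref)
+    return dragged, divergence_deg
+
+# Example: a local update pointing 90 degrees away from the reference.
+local = np.array([1.0, 0.0])
+reference = np.array([0.0, 1.0])
+new_update, angle = drag_update(local, reference)
+print(angle, new_update)
+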
+
+
+
+
+
+  ☆ CONFIDERAI: a novel CONFormal Interpretable-by-Design score function
+  for Explainable and Reliable Artificial Intelligence
+
+
+ Everyday life is increasingly influenced by artificial intelligence, and
+there is no question that machine learning algorithms must be designed to be
+reliable and trustworthy for everyone. Specifically, computer scientists
+consider an artificial intelligence system safe and trustworthy if it fulfills
+five pillars: explainability, robustness, transparency, fairness, and privacy.
+In addition to these five, we propose a sixth fundamental aspect: conformity,
+that is, the probabilistic assurance that the system will behave as the machine
+learner expects. In this paper, we propose a methodology to link conformal
+prediction with explainable machine learning by defining CONFIDERAI, a new
+score function for rule-based models that leverages both the rules' predictive
+ability and the points' geometrical position within the rule boundaries. We
+also address the problem of defining regions in the feature space where
+conformal guarantees are satisfied by exploiting techniques to control the
+number of non-conformal samples in conformal regions based on support vector
+data description (SVDD). The overall methodology is tested with promising
+results on benchmark and real datasets, such as DNS tunneling detection or
+cardiovascular disease prediction.
+
+
+ comment: 12 pages, 7 figures, 1 algorithm, international journal +
+
+
+
+
+ + ☆ Gated recurrent neural networks discover attention + + +
+ Recent architectural developments have enabled recurrent neural networks +(RNNs) to reach and even surpass the performance of Transformers on certain +sequence modeling tasks. These modern RNNs feature a prominent design pattern: +linear recurrent layers interconnected by feedforward paths with multiplicative +gating. Here, we show how RNNs equipped with these two design elements can +exactly implement (linear) self-attention, the main building block of +Transformers. By reverse-engineering a set of trained RNNs, we find that +gradient descent in practice discovers our construction. In particular, we +examine RNNs trained to solve simple in-context learning tasks on which +Transformers are known to excel and find that gradient descent instills in our +RNNs the same attention-based in-context learning algorithm used by +Transformers. Our findings highlight the importance of multiplicative +interactions in neural networks and suggest that certain RNNs might be +unexpectedly implementing attention under the hood. + +
+
+
+
+
+ + ♻ ☆ Backward Curriculum Reinforcement Learning + + +
+ Current reinforcement learning algorithms train an agent using +forward-generated trajectories, which provide little guidance so that the agent +can explore as much as possible. While realizing the value of reinforcement +learning results from sufficient exploration, this approach leads to a +trade-off in losing sample efficiency, an essential factor impacting algorithm +performance. Previous tasks use reward-shaping techniques and network structure +modification to increase sample efficiency. However, these methods require many +steps to implement. In this work, we propose novel backward curriculum +reinforcement learning that begins training the agent using the backward +trajectory of the episode instead of the original forward trajectory. This +approach provides the agent with a strong reward signal, enabling more +sample-efficient learning. Moreover, our method only requires a minor change in +the algorithm of reversing the order of the trajectory before agent training, +allowing a straightforward application to any state-of-the-art algorithm. + +
+
+ comment: In the proceedings of the 32nd IEEE International Conference on Robot + and Human Interactive Communication (IEEE RO-MAN 2023) +
+
+
+
+
+ + ♻ ☆ Stochastic Configuration Machines for Industrial Artificial Intelligence + + +
+ Real-time predictive modelling with desired accuracy is highly expected in
+industrial artificial intelligence (IAI), where neural networks play a key
+role. Neural networks in IAI require powerful, high-performance computing
+devices to process large amounts of floating-point data. Based on stochastic
+configuration networks (SCNs), this paper proposes a new randomized learner
+model, termed stochastic configuration machines (SCMs), to stress effective
+modelling and data size saving that are useful and valuable for industrial
+applications. Compared to SCNs and random vector functional-link (RVFL) nets
+with binarized implementation, the model storage of SCMs can be significantly
+compressed while retaining favourable prediction performance. Besides the
+architecture of the SCM learner model and its learning algorithm, as an
+important part of this contribution, we also provide a theoretical basis for
+the learning capacity of SCMs by analysing the model's complexity. Experimental
+studies are carried out over some benchmark datasets and three industrial
+applications. The results demonstrate that SCM has great potential for dealing
+with industrial data analytics.
+
+
+ comment: 23 pages, 7 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Dataset of Pathloss and ToA Radio Maps With Localization Application + + +
+ In this article, we present a collection of radio map datasets in a dense
+urban setting, which we generated and made publicly available. The datasets
+include simulated pathloss/received signal strength (RSS) and time of arrival
+(ToA) radio maps over a large collection of realistic dense urban settings in
+real city maps. The two main applications of the presented dataset are 1)
+learning methods that predict the pathloss from input city maps (namely, deep
+learning-based simulations), and 2) wireless localization. The fact that the
+RSS and ToA maps are computed by the same simulations over the same city maps
+allows for a fair comparison of the RSS and ToA-based localization methods.
+
+
+
+
+
+ + ♻ ☆ A Lyapunov Theory for Finite-Sample Guarantees of Asynchronous + Q-Learning and TD-Learning Variants + + +
+ This paper develops a unified framework to study finite-sample convergence
+guarantees of a large class of value-based asynchronous reinforcement learning
+(RL) algorithms. We do this by first reformulating the RL algorithms as
+\textit{Markovian Stochastic Approximation} (SA) algorithms to solve
+fixed-point equations. We then develop a Lyapunov analysis and derive
+mean-square error bounds on the convergence of the Markovian SA. Based on this
+result, we establish finite-sample mean-square convergence bounds for
+asynchronous RL algorithms such as $Q$-learning, $n$-step TD, TD$(\lambda)$,
+and off-policy TD algorithms including V-trace. As a by-product, by analyzing
+the convergence bounds of $n$-step TD and TD$(\lambda)$, we provide theoretical
+insights into the bias-variance trade-off, i.e., efficiency of bootstrapping in
+RL. This was first posed as an open problem in (Sutton, 1999).
+
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+
+  ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable
+  image-text retrieval
+
+
+ As the size of Large Multi-Modal Models (LMMs) increases consistently, the +adaptation of these pre-trained models to specialized tasks has become a +computationally and memory-intensive challenge. Traditional fine-tuning methods +require isolated, exhaustive retuning for each new task, limiting the models' +versatility. Moreover, current efficient adaptation techniques often overlook +modality alignment, focusing only on the knowledge extraction of new tasks. To +tackle these issues, we introduce Multiway-Adapter, an innovative framework +incorporating an 'Alignment Enhancer' to deepen modality alignment, enabling +high transferability without tuning pre-trained parameters. Our method adds +fewer than 1.25\% of additional parameters to LMMs, exemplified by the BEiT-3 +model in our study. This leads to superior zero-shot image-text retrieval +performance compared to fully fine-tuned models, while achieving up to a 57\% +reduction in fine-tuning time. Our approach offers a resource-efficient and +effective adaptation pathway for LMMs, broadening their applicability. The +source code is publicly available at: +\url{https://github.com/longkukuhi/MultiWay-Adapter}. + +
+
+
+
+
+ + ☆ Target-Guided Composed Image Retrieval + + +
+ Composed image retrieval (CIR) is a new and flexible image retrieval +paradigm, which can retrieve the target image for a multimodal query, including +a reference image and its corresponding modification text. Although existing +efforts have achieved compelling success, they overlook the conflict +relationship modeling between the reference image and the modification text for +improving the multimodal query composition and the adaptive matching degree +modeling for promoting the ranking of the candidate images that could present +different levels of matching degrees with the given query. To address these two +limitations, in this work, we propose a Target-Guided Composed Image Retrieval +network (TG-CIR). In particular, TG-CIR first extracts the unified global and +local attribute features for the reference/target image and the modification +text with the contrastive language-image pre-training model (CLIP) as the +backbone, where an orthogonal regularization is introduced to promote the +independence among the attribute features. Then TG-CIR designs a target-query +relationship-guided multimodal query composition module, comprising a +target-free student composition branch and a target-based teacher composition +branch, where the target-query relationship is injected into the teacher branch +for guiding the conflict relationship modeling of the student branch. Last, +apart from the conventional batch-based classification loss, TG-CIR +additionally introduces a batch-based target similarity-guided matching degree +regularization to promote the metric learning process. Extensive experiments on +three benchmark datasets demonstrate the superiority of our proposed method. + +
+
+
+
+
+ + ☆ UniSA: Unified Generative Framework for Sentiment Analysis ACM MM 2023 + + +
+ Sentiment analysis is a crucial task that aims to understand people's +emotional states and predict emotional categories based on multimodal +information. It consists of several subtasks, such as emotion recognition in +conversation (ERC), aspect-based sentiment analysis (ABSA), and multimodal +sentiment analysis (MSA). However, unifying all subtasks in sentiment analysis +presents numerous challenges, including modality alignment, unified +input/output forms, and dataset bias. To address these challenges, we propose a +Task-Specific Prompt method to jointly model subtasks and introduce a +multimodal generative framework called UniSA. Additionally, we organize the +benchmark datasets of main subtasks into a new Sentiment Analysis Evaluation +benchmark, SAEval. We design novel pre-training tasks and training methods to +enable the model to learn generic sentiment knowledge among subtasks to improve +the model's multimodal sentiment perception ability. Our experimental results +show that UniSA performs comparably to the state-of-the-art on all subtasks and +generalizes well to various subtasks in sentiment analysis. + +
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ☆ Can I Trust Your Answer? Visually Grounded Video Question Answering + + +
+ We study visually grounded VideoQA in response to the emerging trends of +utilizing pretraining techniques for video-language understanding. +Specifically, by forcing vision-language models (VLMs) to answer questions and +simultaneously provide visual evidence, we seek to ascertain the extent to +which the predictions of such techniques are genuinely anchored in relevant +video content, versus spurious correlations from language or irrelevant visual +context. Towards this, we construct NExT-GQA -- an extension of NExT-QA with +10.5$K$ temporal grounding (or location) labels tied to the original QA pairs. +With NExT-GQA, we scrutinize a variety of state-of-the-art VLMs. Through +post-hoc attention analysis, we find that these models are weak in +substantiating the answers despite their strong QA performance. This exposes a +severe limitation of these models in making reliable predictions. As a remedy, +we further explore and suggest a video grounding mechanism via Gaussian mask +optimization and cross-modal learning. Experiments with different backbones +demonstrate that this grounding mechanism improves both video grounding and QA. +Our dataset and code are released. With these efforts, we aim to push towards +the reliability of deploying VLMs in VQA systems. + +
+
+ comment: Preprint. Data and code: https://github.com/doc-doc/NExT-GQA +
+
+
+
+
+ + ♻ ☆ Deep Video Codec Control + + +
+ Lossy video compression is commonly used when transmitting and storing video
+data. Unified video codecs (e.g., H.264 or H.265) remain the de facto standard,
+despite the availability of advanced (neural) compression approaches.
+Transmitting videos in the face of dynamic network bandwidth conditions
+requires video codecs to adapt to vastly different compression strengths. Rate
+control modules augment the codec's compression such that bandwidth constraints
+are satisfied and video distortion is minimized. While both standard video
+codecs and their rate control modules are developed to minimize video
+distortion w.r.t. human quality assessment, preserving the downstream
+performance of deep vision models is not considered. In this paper, we present
+the first end-to-end learnable deep video codec control considering both
+bandwidth constraints and downstream vision performance, while not breaking
+existing standardization. We demonstrate for two common vision tasks (semantic
+segmentation and optical flow estimation) and on two different datasets that
+our deep codec control better preserves downstream performance than using
+2-pass average bit rate control while meeting dynamic bandwidth constraints and
+adhering to standardizations.
+
+
+ comment: 22 pages, 26 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Recurrent Multi-scale Transformer for High-Resolution Salient Object + Detection ACM MM2023 + + +
+ Salient Object Detection (SOD) aims to identify and segment the most
+conspicuous objects in an image or video. As an important pre-processing step,
+it has many potential applications in multimedia and vision tasks. With the
+advance of imaging devices, SOD for high-resolution images has recently been in
+great demand. However, traditional SOD methods are largely limited to
+low-resolution images, making them difficult to adapt to the development of
+High-Resolution SOD (HRSOD). Although some HRSOD methods have emerged, there
+are no sufficiently large datasets for training and evaluation. Besides,
+current HRSOD methods generally produce incomplete object regions and irregular
+object boundaries. To address the above issues, in this work, we first propose
+a new HRS10K dataset, which contains 10,500 high-quality annotated images at
+2K-8K resolution. As far as we know, it is the largest dataset for the HRSOD
+task, which will significantly help future works in training and evaluating
+models. Furthermore, to improve the HRSOD performance, we propose a novel
+Recurrent Multi-scale Transformer (RMFormer), which recurrently utilizes shared
+Transformers and multi-scale refinement architectures. Thus, high-resolution
+saliency maps can be generated with the guidance of lower-resolution
+predictions. Extensive experiments on both high-resolution and low-resolution
+benchmarks show the effectiveness and superiority of the proposed framework.
+The source code and dataset are released at:
+https://github.com/DrowsyMon/RMFormer.
+
+
+ comment: This work is the camera-ready version of ACM MM2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 16 + +
+
+
+ + ☆ BDC-Adapter: Brownian Distance Covariance for Better Vision-Language + Reasoning BMVC 2023 + + +
+ Large-scale pre-trained Vision-Language Models (VLMs), such as CLIP and +ALIGN, have introduced a new paradigm for learning transferable visual +representations. Recently, there has been a surge of interest among researchers +in developing lightweight fine-tuning techniques to adapt these models to +downstream visual tasks. We recognize that current state-of-the-art fine-tuning +methods, such as Tip-Adapter, simply consider the covariance between the query +image feature and features of support few-shot training samples, which only +captures linear relations and potentially instigates a deceptive perception of +independence. To address this issue, in this work, we innovatively introduce +Brownian Distance Covariance (BDC) to the field of vision-language reasoning. +The BDC metric can model all possible relations, providing a robust metric for +measuring feature dependence. Based on this, we present a novel method called +BDC-Adapter, which integrates BDC prototype similarity reasoning and +multi-modal reasoning network prediction to perform classification tasks. Our +extensive experimental results show that the proposed BDC-Adapter can freely +handle non-linear relations and fully characterize independence, outperforming +the current state-of-the-art methods by large margins. + +
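+ To make the central quantity concrete, the empirical Brownian distance
+covariance between two feature matrices can be computed from double-centered
+pairwise distance matrices, as sketched below under standard definitions; the
+BDC-Adapter components themselves (prototype reasoning, multi-modal network)
+are not reproduced here.
+
+import numpy as np
+from scipy.spatial.distance import cdist
+
+def centered_distance(X):
+    D = cdist(X, X)  # pairwise Euclidean distances
+    return (D - D.mean(axis=0, keepdims=True)
+              - D.mean(axis=1, keepdims=True) + D.mean())
+
+def distance_covariance(X, Y):
+    A, B = centered_distance(X), centered_distance(Y)
+    return np.sqrt(np.maximum((A * B).mean(), 0.0))
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 16))
+Y = np.sin(X[:, :8])              # nonlinearly dependent on X
+Z = rng.normal(size=(100, 8))     # independent of X
+print(distance_covariance(X, Y), distance_covariance(X, Z))
+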
+
+ comment: Accepted by BMVC 2023 +
+
+
+
+
+ + ☆ Large AI Model Empowered Multimodal Semantic Communications + + +
+ Multimodal signals, including text, audio, image and video, can be integrated +into Semantic Communication (SC) for providing an immersive experience with low +latency and high quality at the semantic level. However, the multimodal SC has +several challenges, including data heterogeneity, semantic ambiguity, and +signal fading. Recent advancements in large AI models, particularly in +Multimodal Language Model (MLM) and Large Language Model (LLM), offer potential +solutions for these issues. To this end, we propose a Large AI Model-based +Multimodal SC (LAM-MSC) framework, in which we first present the MLM-based +Multimodal Alignment (MMA) that utilizes the MLM to enable the transformation +between multimodal and unimodal data while preserving semantic consistency. +Then, a personalized LLM-based Knowledge Base (LKB) is proposed, which allows +users to perform personalized semantic extraction or recovery through the LLM. +This effectively addresses the semantic ambiguity. Finally, we apply the +Conditional Generative adversarial networks-based channel Estimation (CGE) to +obtain Channel State Information (CSI). This approach effectively mitigates the +impact of fading channels in SC. Finally, we conduct simulations that +demonstrate the superior performance of the LAM-MSC framework. + +
+
+ comment: To be submitted for journal publication +
+
+
+
+
+ + ☆ Representations Matter: Embedding Modes of Large Language Models using + Dynamic Mode Decomposition + + +
+ Existing large language models (LLMs) are known for generating "hallucinated"
+content, namely fabricated text of plausible-looking, yet unfounded, facts. To
+identify when these hallucination scenarios occur, we examine the properties of
+the generated text in the embedding space. Specifically, we draw inspiration
+from the dynamic mode decomposition (DMD) tool in analyzing the pattern
+evolution of text embeddings across sentences. We empirically demonstrate how
+the spectrum of sentence embeddings over paragraphs is consistently low-rank
+for the generated text, unlike that of the ground-truth text. Importantly, we
+find that evaluation cases having LLM hallucinations correspond to ground-truth
+embedding patterns with a higher number of modes being poorly approximated by
+the few modes associated with LLM embedding patterns. In analogy to near-field
+electromagnetic evanescent waves, the embedding DMD eigenmodes of the generated
+text with hallucinations vanish quickly across sentences as opposed to those of
+the ground-truth text. This suggests that the hallucinations result from both
+the generation techniques and the underlying representation.
+
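+ The analysis described above can be pictured with a small exact-DMD sketch:
+treat successive sentence embeddings as snapshots, fit a reduced linear
+operator, and check how many modes are needed to capture the sequence. The
+synthetic embeddings and the 95% energy threshold below are our own
+illustrative choices, not the paper's detection criterion.
+
+import numpy as np
+
+def dmd_spectrum(embeddings):
+    """embeddings: (d, T) matrix of sentence embeddings in reading order."""
+    X, Y = embeddings[:, :-1], embeddings[:, 1:]
+    U, s, Vt = np.linalg.svd(X, full_matrices=False)
+    A_tilde = U.T @ Y @ Vt.T @ np.diag(1.0 / s)  # reduced linear operator
+    eigvals = np.linalg.eigvals(A_tilde)         # DMD eigenvalues
+    return eigvals, s
+
+rng = np.random.default_rng(0)
+low_rank = rng.normal(size=(384, 3)) @ rng.normal(size=(3, 12))  # "generated"
+full_rank = rng.normal(size=(384, 12))                           # "ground truth"
+for name, E in [("low-rank", low_rank), ("full-rank", full_rank)]:
+    _, s = dmd_spectrum(E)
+    energy = np.cumsum(s ** 2) / np.sum(s ** 2)
+    print(name, "modes for 95% energy:", int(np.searchsorted(energy, 0.95)) + 1)
+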
+
+
+
+
+ + ☆ Siren's Song in the AI Ocean: A Survey on Hallucination in Large + Language Models + + +
+ While large language models (LLMs) have demonstrated remarkable capabilities +across a range of downstream tasks, a significant concern revolves around their +propensity to exhibit hallucinations: LLMs occasionally generate content that +diverges from the user input, contradicts previously generated context, or +misaligns with established world knowledge. This phenomenon poses a substantial +challenge to the reliability of LLMs in real-world scenarios. In this paper, we +survey recent efforts on the detection, explanation, and mitigation of +hallucination, with an emphasis on the unique challenges posed by LLMs. We +present taxonomies of the LLM hallucination phenomena and evaluation +benchmarks, analyze existing approaches aiming at mitigating LLM hallucination, +and discuss potential directions for future research. + +
+
+ comment: work in progress; 32 pages +
+
+
+
+
+ + ☆ A Visual Interpretation-Based Self-Improved Classification System Using + Virtual Adversarial Training + + +
+ The successful application of large pre-trained models such as BERT in +natural language processing has attracted more attention from researchers. +Since the BERT typically acts as an end-to-end black box, classification +systems based on it usually have difficulty in interpretation and low +robustness. This paper proposes a visual interpretation-based self-improving +classification model with a combination of virtual adversarial training (VAT) +and BERT models to address the above problems. Specifically, a fine-tuned BERT +model is used as a classifier to classify the sentiment of the text. Then, the +predicted sentiment classification labels are used as part of the input of +another BERT for spam classification via a semi-supervised training manner +using VAT. Additionally, visualization techniques, including visualizing the +importance of words and normalizing the attention head matrix, are employed to +analyze the relevance of each component to classification accuracy. Moreover, +brand-new features will be found in the visual analysis, and classification +performance will be improved. Experimental results on Twitter's tweet dataset +demonstrate the effectiveness of the proposed model on the classification task. +Furthermore, the ablation study results illustrate the effect of different +components of the proposed model on the classification results. + +
+
+
+
+
+ + ☆ Large Language Models for Generative Recommendation: A Survey and + Visionary Discussions + + +
+ Recent years have witnessed the wide adoption of large language models (LLM) +in different fields, especially natural language processing and computer +vision. Such a trend can also be observed in recommender systems (RS). However, +most of related work treat LLM as a component of the conventional +recommendation pipeline (e.g., as a feature extractor) which may not be able to +fully leverage the generative power of LLM. Instead of separating the +recommendation process into multiple stages such as score computation and +re-ranking, this process can be simplified to one stage with LLM: directly +generating recommendations from the complete pool of items. This survey reviews +the progress, methods and future directions of LLM-based generative +recommendation by examining three questions: 1) What generative recommendation +is, 2) Why RS should advance to generative recommendation, and 3) How to +implement LLM-based generative recommendation for various RS tasks. We hope +that the survey can provide the context and guidance needed to explore this +interesting and emerging topic. + +
+
+
+
+
+ + ☆ Attention Where It Matters: Rethinking Visual Document Understanding + with Selective Region Concentration ICCV 2023 + + +
+ We propose a novel end-to-end document understanding model called SeRum +(SElective Region Understanding Model) for extracting meaningful information +from document images, including document analysis, retrieval, and office +automation. + Unlike state-of-the-art approaches that rely on multi-stage technical schemes +and are computationally expensive, + SeRum converts document image understanding and recognition tasks into a +local decoding process of the visual tokens of interest, using a content-aware +token merge module. + This mechanism enables the model to pay more attention to regions of interest +generated by the query decoder, improving the model's effectiveness and +speeding up the decoding speed of the generative scheme. + We also designed several pre-training tasks to enhance the understanding and +local awareness of the model. + Experimental results demonstrate that SeRum achieves state-of-the-art +performance on document understanding tasks and competitive results on text +spotting tasks. + SeRum represents a substantial advancement towards enabling efficient and +effective end-to-end document understanding. + +
+
+ comment: Accepted to ICCV 2023 main conference +
+
+
+
+
+ + ☆ MedChatZH: a Better Medical Adviser Learns from Better Instructions + + +
+ Generative large language models (LLMs) have shown great success in various +applications, including question-answering (QA) and dialogue systems. However, +in specialized domains like traditional Chinese medical QA, these models may +perform unsatisfactorily without fine-tuning on domain-specific datasets. To +address this, we introduce MedChatZH, a dialogue model designed specifically +for traditional Chinese medical QA. Our model is pre-trained on Chinese +traditional medical books and fine-tuned with a carefully curated medical +instruction dataset. It outperforms several solid baselines on a real-world +medical dialogue dataset. We release our model, code, and dataset on +https://github.com/tyang816/MedChatZH to facilitate further research in the +domain of traditional Chinese medicine and LLMs. + +
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ A Study on the Implementation of Generative AI Services Using an + Enterprise Data-Based LLM Application Architecture + + +
+ This study presents a method for implementing generative AI services by +utilizing the Large Language Model (LLM) application architecture. With recent +advancements in generative AI technology, LLMs have gained prominence across +various domains. In this context, the research addresses the challenge of +information scarcity and proposes specific remedies by harnessing LLM +capabilities. The investigation delves into strategies for mitigating the issue +of inadequate data, offering tailored solutions. The study delves into the +efficacy of employing fine-tuning techniques and direct document integration to +alleviate data insufficiency. A significant contribution of this work is the +development of a Retrieval-Augmented Generation (RAG) model, which tackles the +aforementioned challenges. The RAG model is carefully designed to enhance +information storage and retrieval processes, ensuring improved content +generation. The research elucidates the key phases of the information storage +and retrieval methodology underpinned by the RAG model. A comprehensive +analysis of these steps is undertaken, emphasizing their significance in +addressing the scarcity of data. The study highlights the efficacy of the +proposed method, showcasing its applicability through illustrative instances. +By implementing the RAG model for information storage and retrieval, the +research not only contributes to a deeper comprehension of generative AI +technology but also facilitates its practical usability within enterprises +utilizing LLMs. This work holds substantial value in advancing the field of +generative AI, offering insights into enhancing data-driven content generation +and fostering active utilization of LLM-based services within corporate +settings. + +
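+ For illustration only, the storage-and-retrieval loop described above can be
+reduced to a few lines: embed document chunks, retrieve the closest ones for a
+query, and prepend them to the prompt. The embed() and generate() functions are
+hypothetical stand-ins for whichever embedding model and LLM an enterprise
+deployment would actually use.
+
+import numpy as np
+
+def embed(texts):                  # placeholder embedding model
+    rng = np.random.default_rng(sum(len(t) for t in texts))
+    return rng.normal(size=(len(texts), 128))
+
+def generate(prompt):              # placeholder LLM call
+    return f"[answer conditioned on a {len(prompt)}-character prompt]"
+
+def rag_answer(query, chunks, chunk_vectors, top_k=2):
+    q = embed([query])[0]
+    sims = chunk_vectors @ q / (
+        np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(q) + 1e-12)
+    context = "\n".join(chunks[i] for i in np.argsort(-sims)[:top_k])
+    return generate(f"Context:\n{context}\n\nQuestion: {query}\nAnswer:")
+
+chunks = ["Policy A covers travel expenses.",
+          "Policy B covers equipment purchases.",
+          "Refunds are processed within 14 days."]
+print(rag_answer("How long do refunds take?", chunks, embed(chunks)))
+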
+
+
+
+
+ + ☆ Business Process Text Sketch Automation Generation Using Large Language + Model + + +
+ Business Process Management (BPM) is gaining increasing attention as it has +the potential to cut costs while boosting output and quality. Business process +document generation is a crucial stage in BPM. However, due to a shortage of +datasets, data-driven deep learning techniques struggle to deliver the expected +results. We propose an approach to transform Conditional Process Trees (CPTs) +into Business Process Text Sketches (BPTSs) using Large Language Models (LLMs). +The traditional prompting approach (Few-shot In-Context Learning) tries to get +the correct answer in one go, and it can find the pattern of transforming +simple CPTs into BPTSs, but for close-domain and CPTs with complex hierarchy, +the traditional prompts perform weakly and with low correctness. We suggest +using this technique to break down a difficult CPT into a number of basic CPTs +and then solve each one in turn, drawing inspiration from the +divide-and-conquer strategy. We chose 100 process trees with depths ranging +from 2 to 5 at random, as well as CPTs with many nodes, many degrees of +selection, and cyclic nesting. Experiments show that our method can achieve a +correct rate of 93.42%, which is 45.17% better than traditional prompting +methods. Our proposed method provides a solution for business process document +generation in the absence of datasets, and secondly, it becomes potentially +possible to provide a large number of datasets for the process model extraction +(PME) domain. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Integrating Image Features with Convolutional Sequence-to-sequence + Network for Multilingual Visual Question Answering SP2022 + + +
+ Visual Question Answering (VQA) is a task that requires computers to give +correct answers for the input questions based on the images. This task can be +solved by humans with ease but is a challenge for computers. The +VLSP2022-EVJVQA shared task carries the Visual Question Answering task in the +multilingual domain on a newly released dataset: UIT-EVJVQA, in which the +questions and answers are written in three different languages: English, +Vietnamese and Japanese. We approached the challenge as a sequence-to-sequence +learning task, in which we integrated hints from pre-trained state-of-the-art +VQA models and image features with Convolutional Sequence-to-Sequence network +to generate the desired answers. Our results obtained up to 0.3442 by F1 score +on the public test set, 0.4210 on the private test set, and placed 3rd in the +competition. + +
+
+ comment: VLSP2022-EVJVQA +
+
+
+
+
+ + ♻ ☆ Can Programming Languages Boost Each Other via Instruction Tuning? + + +
+ When human programmers have mastered a programming language, it would be +easier when they learn a new programming language. In this report, we focus on +exploring whether programming languages can boost each other during the +instruction fine-tuning phase of code large language models. We conduct +extensive experiments of 8 popular programming languages (Python, JavaScript, +TypeScript, C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that +programming languages can significantly improve each other. For example, +CodeM-Python 15B trained on Python is able to increase Java by an absolute +17.95% pass@1 on HumanEval-X. More surprisingly, we found that CodeM-HTML 7B +trained on the HTML corpus can improve Java by an absolute 15.24% pass@1. Our +training data is released at https://github.com/NL2Code/CodeM. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Research without Re-search: Maximal Update Parametrization Yields + Accurate Loss Prediction across Scales + + +
+ As language models scale up, it becomes increasingly expensive to verify +research ideas because conclusions on small models do not trivially transfer to +large ones. A possible solution is to establish a generic system that directly +predicts some metrics for large models solely based on the results and +hyperparameters from small models. Existing methods based on scaling laws +require hyperparameter search on the largest models, which is impractical with +limited resources. We address this issue by presenting our discoveries +indicating that Maximal Update parametrization (Mup) enables accurate fitting +of scaling laws for hyperparameters close to common loss basins, without any +search. Thus, different models can be directly compared on large scales with +loss prediction even before the training starts. We propose a new paradigm as a +first step towards reliable academic research for any model scale without heavy +computation. Code is publicly available at +https://github.com/cofe-ai/Mu-scaling. + +
+
+ comment: Code is publicly available at https://github.com/cofe-ai/Mu-scaling +
+
+
+
+
+ + ♻ ☆ Beyond Triplet: Leveraging the Most Data for Multimodal Machine + Translation ACL 2023 + + +
+ Multimodal machine translation (MMT) aims to improve translation quality by
+incorporating information from other modalities, such as vision. Previous MMT
+systems mainly focus on better access and use of visual information and tend to
+validate their methods on image-related datasets. These studies face two
+challenges. First, they can only utilize triple data (bilingual texts with
+images), which is scarce; second, current benchmarks are relatively restricted
+and do not correspond to realistic scenarios. Therefore, this paper
+correspondingly establishes new methods and new datasets for MMT. First, we
+propose a framework 2/3-Triplet with two new approaches to enhance MMT by
+utilizing large-scale non-triple data: monolingual image-text data and parallel
+text-only data. Second, we construct an English-Chinese e-commercial multimodal
+translation dataset (including training and testing), named EMMT, where its
+test set is carefully selected as some words are ambiguous and would be
+mistakenly translated without the help of images. Experiments show that our
+method is more suitable for real-world scenarios and can significantly improve
+translation performance by using more non-triple data. In addition, our model
+also rivals various SOTA models in conventional multimodal translation
+benchmarks.
+
+
+ comment: 8 pages, ACL 2023 Finding +
+
+
+
+
+ + ♻ ☆ A scoping review on multimodal deep learning in biomedical images and + texts + + +
+ Computer-assisted diagnostic and prognostic systems of the future should be +capable of simultaneously processing multimodal data. Multimodal deep learning +(MDL), which involves the integration of multiple sources of data, such as +images and text, has the potential to revolutionize the analysis and +interpretation of biomedical data. However, it only caught researchers' +attention recently. To this end, there is a critical need to conduct a +systematic review on this topic, identify the limitations of current work, and +explore future directions. In this scoping review, we aim to provide a +comprehensive overview of the current state of the field and identify key +concepts, types of studies, and research gaps with a focus on biomedical images +and texts joint learning, mainly because these two were the most commonly +available data types in MDL research. This study reviewed the current uses of +multimodal deep learning on five tasks: (1) Report generation, (2) Visual +question answering, (3) Cross-modal retrieval, (4) Computer-aided diagnosis, +and (5) Semantic segmentation. Our results highlight the diverse applications +and potential of MDL and suggest directions for future research in the field. +We hope our review will facilitate the collaboration of natural language +processing (NLP) and medical imaging communities and support the next +generation of decision-making and computer-assisted diagnostic system +development. + +
+
+ comment: This paper has been accepted by the Journal of Biomedical Informatics +
+
+
+
+
+ + ♻ ☆ CausalLM is not optimal for in-context learning + + +
+ Recent empirical evidence indicates that transformer based in-context +learning performs better when using a prefix language model (prefixLM), in +which in-context samples can all attend to each other, compared to causal +language models (causalLM), which use auto-regressive attention that prohibits +in-context samples to attend to future samples. While this result is intuitive, +it is not understood from a theoretical perspective. In this paper we take a +theoretical approach and analyze the convergence behavior of prefixLM and +causalLM under a certain parameter construction. Our analysis shows that both +LM types converge to their stationary points at a linear rate, but that while +prefixLM converges to the optimal solution of linear regression, causalLM +convergence dynamics follows that of an online gradient descent algorithm, +which is not guaranteed to be optimal even as the number of samples grows +infinitely. We supplement our theoretical claims with empirical experiments +over synthetic and real tasks and using various types of transformers. Our +experiments verify that causalLM consistently underperforms prefixLM in all +settings. + +
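+ The structural difference discussed above is easy to visualize: a causal mask
+is lower-triangular (each position attends only to itself and earlier
+positions), while a prefix-LM mask lets the in-context (prefix) positions
+attend to each other bidirectionally. The short sketch below is purely
+illustrative and independent of any particular transformer implementation.
+
+import numpy as np
+
+def causal_mask(seq_len):
+    return np.tril(np.ones((seq_len, seq_len), dtype=bool))
+
+def prefix_lm_mask(seq_len, prefix_len):
+    mask = causal_mask(seq_len)
+    mask[:prefix_len, :prefix_len] = True  # prefix tokens attend to each other
+    return mask
+
+print(causal_mask(5).astype(int))
+print(prefix_lm_mask(5, prefix_len=3).astype(int))
+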
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ Pre-trained Neural Recommenders: A Transferable Zero-Shot Framework for + Recommendation Systems + + +
+ Modern neural collaborative filtering techniques are critical to the success +of e-commerce, social media, and content-sharing platforms. However, despite +technical advances -- for every new application domain, we need to train an NCF +model from scratch. In contrast, pre-trained vision and language models are +routinely applied to diverse applications directly (zero-shot) or with limited +fine-tuning. Inspired by the impact of pre-trained models, we explore the +possibility of pre-trained recommender models that support building recommender +systems in new domains, with minimal or no retraining, without the use of any +auxiliary user or item information. Zero-shot recommendation without auxiliary +information is challenging because we cannot form associations between users +and items across datasets when there are no overlapping users or items. Our +fundamental insight is that the statistical characteristics of the user-item +interaction matrix are universally available across different domains and +datasets. Thus, we use the statistical characteristics of the user-item +interaction matrix to identify dataset-independent representations for users +and items. We show how to learn universal (i.e., supporting zero-shot +adaptation without user or item auxiliary information) representations for +nodes and edges from the bipartite user-item interaction graph. We learn +representations by exploiting the statistical properties of the interaction +data, including user and item marginals, and the size and density distributions +of their clusters. + +
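+ As a small, self-contained illustration of the dataset-independent statistics
+mentioned above, the sketch below computes user and item marginals and the
+overall density of a toy sparse interaction matrix; the paper's universal
+representation learning itself is not reproduced.
+
+import numpy as np
+from scipy.sparse import random as sparse_random
+
+# Toy implicit-feedback user-item matrix (1000 users x 500 items, 1% dense).
+interactions = sparse_random(1000, 500, density=0.01, format="csr",
+                             random_state=0)
+interactions.data[:] = 1.0
+
+user_degree = np.asarray(interactions.sum(axis=1)).ravel()  # user marginals
+item_degree = np.asarray(interactions.sum(axis=0)).ravel()  # item marginals
+density = interactions.nnz / (interactions.shape[0] * interactions.shape[1])
+
+print("density:", density)
+print("user-degree quartiles:", np.percentile(user_degree, [25, 50, 75]))
+print("item-degree quartiles:", np.percentile(item_degree, [25, 50, 75]))
+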
+
+
+
+
+ + ☆ Large Language Models for Generative Recommendation: A Survey and + Visionary Discussions + + +
+ Recent years have witnessed the wide adoption of large language models (LLM) +in different fields, especially natural language processing and computer +vision. Such a trend can also be observed in recommender systems (RS). However, +most of related work treat LLM as a component of the conventional +recommendation pipeline (e.g., as a feature extractor) which may not be able to +fully leverage the generative power of LLM. Instead of separating the +recommendation process into multiple stages such as score computation and +re-ranking, this process can be simplified to one stage with LLM: directly +generating recommendations from the complete pool of items. This survey reviews +the progress, methods and future directions of LLM-based generative +recommendation by examining three questions: 1) What generative recommendation +is, 2) Why RS should advance to generative recommendation, and 3) How to +implement LLM-based generative recommendation for various RS tasks. We hope +that the survey can provide the context and guidance needed to explore this +interesting and emerging topic. + +
+
+
+
+
+ + ☆ Multi-Relational Contrastive Learning for Recommendation RecSys 2023 + + +
+ Personalized recommender systems play a crucial role in capturing users' +evolving preferences over time to provide accurate and effective +recommendations on various online platforms. However, many recommendation +models rely on a single type of behavior learning, which limits their ability +to represent the complex relationships between users and items in real-life +scenarios. In such situations, users interact with items in multiple ways, +including clicking, tagging as favorite, reviewing, and purchasing. To address +this issue, we propose the Relation-aware Contrastive Learning (RCL) framework, +which effectively models dynamic interaction heterogeneity. The RCL model +incorporates a multi-relational graph encoder that captures short-term +preference heterogeneity while preserving the dedicated relation semantics for +different types of user-item interactions. Moreover, we design a dynamic +cross-relational memory network that enables the RCL model to capture users' +long-term multi-behavior preferences and the underlying evolving cross-type +behavior dependencies over time. To obtain robust and informative user +representations with both commonality and diversity across multi-behavior +interactions, we introduce a multi-relational contrastive learning paradigm +with heterogeneous short- and long-term interest modeling. Our extensive +experimental studies on several real-world datasets demonstrate the superiority +of the RCL recommender system over various state-of-the-art baselines in terms +of recommendation accuracy and effectiveness. + +
+
+ comment: This paper has been published as a full paper at RecSys 2023 +
+
+
+
+
+ + ♻ ☆ Empowering Long-tail Item Recommendation through Cross Decoupling + Network (CDN) KDD 2023 + + +
+ Industry recommender systems usually suffer from highly-skewed long-tail item +distributions where a small fraction of the items receives most of the user +feedback. This skew hurts recommender quality especially for the item slices +without much user feedback. While there have been many research advances made +in academia, deploying these methods in production is very difficult and very +few improvements have been made in industry. One challenge is that these +methods often hurt overall performance; additionally, they could be complex and +expensive to train and serve. In this work, we aim to improve tail item +recommendations while maintaining the overall performance with less training +and serving cost. We first find that the predictions of user preferences are +biased under long-tail distributions. The bias comes from the differences +between training and serving data in two perspectives: 1) the item +distributions, and 2) user's preference given an item. Most existing methods +mainly attempt to reduce the bias from the item distribution perspective, +ignoring the discrepancy from user preference given an item. This leads to a +severe forgetting issue and results in sub-optimal performance. + To address the problem, we design a novel Cross Decoupling Network (CDN) (i) +decouples the learning process of memorization and generalization on the item +side through a mixture-of-expert architecture; (ii) decouples the user samples +from different distributions through a regularized bilateral branch network. +Finally, a new adapter is introduced to aggregate the decoupled vectors, and +softly shift the training attention to tail items. Extensive experimental +results show that CDN significantly outperforms state-of-the-art approaches on +benchmark datasets. We also demonstrate its effectiveness by a case study of +CDN in a large-scale recommendation system at Google. + +
+
+ comment: Accepted by KDD 2023 Applied Data Science (ADS) track +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ MAGMA: Music Aligned Generative Motion Autodecoder + + +
+ Mapping music to dance is a challenging problem that requires spatial and
+temporal coherence along with a continual synchronization with the music's
+progression. Taking inspiration from large language models, we introduce a
+2-step approach for generating dance using a Vector Quantized-Variational
+Autoencoder (VQ-VAE) to distill motion into primitives and train a Transformer
+decoder to learn the correct sequencing of these primitives. We also evaluate
+the importance of music representations by comparing naive music feature
+extraction using Librosa to deep audio representations generated by
+state-of-the-art audio compression algorithms. Additionally, we train
+variations of the motion generator using relative and absolute positional
+encodings to determine the effect on generated motion quality when generating
+arbitrarily long sequence lengths. Our proposed approach achieves
+state-of-the-art results in music-to-motion generation benchmarks and enables
+the real-time generation of considerably longer motion sequences, the ability
+to chain multiple motion sequences seamlessly, and easy customization of motion
+sequences to meet style requirements.
+
+
+
+
+
+ + ☆ Turn Fake into Real: Adversarial Head Turn Attacks Against Deepfake + Detection + + +
+ Malicious use of deepfakes leads to serious public concerns and reduces +people's trust in digital media. Although effective deepfake detectors have +been proposed, they are substantially vulnerable to adversarial attacks. To +evaluate the detector's robustness, recent studies have explored various +attacks. However, all existing attacks are limited to 2D image perturbations, +which are hard to translate into real-world facial changes. In this paper, we +propose adversarial head turn (AdvHeat), the first attempt at 3D adversarial +face views against deepfake detectors, based on face view synthesis from a +single-view fake image. Extensive experiments validate the vulnerability of +various detectors to AdvHeat in realistic, black-box scenarios. For example, +AdvHeat based on a simple random search yields a high attack success rate of +96.8% with 360 searching steps. When additional query access is allowed, we can +further reduce the step budget to 50. Additional analyses demonstrate that +AdvHeat is better than conventional attacks on both the cross-detector +transferability and robustness to defenses. The adversarial images generated by +AdvHeat are also shown to have natural looks. Our code, including that for +generating a multi-view dataset consisting of 360 synthetic views for each of +1000 IDs from FaceForensics++, is available at +https://github.com/twowwj/AdvHeaT. + +
+
+
+
+
+ + ♻ ☆ Faked Speech Detection with Zero Knowledge + + +
+ Audio is one of the most used ways of human communication, but at the same +time it can be easily misused to trick people. With the revolution of AI, the +related technologies are now accessible to almost everyone thus making it +simple for the criminals to commit crimes and forgeries. In this work, we +introduce a neural network method to develop a classifier that will blindly +classify an input audio as real or mimicked; the word 'blindly' refers to the +ability to detect mimicked audio without references or real sources. The +proposed model was trained on a set of important features extracted from a +large dataset of audios to get a classifier that was tested on the same set of +features from different audios. The data was extracted from two raw datasets, +especially composed for this work; an all English dataset and a mixed dataset +(Arabic plus English). These datasets have been made available, in raw form, +through GitHub for the use of the research community at +https://github.com/SaSs7/Dataset. For the purpose of comparison, the audios +were also classified through human inspection with the subjects being the +native speakers. The ensued results were interesting and exhibited formidable +accuracy. + +
+
+ comment: 14 pages, 4 figures (6 if you count subfigures), 2 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 18 + +
+
+
+ + ☆ Explainability for Large Language Models: A Survey + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in +natural language processing. However, their internal mechanisms are still +unclear and this lack of transparency poses unwanted risks for downstream +applications. Therefore, understanding and explaining these models is crucial +for elucidating their behaviors, limitations, and social impacts. In this +paper, we introduce a taxonomy of explainability techniques and provide a +structured overview of methods for explaining Transformer-based language +models. We categorize techniques based on the training paradigms of LLMs: +traditional fine-tuning-based paradigm and prompting-based paradigm. For each +paradigm, we summarize the goals and dominant approaches for generating local +explanations of individual predictions and global explanations of overall model +knowledge. We also discuss metrics for evaluating generated explanations, and +discuss how explanations can be leveraged to debug models and improve +performance. Lastly, we examine key challenges and emerging opportunities for +explanation techniques in the era of LLMs in comparison to conventional machine +learning models. + +
+
+
+
+
+ + ☆ Zero-Shot Recommendations with Pre-Trained Large Language Models for + Multimodal Nudging + + +
+ We present a method for zero-shot recommendation of multimodal non-stationary +content that leverages recent advancements in the field of generative AI. We +propose rendering inputs of different modalities as textual descriptions and to +utilize pre-trained LLMs to obtain their numerical representations by computing +semantic embeddings. Once unified representations of all content items are +obtained, the recommendation can be performed by computing an appropriate +similarity metric between them without any additional learning. We demonstrate +our approach on a synthetic multimodal nudging environment, where the inputs +consist of tabular, textual, and visual data. + +
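+ A minimal sketch of the recipe described above: render each content item as a
+textual description, embed the descriptions with a pre-trained model, and rank
+by cosine similarity to the user context. The sentence-transformers model named
+below is an assumption made for illustration, not necessarily the embedding
+model used in the paper.
+
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+user_context = "Rainy evening; the user just finished a 5k run and likes jazz."
+item_descriptions = [
+    "A calming jazz playlist for winding down.",     # rendered from audio metadata
+    "A table of high-intensity interval workouts.",  # rendered from tabular data
+    "A photo gallery of sunny beach destinations.",  # rendered from image captions
+]
+
+vectors = model.encode([user_context] + item_descriptions,
+                       normalize_embeddings=True)
+scores = vectors[1:] @ vectors[0]  # cosine similarity to the user context
+for desc, score in sorted(zip(item_descriptions, scores), key=lambda x: -x[1]):
+    print(f"{score:.3f}  {desc}")
+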
+
+
+
+
+ + ☆ ModelScope-Agent: Building Your Customizable Agent System with + Open-source Large Language Models + + +
+ Large language models (LLMs) have recently demonstrated remarkable +capabilities to comprehend human intentions, engage in reasoning, and design +planning-like behavior. To further unleash the power of LLMs to accomplish +complex tasks, there is a growing trend to build agent framework that equips +LLMs, such as ChatGPT, with tool-use abilities to connect with massive external +APIs. In this work, we introduce ModelScope-Agent, a general and customizable +agent framework for real-world applications, based on open-source LLMs as +controllers. It provides a user-friendly system library, with customizable +engine design to support model training on multiple open-source LLMs, while +also enabling seamless integration with both model APIs and common APIs in a +unified way. To equip the LLMs with tool-use abilities, a comprehensive +framework has been proposed spanning over tool-use data collection, tool +retrieval, tool registration, memory control, customized model training, and +evaluation for practical real-world applications. Finally, we showcase +ModelScopeGPT, a real-world intelligent assistant of ModelScope Community based +on the ModelScope-Agent framework, which is able to connect open-source LLMs +with more than 1000 public AI models and localized community knowledge in +ModelScope. The ModelScope-Agent +library\footnote{https://github.com/modelscope/modelscope-agent} and online +demo\footnote{https://modelscope.cn/studios/damo/ModelScopeGPT/summary} are now +publicly available. + +
+
+
+
+
+ + ☆ Bridge Diffusion Model: bridge non-English language-native text-to-image + diffusion model with English communities + + +
+ Text-to-Image generation (TTI) technologies are advancing rapidly, especially
+in the English language communities. However, English-native TTI models
+inherently carry biases from English-world-centric training data, which creates
+a dilemma for the development of other language-native TTI models. One common
+choice is fine-tuning the English-native TTI model with translated samples from
+non-English communities. It falls short of fully addressing the model bias
+problem. Alternatively, training non-English-language-native models from
+scratch can effectively resolve the English world bias, but diverges from the
+English TTI communities and thus can no longer benefit from the strides
+continuously being made in the English TTI communities. To build a
+non-English-language-native TTI model while keeping compatibility with the
+English TTI communities, we propose a novel model structure referred to as
+"Bridge Diffusion Model" (BDM). The proposed BDM employs a backbone-branch
+network structure to learn the non-English language semantics while keeping the
+latent space compatible with the English-native TTI backbone, in an end-to-end
+manner. The unique advantages of the proposed BDM are that it is not only adept
+at generating images that precisely depict non-English language semantics, but
+also compatible with various English-native TTI plugins, such as different
+checkpoints, LoRA, ControlNet, Dreambooth, and Textual Inversion. Moreover, BDM
+can concurrently generate content seamlessly combining both non-English-native
+and English-native semantics within a single image, fostering cultural
+interaction. We verify our method by applying BDM to build a Chinese-native TTI
+model, though the method is generic and applicable to any other language.
+
+
+
+
+
+ + ☆ Multilingual Text Representation + + +
+ Modern NLP breakthroughs include large multilingual models capable of +performing tasks across more than 100 languages. State-of-the-art language +models have come a long way from simple one-hot representations of +words to models capable of performing tasks like natural language understanding, +common-sense reasoning, or question answering, capturing both the syntax +and semantics of texts. At the same time, language models are expanding beyond +our known language boundary, even performing competitively on very +low-resource dialects of endangered languages. However, there are still +problems to solve to ensure an equitable representation of texts through a +unified modeling space across languages and speakers. In this survey, we shed +light on this iterative progression of multilingual text representation and +discuss the driving factors that ultimately led to the current +state-of-the-art. Subsequently, we discuss how the full potential of language +democratization could be obtained, reaching beyond the known limits, and where +the scope for improvement in that space lies. +
+
+ comment: PhD Comprehensive exam report +
+
+
+
+
+ + ☆ Knowledge Graph Embeddings for Multi-Lingual Structured Representations + of Radiology Reports + + +
+ The way we analyse clinical texts has undergone major changes over the last +few years. The introduction of language models such as BERT led to adaptations for +the (bio)medical domain like PubMedBERT and ClinicalBERT. These models rely on +large databases of archived medical documents. While performing well in terms +of accuracy, both the lack of interpretability and limitations to transfer +across languages limit their use in clinical settings. We introduce a novel +light-weight graph-based embedding method specifically catering to radiology +reports. It takes into account the structure and composition of the report, +while also connecting medical terms in the report through the multi-lingual +SNOMED Clinical Terms knowledge base. The resulting graph embedding uncovers +the underlying relationships among clinical terms, achieving a representation +that is more understandable to clinicians and clinically more accurate, +without reliance on large pre-training datasets. We show the use of this +embedding on two tasks, namely disease classification of X-ray reports and image +classification. For disease classification our model is competitive with its +BERT-based counterparts, while being orders of magnitude smaller in size and training +data requirements. For image classification, we show the effectiveness of the +graph embedding leveraging cross-modal knowledge transfer and show how this +method is usable across different languages. +
+
+
+
+
+ + ☆ BLSP: Bootstrapping Language-Speech Pre-training via Behavior Alignment + of Continuation Writing + + +
+ The emergence of large language models (LLMs) has sparked significant +interest in extending their remarkable language capabilities to speech. +However, modality alignment between speech and text still remains an open +problem. Current solutions can be categorized into two strategies. One is a +cascaded approach where outputs (tokens or states) of a separately trained +speech recognition system are used as inputs for LLMs, which limits their +potential in modeling alignment between speech and text. The other is an +end-to-end approach that relies on speech instruction data, which is very +difficult to collect in large quantities. In this paper, we address these +issues and propose the BLSP approach that Bootstraps Language-Speech +Pre-training via behavior alignment of continuation writing. We achieve this by +learning a lightweight modality adapter between a frozen speech encoder and an +LLM, ensuring that the LLM exhibits the same generation behavior regardless of +the modality of input: a speech segment or its transcript. The training process +can be divided into two steps. The first step prompts an LLM to generate texts +with speech transcripts as prefixes, obtaining text continuations. In the +second step, these continuations are used as supervised signals to train the +modality adapter in an end-to-end manner. We demonstrate that this +straightforward process can extend the capabilities of LLMs to speech, enabling +speech recognition, speech translation, spoken language understanding, and +speech conversation, even in zero-shot cross-lingual scenarios. + +
+
+
+
+
+ + ☆ Evaluating Transformer's Ability to Learn Mildly Context-Sensitive + Languages + + +
+ Although Transformers perform well in NLP tasks, recent studies suggest +that self-attention is theoretically limited in learning even some regular and +context-free languages. These findings motivated us to think about their +implications in modeling natural language, which is hypothesized to be mildly +context-sensitive. We test Transformers' ability to learn a variety of mildly +context-sensitive languages of varying complexities, and find that they +generalize well to unseen in-distribution data, but their ability to +extrapolate to longer strings is worse than that of LSTMs. Our analyses show +that the learned self-attention patterns and representations modeled dependency +relations and demonstrated counting behavior, which may have helped the models +solve the languages. +
+
+
+
+
+ + ☆ LeanContext: Cost-Efficient Domain-Specific Question Answering Using + LLMs + + +
+ Question-answering (QA) is a significant application of Large Language Models +(LLMs), shaping chatbot capabilities across healthcare, education, and customer +service. However, widespread LLM integration presents a challenge for small +businesses due to the high expenses of LLM API usage. Costs rise rapidly when +domain-specific data (context) is used alongside queries for accurate +domain-specific LLM responses. One option is to summarize the context by using +LLMs and reduce the context. However, this can also filter out useful +information that is necessary to answer some domain-specific queries. In this +paper, we shift from human-oriented summarizers to AI model-friendly summaries. +Our approach, LeanContext, efficiently extracts $k$ key sentences from the +context that are closely aligned with the query. The choice of $k$ is neither +static nor random; we introduce a reinforcement learning technique that +dynamically determines $k$ based on the query and context. The rest of the less +important sentences are reduced using a free open source text reduction method. +We evaluate LeanContext against several recent query-aware and query-unaware +context reduction approaches on prominent datasets (arxiv papers and BBC news +articles). Despite cost reductions of $37.29\%$ to $67.81\%$, LeanContext's +ROUGE-1 score decreases only by $1.41\%$ to $2.65\%$ compared to a baseline +that retains the entire context (no summarization). Additionally, if free +pretrained LLM-based summarizers are used to reduce context (into human +consumable summaries), LeanContext can further modify the reduced context to +enhance the accuracy (ROUGE-1 score) by $13.22\%$ to $24.61\%$. + +
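+
+ The extraction step at the heart of this approach can be sketched as query-aware top-k sentence selection over embeddings; the encoder, the naive sentence splitter, and the fixed k below are simplifications (LeanContext itself chooses k with a reinforcement-learning policy and compresses, rather than drops, the remaining sentences).
+
+ # Simplified sketch: keep the k sentences most similar to the query.
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+
+ encoder = SentenceTransformer("all-MiniLM-L6-v2")   # illustrative choice
+
+ def reduce_context(context: str, query: str, k: int = 2) -> str:
+     sentences = [s.strip() for s in context.split(".") if s.strip()]
+     sent_emb = encoder.encode(sentences, normalize_embeddings=True)
+     query_emb = encoder.encode([query], normalize_embeddings=True)[0]
+     scores = sent_emb @ query_emb
+     keep = sorted(np.argsort(-scores)[:k])           # preserve original sentence order
+     return ". ".join(sentences[i] for i in keep) + "."
+
+ doc = ("The study uses the BBC news corpus. The model is tuned for two epochs. "
+        "ROUGE-1 improves by two points. The authors thank their funders.")
+ print(reduce_context(doc, "Which dataset was used?", k=2))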
+
+ comment: The paper is under review +
+
+
+
+
+ + ☆ LinkTransformer: A Unified Package for Record Linkage with Transformer + Language Models + + +
+ Linking information across sources is fundamental to a variety of analyses in +social science, business, and government. While large language models (LLMs) +offer enormous promise for improving record linkage in noisy datasets, in many +domains approximate string matching packages in popular software such as R and +Stata remain predominant. These packages have clean, simple interfaces and can +be easily extended to a diversity of languages. Our open-source package +LinkTransformer aims to extend the familiarity and ease-of-use of popular +string matching methods to deep learning. It is a general-purpose package for +record linkage with transformer LLMs that treats record linkage as a text +retrieval problem. At its core is an off-the-shelf toolkit for applying +transformer models to record linkage with four lines of code. LinkTransformer +contains a rich repository of pre-trained transformer semantic similarity +models for multiple languages and supports easy integration of any transformer +language model from Hugging Face or OpenAI. It supports standard functionality +such as blocking and linking on multiple noisy fields. LinkTransformer APIs +also perform other common text data processing tasks, e.g., aggregation, noisy +de-duplication, and translation-free cross-lingual linkage. Importantly, +LinkTransformer also contains comprehensive tools for efficient model tuning, +to facilitate different levels of customization when off-the-shelf models do +not provide the required accuracy. Finally, to promote reusability, +reproducibility, and extensibility, LinkTransformer makes it easy for users to +contribute their custom-trained models to its model hub. By combining +transformer language models with intuitive APIs that will be familiar to many +users of popular string matching packages, LinkTransformer aims to democratize +the benefits of LLMs among those who may be less familiar with deep learning +frameworks. + +
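+
+ Treating record linkage as text retrieval amounts to embedding serialized records from both tables and matching each record to its nearest neighbour; the sketch below does this directly with sentence-transformers and is only an illustration of the idea, not the LinkTransformer API.
+
+ # Record linkage as embedding-based text retrieval (illustrative sketch;
+ # see the LinkTransformer package itself for its actual interface).
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer("all-MiniLM-L6-v2")   # a multilingual model also works
+
+ left  = ["Acme Corp., 12 Main St, Springfield", "Globex LLC, 9 Bay Rd, Shelbyville"]
+ right = ["ACME Corporation, Main Street 12, Springfield", "Initech, 55 Park Ave"]
+
+ emb_l = model.encode(left,  normalize_embeddings=True)
+ emb_r = model.encode(right, normalize_embeddings=True)
+
+ sim = emb_l @ emb_r.T                 # cosine similarities between noisy records
+ for i, j in enumerate(sim.argmax(axis=1)):
+     print(left[i], "->", right[j], f"(score={sim[i, j]:.2f})")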
+
+
+
+
+ + ☆ Value Kaleidoscope: Engaging AI with Pluralistic Human Values, Rights, + and Duties + + +
+ Human values are crucial to human decision-making. Value pluralism is the +view that multiple correct values may be held in tension with one another +(e.g., when considering lying to a friend to protect their feelings, how does +one balance honesty with friendship?). As statistical learners, AI systems fit +to averages by default, washing out these potentially irreducible value +conflicts. To improve AI systems to better reflect value pluralism, the +first-order challenge is to explore the extent to which AI systems can model +pluralistic human values, rights, and duties as well as their interaction. + We introduce ValuePrism, a large-scale dataset of 218k values, rights, and +duties connected to 31k human-written situations. ValuePrism's contextualized +values are generated by GPT-4 and deemed high-quality by human annotators 91% +of the time. We conduct a large-scale study with annotators across diverse +social and demographic backgrounds to try to understand whose values are +represented. + With ValuePrism, we build Kaleido, an open, light-weight, and structured +language-based multi-task model that generates, explains, and assesses the +relevance and valence (i.e., support or oppose) of human values, rights, and +duties within a specific context. Humans prefer the sets of values output by +our system over the teacher GPT-4, finding them more accurate and with broader +coverage. In addition, we demonstrate that Kaleido can help explain variability +in human decision-making by outputting contrasting values. Finally, we show +that Kaleido's representations transfer to other philosophical frameworks and +datasets, confirming the benefit of an explicit, modular, and interpretable +approach to value pluralism. We hope that our work will serve as a step to +making more explicit the implicit values behind human decision-making and to +steering AI systems to make decisions that are more in accordance with them. + +
+
+
+
+
+ + ☆ Bias and Fairness in Large Language Models: A Survey + + +
+ Rapid advancements of large language models (LLMs) have enabled the +processing, understanding, and generation of human-like text, with increasing +integration into systems that touch our social sphere. Despite this success, +these models can learn, perpetuate, and amplify harmful social biases. In this +paper, we present a comprehensive survey of bias evaluation and mitigation +techniques for LLMs. We first consolidate, formalize, and expand notions of +social bias and fairness in natural language processing, defining distinct +facets of harm and introducing several desiderata to operationalize fairness +for LLMs. We then unify the literature by proposing three intuitive taxonomies, +two for bias evaluation, namely metrics and datasets, and one for mitigation. +Our first taxonomy of metrics for bias evaluation disambiguates the +relationship between metrics and evaluation datasets, and organizes metrics by +the different levels at which they operate in a model: embeddings, +probabilities, and generated text. Our second taxonomy of datasets for bias +evaluation categorizes datasets by their structure as counterfactual inputs or +prompts, and identifies the targeted harms and social groups; we also release a +consolidation of publicly-available datasets for improved access. Our third +taxonomy of techniques for bias mitigation classifies methods by their +intervention during pre-processing, in-training, intra-processing, and +post-processing, with granular subcategories that elucidate research trends. +Finally, we identify open problems and challenges for future work. Synthesizing +a wide range of recent research, we aim to provide a clear guide of the +existing literature that empowers researchers and practitioners to better +understand and prevent the propagation of bias in LLMs. + +
+
+
+
+
+ + ♻ ☆ Combing for Credentials: Active Pattern Extraction from Smart Reply + + +
+ Pre-trained large language models, such as GPT-2 and BERT, are +often fine-tuned to achieve state-of-the-art performance on a downstream task. +One natural example is the "Smart Reply" application where a pre-trained +model is tuned to provide suggested responses for a given query message. Since +the tuning data is often sensitive data such as emails or chat transcripts, it +is important to understand and mitigate the risk that the model leaks its +tuning data. We investigate potential information leakage vulnerabilities in a +typical Smart Reply pipeline. We consider a realistic setting where the +adversary can only interact with the underlying model through a front-end +interface that constrains what types of queries can be sent to the model. +Previous attacks do not work in these settings, as they require the ability to send +unconstrained queries directly to the model. Even when there are no constraints +on the queries, previous attacks typically require thousands, or even millions, +of queries to extract useful information, while our attacks can extract +sensitive data in just a handful of queries. We introduce a new type of active +extraction attack that exploits canonical patterns in text containing sensitive +data. We show experimentally that it is possible for an adversary to extract +sensitive user information present in the training data, even in realistic +settings where all interactions with the model must go through a front-end that +limits the types of queries. We explore potential mitigation strategies and +demonstrate empirically how differential privacy appears to be a reasonably +effective defense mechanism to such pattern extraction attacks. +
+
+
+
+
+ + ♻ ☆ Semantic Representations of Mathematical Expressions in a Continuous + Vector Space + + +
+ Mathematical notation makes up a large portion of STEM literature, yet +finding semantic representations for formulae remains a challenging problem. +Because mathematical notation is precise, and its meaning changes significantly +with small character shifts, the methods that work for natural text do not +necessarily work well for mathematical expressions. This work describes an +approach for representing mathematical expressions in a continuous vector +space. We use the encoder of a sequence-to-sequence architecture, trained on +visually different but mathematically equivalent expressions, to generate +vector representations (or embeddings). We compare this approach with a +structural approach that considers visual layout to embed an expression and +show that our proposed approach is better at capturing mathematical semantics. +Finally, to expedite future research, we publish a corpus of equivalent +transcendental and algebraic expression pairs. + +
+
+ comment: Transactions on Machine Learning Research (TMLR), September 2023 +
+
+
+
+
+ + ♻ ☆ AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment + enabled by Large Language Models + + +
+ The task of entity alignment between knowledge graphs (KGs) aims to identify +every pair of entities from two different KGs that represent the same entity. +Many machine learning-based methods have been proposed for this task. However, +to our best knowledge, existing methods all require manually crafted seed +alignments, which are expensive to obtain. In this paper, we propose the first +fully automatic alignment method named AutoAlign, which does not require any +manually crafted seed alignments. Specifically, for predicate embeddings, +AutoAlign constructs a predicate-proximity-graph with the help of large +language models to automatically capture the similarity between predicates +across two KGs. For entity embeddings, AutoAlign first computes the entity +embeddings of each KG independently using TransE, and then shifts the two KGs' +entity embeddings into the same vector space by computing the similarity +between entities based on their attributes. Thus, both predicate alignment and +entity alignment can be done without manually crafted seed alignments. +AutoAlign is not only fully automatic, but also highly effective. Experiments +using real-world KGs show that AutoAlign improves the performance of entity +alignment significantly compared to state-of-the-art methods. + +
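+
+ The entity-embedding stage builds on the standard TransE objective, under which a true triple (h, r, t) should satisfy h + r ≈ t; a minimal scoring and margin-loss sketch is shown below (dimensions and margin are illustrative, and AutoAlign's predicate-proximity-graph and attribute-based alignment shift are not reproduced).
+
+ # Minimal TransE sketch; AutoAlign's alignment-specific machinery is omitted.
+ import torch
+
+ n_entities, n_relations, dim = 1000, 50, 64
+ ent = torch.nn.Embedding(n_entities, dim)
+ rel = torch.nn.Embedding(n_relations, dim)
+
+ def score(h, r, t):
+     # TransE: smaller distance between (h + r) and t means a more plausible triple.
+     return torch.norm(ent(h) + rel(r) - ent(t), p=1, dim=-1)
+
+ def margin_loss(pos, neg, margin=1.0):
+     return torch.relu(margin + score(*pos) - score(*neg)).mean()
+
+ pos = (torch.tensor([3]), torch.tensor([7]), torch.tensor([42]))     # observed triple
+ neg = (torch.tensor([3]), torch.tensor([7]), torch.tensor([999]))    # corrupted tail
+ print(margin_loss(pos, neg))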
+
+ comment: 14 pages, 5 figures, 4 tables. arXiv admin note: substantial text + overlap with arXiv:2210.08540 +
+
+
+
+
+ + ♻ ☆ ACTI at EVALITA 2023: Overview of the Conspiracy Theory Identification + Task + + +
+ The Conspiracy Theory Identification task is a new shared task proposed for the +first time at Evalita 2023. The ACTI challenge, based exclusively on +comments published on conspiratorial channels of Telegram, is divided into two +subtasks: (i) Conspiratorial Content Classification, identifying conspiratorial +content, and (ii) Conspiratorial Category Classification, i.e., classification into specific +conspiracy theories. A total of fifteen teams participated in the +task for a total of 81 submissions. We show that the best-performing +approaches were based on the utilization of large language models. We finally +draw conclusions about the utilization of these models for counteracting the +spread of misinformation on online platforms. +
+
+ comment: Accepted at the Evalita Workshop 2023 +
+
+
+
+
+ + ♻ ☆ Social media mining for toxicovigilance of prescription medications: + End-to-end pipeline, challenges and future work + + +
+ Substance use, substance use disorder, and overdoses related to substance use +are major public health problems globally and in the United States. A key +aspect of addressing these problems from a public health standpoint is improved +surveillance. Traditional surveillance systems are laggy, and social media are +potentially useful sources of timely data. However, mining knowledge from +social media is challenging, and requires the development of advanced +artificial intelligence, specifically natural language processing (NLP) and +machine learning methods. We developed a sophisticated end-to-end pipeline for +mining information about nonmedical prescription medication use from social +media, namely Twitter and Reddit. Our pipeline employs supervised machine +learning and NLP for filtering out noise and characterizing the chatter. In +this paper, we describe our end-to-end pipeline developed over four years. In +addition to describing our data mining infrastructure, we discuss existing +challenges in social media mining for toxicovigilance, and possible future +research directions. + +
+
+
+
+
+ + ♻ ☆ UniDoc: A Universal Large Multimodal Model for Simultaneous Text + Detection, Recognition, Spotting and Understanding + + +
+ In the era of Large Language Models (LLMs), tremendous strides have been made +in the field of multimodal understanding. However, existing advanced algorithms +are limited in their ability to effectively utilize the immense representation capabilities +and rich world knowledge inherent to these large pre-trained models, and the +beneficial connections among tasks within the context of text-rich scenarios +have not been sufficiently explored. In this work, we introduce UniDoc, a novel +multimodal model equipped with text detection and recognition capabilities, +which are deficient in existing approaches. Moreover, UniDoc capitalizes on the +beneficial interactions among tasks to enhance the performance of each +individual task. To implement UniDoc, we perform unified multimodal instruct +tuning on the contributed large-scale instruction-following datasets. +Quantitative and qualitative experimental results show that UniDoc sets +state-of-the-art scores across multiple challenging benchmarks. To the best of +our knowledge, this is the first large multimodal model capable of simultaneous +text detection, recognition, spotting, and understanding. +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Hessian-aware Quantized Node Embeddings for Recommendation + + +
+ Graph Neural Networks (GNNs) have achieved state-of-the-art performance in +recommender systems. Nevertheless, the process of searching and ranking from a +large item corpus usually requires high latency, which limits the widespread +deployment of GNNs in industry-scale applications. To address this issue, many +methods compress user/item representations into the binary embedding space to +reduce space requirements and accelerate inference. Also, they use the +Straight-through Estimator (STE) to prevent vanishing gradients during +back-propagation. However, the STE often causes the gradient mismatch problem, +leading to sub-optimal results. + In this work, we present the Hessian-aware Quantized GNN (HQ-GNN) as an +effective solution for discrete representations of users/items that enable fast +retrieval. HQ-GNN is composed of two components: a GNN encoder for learning +continuous node embeddings and a quantized module for compressing +full-precision embeddings into low-bit ones. Consequently, HQ-GNN benefits from +both lower memory requirements and faster inference speeds compared to vanilla +GNNs. To address the gradient mismatch problem in STE, we further consider the +quantization errors and their second-order derivatives for better stability. The +experimental results on several large-scale datasets show that HQ-GNN achieves +a good balance between latency and performance. +
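+
+ The straight-through estimator mentioned above can be written as a two-line autograd trick: quantize in the forward pass, pass gradients through unchanged in the backward pass. The sketch shows plain STE binarization only; HQ-GNN's Hessian-aware treatment of the quantization error is not reproduced here.
+
+ # Plain straight-through estimator (STE) for binarizing GNN embeddings.
+ import torch
+
+ class BinarizeSTE(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, x):
+         return torch.sign(x)            # forward: hard 1-bit quantization
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         return grad_output              # backward: act as the identity
+
+ emb = torch.randn(4, 8, requires_grad=True)   # continuous node embeddings from a GNN
+ codes = BinarizeSTE.apply(emb)                # {-1, +1} codes used for fast retrieval
+ loss = (codes - torch.ones_like(codes)).pow(2).mean()
+ loss.backward()
+ print(emb.grad.shape)                         # gradients reach the encoder despite sign()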
+
+
+
+
+ + ☆ Zero-Shot Recommendations with Pre-Trained Large Language Models for + Multimodal Nudging + + +
+ We present a method for zero-shot recommendation of multimodal non-stationary +content that leverages recent advancements in the field of generative AI. We +propose rendering inputs of different modalities as textual descriptions and +utilizing pre-trained LLMs to obtain their numerical representations by computing +semantic embeddings. Once unified representations of all content items are +obtained, the recommendation can be performed by computing an appropriate +similarity metric between them without any additional learning. We demonstrate +our approach on a synthetic multimodal nudging environment, where the inputs +consist of tabular, textual, and visual data. +
+
+
+
+
+ + ☆ MPTopic: Improving topic modeling via Masked Permuted pre-training ECIR 2024 + + +
+ Topic modeling is pivotal in discerning hidden semantic structures within +texts, thereby generating meaningful descriptive keywords. While innovative +techniques like BERTopic and Top2Vec have recently emerged in the forefront, +they manifest certain limitations. Our analysis indicates that these methods +might not prioritize the refinement of their clustering mechanism, potentially +compromising the quality of derived topic clusters. To illustrate, Top2Vec +designates the centroids of clustering results to represent topics, whereas +BERTopic harnesses C-TF-IDF for its topic extraction. In response to these +challenges, we introduce "TF-RDF" (Term Frequency - Relative Document +Frequency), a distinctive approach to assess the relevance of terms within a +document. Building on the strengths of TF-RDF, we present MPTopic, a clustering +algorithm intrinsically driven by the insights of TF-RDF. Through comprehensive +evaluation, it is evident that the topic keywords identified with the synergy +of MPTopic and TF-RDF outperform those extracted by both BERTopic and Top2Vec. +
+
+ comment: 12 pages, will submit to ECIR 2024 +
+
+
+
+
+ + ☆ Pure Message Passing Can Estimate Common Neighbor for Link Prediction + + +
+ Message Passing Neural Networks (MPNNs) have emerged as the {\em de facto} +standard in graph representation learning. However, when it comes to link +prediction, they often struggle, surpassed by simple heuristics such as Common +Neighbor (CN). This discrepancy stems from a fundamental limitation: while +MPNNs excel in node-level representation, they stumble with encoding the joint +structural features essential to link prediction, like CN. To bridge this gap, +we posit that, by harnessing the orthogonality of input vectors, pure +message-passing can indeed capture joint structural features. Specifically, we +study the proficiency of MPNNs in approximating CN heuristics. Based on our +findings, we introduce the Message Passing Link Predictor (MPLP), a novel link +prediction model. MPLP taps into quasi-orthogonal vectors to estimate +link-level structural features, all while preserving the node-level +complexities. Moreover, our approach demonstrates that leveraging +message-passing to capture structural features could offset MPNNs' +expressiveness limitations at the expense of estimation variance. We conduct +experiments on benchmark datasets from various domains, where our method +consistently outperforms the baseline methods. + +
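+
+ The central observation, that one round of message passing over quasi-orthogonal node vectors estimates common-neighbor counts, can be checked numerically: random signatures are nearly orthogonal, so the inner product of two aggregated neighborhoods concentrates around the size of their intersection. This is a self-contained illustration of that idea, not the MPLP model itself.
+
+ # Inner products of summed random signatures approximate common-neighbor counts.
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ n, d = 200, 4096
+ sig = rng.normal(0.0, 1.0 / np.sqrt(d), size=(n, d))   # quasi-orthogonal node vectors
+
+ A = (rng.random((n, n)) < 0.1).astype(float)           # random undirected graph
+ A = np.triu(A, 1); A = A + A.T
+
+ H = A @ sig                  # one message-passing step: sum the neighbors' signatures
+ est_cn = H @ H.T             # estimated common-neighbor counts
+ true_cn = A @ A              # exact common-neighbor counts
+
+ u, v = 3, 17
+ print(true_cn[u, v], round(est_cn[u, v], 2))   # the estimate should be close to the truth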
+
+ comment: preprint +
+
+
+
+
+ + ☆ From Specific to Generic Learned Sorted Set Dictionaries: A + Theoretically Sound Paradigm Yelding Competitive Data Structural Boosters in + Practice + + +
+ This research concerns Learned Data Structures, a recent area that has +emerged at the crossroads of Machine Learning and Classic Data Structures. It is +methodologically important and has a high practical impact. We focus on +Learned Indexes, i.e., Learned Sorted Set Dictionaries. The proposals available +so far are specific in the sense that they can boost, indeed impressively, the +time performance of Table Search Procedures with a sorted layout only, e.g., +Binary Search. We propose a novel paradigm that, complementing known +specialized ones, can produce Learned versions of any Sorted Set Dictionary, +for instance, Balanced Binary Search Trees or Binary Search on layouts other +than sorted, i.e., Eytzinger. Theoretically, based on it, we obtain several +results of interest, such as (a) the first Learned Optimum Binary Search +Forest, with mean access time bounded by the Entropy of the probability +distribution of the accesses to the Dictionary; (b) the first Learned Sorted +Set Dictionary that, in the Dynamic Case and in an amortized analysis setting, +matches the same time bounds known for Classic Dictionaries. The latter holds under +widely accepted assumptions regarding the size of the Universe. The +experimental part, somewhat complex in terms of software development, clearly +indicates the nonobvious finding that the generalization we propose can yield +effective and competitive Learned Data Structural Boosters, even with respect to +specific benchmark models. +
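+
+ The paradigm can be illustrated with the simplest possible learned dictionary: a linear model predicts a key's position in a sorted array, and a bounded local search corrects the model's worst-case error. The layout and model below are deliberately naive stand-ins, not the constructions analyzed in the paper.
+
+ # Toy learned index over a sorted array: model prediction plus bounded binary search.
+ import bisect
+ import numpy as np
+
+ keys = np.sort(np.random.default_rng(1).integers(0, 10**6, size=10_000))
+ positions = np.arange(len(keys))
+
+ slope, intercept = np.polyfit(keys, positions, 1)        # the "learned" model
+ pred = np.clip(slope * keys + intercept, 0, len(keys) - 1)
+ eps = int(np.ceil(np.abs(pred - positions).max()))       # worst-case model error
+
+ def lookup(q):
+     guess = int(np.clip(slope * q + intercept, 0, len(keys) - 1))
+     lo, hi = max(0, guess - eps), min(len(keys), guess + eps + 1)
+     i = lo + bisect.bisect_left(keys[lo:hi].tolist(), q)  # search only inside the bound
+     return i < len(keys) and keys[i] == q
+
+ print(lookup(int(keys[1234])), lookup(-1))                # True False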
+
+
+
+
+ + ☆ Content Prompting: Modeling Content Provider Dynamics to Improve User + Welfare in Recommender Ecosystems + + +
+ Users derive value from a recommender system (RS) only to the extent that it +is able to surface content (or items) that meet their needs/preferences. While +RSs often have a comprehensive view of user preferences across the entire user +base, content providers, by contrast, generally have only a local view of the +preferences of users that have interacted with their content. This limits a +provider's ability to offer new content to best serve the broader population. +In this work, we tackle this information asymmetry with content prompting +policies. A content prompt is a hint or suggestion to a provider to make +available novel content for which the RS predicts unmet user demand. A +prompting policy is a sequence of such prompts that is responsive to the +dynamics of a provider's beliefs, skills and incentives. We aim to determine a +joint prompting policy that induces a set of providers to make content +available that optimizes user social welfare in equilibrium, while respecting +the incentives of the providers themselves. Our contributions include: (i) an +abstract model of the RS ecosystem, including content provider behaviors, that +supports such prompting; (ii) the design and theoretical analysis of sequential +prompting policies for individual providers; (iii) a mixed integer programming +formulation for optimal joint prompting using path planning in content space; +and (iv) simple, proof-of-concept experiments illustrating how such policies +improve ecosystem health and user welfare. + +
+
+
+
+
+ + ☆ LeanContext: Cost-Efficient Domain-Specific Question Answering Using + LLMs + + +
+ Question-answering (QA) is a significant application of Large Language Models +(LLMs), shaping chatbot capabilities across healthcare, education, and customer +service. However, widespread LLM integration presents a challenge for small +businesses due to the high expenses of LLM API usage. Costs rise rapidly when +domain-specific data (context) is used alongside queries for accurate +domain-specific LLM responses. One option is to summarize the context by using +LLMs and reduce the context. However, this can also filter out useful +information that is necessary to answer some domain-specific queries. In this +paper, we shift from human-oriented summarizers to AI model-friendly summaries. +Our approach, LeanContext, efficiently extracts $k$ key sentences from the +context that are closely aligned with the query. The choice of $k$ is neither +static nor random; we introduce a reinforcement learning technique that +dynamically determines $k$ based on the query and context. The rest of the less +important sentences are reduced using a free open source text reduction method. +We evaluate LeanContext against several recent query-aware and query-unaware +context reduction approaches on prominent datasets (arxiv papers and BBC news +articles). Despite cost reductions of $37.29\%$ to $67.81\%$, LeanContext's +ROUGE-1 score decreases only by $1.41\%$ to $2.65\%$ compared to a baseline +that retains the entire context (no summarization). Additionally, if free +pretrained LLM-based summarizers are used to reduce context (into human +consumable summaries), LeanContext can further modify the reduced context to +enhance the accuracy (ROUGE-1 score) by $13.22\%$ to $24.61\%$. + +
+
+ comment: The paper is under review +
+
+
+
+
+ + ♻ ☆ AutoAlign: Fully Automatic and Effective Knowledge Graph Alignment + enabled by Large Language Models + + +
+ The task of entity alignment between knowledge graphs (KGs) aims to identify +every pair of entities from two different KGs that represent the same entity. +Many machine learning-based methods have been proposed for this task. However, +to our best knowledge, existing methods all require manually crafted seed +alignments, which are expensive to obtain. In this paper, we propose the first +fully automatic alignment method named AutoAlign, which does not require any +manually crafted seed alignments. Specifically, for predicate embeddings, +AutoAlign constructs a predicate-proximity-graph with the help of large +language models to automatically capture the similarity between predicates +across two KGs. For entity embeddings, AutoAlign first computes the entity +embeddings of each KG independently using TransE, and then shifts the two KGs' +entity embeddings into the same vector space by computing the similarity +between entities based on their attributes. Thus, both predicate alignment and +entity alignment can be done without manually crafted seed alignments. +AutoAlign is not only fully automatic, but also highly effective. Experiments +using real-world KGs show that AutoAlign improves the performance of entity +alignment significantly compared to state-of-the-art methods. + +
+
+ comment: 14 pages, 5 figures, 4 tables. arXiv admin note: substantial text + overlap with arXiv:2210.08540 +
+
+
+
+
+ + ♻ ☆ Co-evolving Vector Quantization for ID-based Recommendation + + +
+ Category information plays a crucial role in enhancing the quality and +personalization of recommendations. Nevertheless, the availability of item +category information is not consistently present, particularly in the context +of ID-based recommendations. In this work, we propose an alternative approach +to automatically learn and generate entity (i.e., user and item) categorical +information at different levels of granularity, specifically for ID-based +recommendation. Specifically, we devise a co-evolving vector quantization +framework, namely COVE, which enables the simultaneous learning and refinement +of code representation and entity embedding in an end-to-end manner, starting +from the randomly initialized states. With its high adaptability, COVE can be +easily integrated into existing recommendation models. We validate the +effectiveness of COVE on various recommendation tasks including list +completion, collaborative filtering, and click-through rate prediction, across +different recommendation models. We will publish the code and data for other +researchers to reproduce our work. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Zero-Shot Recommendations with Pre-Trained Large Language Models for + Multimodal Nudging + + +
+ We present a method for zero-shot recommendation of multimodal non-stationary +content that leverages recent advancements in the field of generative AI. We +propose rendering inputs of different modalities as textual descriptions and +utilizing pre-trained LLMs to obtain their numerical representations by computing +semantic embeddings. Once unified representations of all content items are +obtained, the recommendation can be performed by computing an appropriate +similarity metric between them without any additional learning. We demonstrate +our approach on a synthetic multimodal nudging environment, where the inputs +consist of tabular, textual, and visual data. +
+
+
+
+
+ + ♻ ☆ Multi-View Class Incremental Learning + + +
+ Multi-view learning (MVL) has gained great success in integrating information +from multiple perspectives of a dataset to improve downstream task performance. +To make MVL methods more practical in an open-ended environment, this paper +investigates a novel paradigm called multi-view class incremental learning +(MVCIL), where a single model incrementally classifies new classes from a +continual stream of views, requiring no access to earlier views of data. +However, MVCIL is challenged by the catastrophic forgetting of old information +and the interference with learning new concepts. To address this, we first +develop a randomization-based representation learning technique serving for +feature extraction to guarantee their separate view-optimal working states, +during which multiple views belonging to a class are presented sequentially; +Then, we integrate them one by one in the orthogonality fusion subspace spanned +by the extracted features; Finally, we introduce selective weight consolidation +for learning-without-forgetting decision-making while encountering new classes. +Extensive experiments on synthetic and real-world datasets validate the +effectiveness of our approach. + +
+
+ comment: 22 pages,4 figures. Preprint submitted to Information Fusion +
+
+
+
+
+ + ♻ ☆ Adding Conditional Control to Text-to-Image Diffusion Models + + +
+ We present ControlNet, a neural network architecture to add spatial +conditioning controls to large, pretrained text-to-image diffusion models. +ControlNet locks the production-ready large diffusion models, and reuses their +deep and robust encoding layers pretrained with billions of images as a strong +backbone to learn a diverse set of conditional controls. The neural +architecture is connected with "zero convolutions" (zero-initialized +convolution layers) that progressively grow the parameters from zero and ensure +that no harmful noise could affect the finetuning. We test various conditioning +controls, e.g., edges, depth, segmentation, human pose, etc., with Stable +Diffusion, using single or multiple conditions, with or without prompts. We +show that the training of ControlNets is robust with small (<50k) and large +(>1m) datasets. Extensive results show that ControlNet may facilitate wider +applications to control image diffusion models. +
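+
+ A "zero convolution" is simply a convolution whose weights and bias start at exactly zero, so the control branch contributes nothing at initialization and cannot inject noise into the frozen backbone; a minimal PyTorch sketch:
+
+ # Zero-initialized 1x1 convolution: the control branch adds exactly zero at the start.
+ import torch
+ import torch.nn as nn
+
+ def zero_conv(channels: int) -> nn.Conv2d:
+     conv = nn.Conv2d(channels, channels, kernel_size=1)
+     nn.init.zeros_(conv.weight)
+     nn.init.zeros_(conv.bias)
+     return conv
+
+ backbone_feat = torch.randn(1, 64, 32, 32)   # from the locked diffusion backbone
+ control_feat = torch.randn(1, 64, 32, 32)    # from the trainable conditioning branch
+
+ out = backbone_feat + zero_conv(64)(control_feat)
+ print(torch.allclose(out, backbone_feat))    # True; the parameters then grow from zero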
+
+ comment: Codes and Supplementary Material: + https://github.com/lllyasviel/ControlNet +
+
+
+
+
+ + ♻ ☆ Distilled Low Rank Neural Radiance Field with Quantization for Light + Field Compression + + +
+ In this paper, we propose a novel light field compression method based on a +Quantized Distilled Low Rank Neural Radiance Field (QDLR-NeRF) representation. +While existing compression methods encode the set of light field sub-aperture +images, our proposed method instead learns an implicit scene representation in +the form of a Neural Radiance Field (NeRF), which also enables view synthesis. +For reducing its size, the model is first learned under a Low Rank (LR) +constraint using a Tensor Train (TT) decomposition in an Alternating Direction +Method of Multipliers (ADMM) optimization framework. To further reduce the +model size, the components of the tensor train decomposition need to be +quantized. However, performing the optimization of the NeRF model by +simultaneously taking the low rank constraint and the rate-constrained weight +quantization into consideration is challenging. To deal with this difficulty, +we introduce a network distillation operation that separates the low rank +approximation and the weight quantization in the network training. The +information from the initial LR constrained NeRF (LR-NeRF) is distilled to a +model of a much smaller dimension (DLR-NeRF) based on the TT decomposition of +the LR-NeRF. An optimized global codebook is then learned to quantize all TT +components, producing the final QDLRNeRF. Experimental results show that our +proposed method yields better compression efficiency compared with +state-of-the-art methods, and it additionally has the advantage of allowing the +synthesis of any light field view with a high quality. + +
+
+ comment: The explanation of this paper lacks many details and is not well + organized; we withdraw it to avoid misleading readers
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 48 + +
+
+
+ + ☆ Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D + Understanding, Generation, and Instruction Following + + +
+ We introduce Point-Bind, a 3D multi-modality model aligning point clouds with +2D image, language, audio, and video. Guided by ImageBind, we construct a joint +embedding space between 3D and multi-modalities, enabling many promising +applications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D +open-world understanding. On top of this, we further present Point-LLM, the +first 3D large language model (LLM) following 3D multi-modal instructions. By +parameter-efficient fine-tuning techniques, Point-LLM injects the semantics of +Point-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction +data, but exhibits superior 3D and multi-modal question-answering capacity. We +hope our work may cast a light on the community for extending 3D point clouds +to multi-modality applications. Code is available at +https://github.com/ZiyuGuo99/Point-Bind_Point-LLM. + +
+
+ comment: Work in progress. Code is available at + https://github.com/ZiyuGuo99/Point-Bind_Point-LLM +
+
+
+
+
+ + ☆ Baseline Defenses for Adversarial Attacks Against Aligned Language + Models + + +
+ As Large Language Models quickly become ubiquitous, their security +vulnerabilities are critical to understand. Recent work shows that text +optimizers can produce jailbreaking prompts that bypass moderation and +alignment. Drawing from the rich body of work on adversarial machine learning, +we approach these attacks with three questions: What threat models are +practically useful in this domain? How do baseline defense techniques perform +in this new domain? How does LLM security differ from computer vision? + We evaluate several baseline defense strategies against leading adversarial +attacks on LLMs, discussing the various settings in which each is feasible and +effective. Particularly, we look at three types of defenses: detection +(perplexity based), input preprocessing (paraphrase and retokenization), and +adversarial training. We discuss white-box and gray-box settings and discuss +the robustness-performance trade-off for each of the defenses considered. +Surprisingly, we find much more success with filtering and preprocessing than +we would expect from other domains, such as vision, providing a first +indication that the relative strengths of these defenses may be weighed +differently in these domains. + +
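+
+ A perplexity-based detector of the kind evaluated here can be sketched in a few lines: optimizer-generated adversarial suffixes tend to look like gibberish and score much higher perplexity under a small reference LM than natural prompts, so a threshold flags them. The GPT-2 reference model and the threshold value below are illustrative choices, not the paper's exact settings.
+
+ # Sketch of a perplexity filter for suspicious prompts.
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+ @torch.no_grad()
+ def perplexity(text: str) -> float:
+     ids = tok(text, return_tensors="pt").input_ids
+     return float(torch.exp(lm(ids, labels=ids).loss))   # exp of mean token cross-entropy
+
+ def looks_adversarial(prompt: str, threshold: float = 1000.0) -> bool:
+     return perplexity(prompt) > threshold
+
+ for p in ["Please summarize this article about solar energy.",
+           "}{ ]] <<? compute overflow ;; ~~ interface $$"]:
+     print(round(perplexity(p), 1), looks_adversarial(p))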
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ CPSP: Learning Speech Concepts From Phoneme Supervision + + +
+ For fine-grained generation and recognition tasks such as +minimally-supervised text-to-speech (TTS), voice conversion (VC), and automatic +speech recognition (ASR), the intermediate representation extracted from speech +should contain information that is between text coding and acoustic coding. The +linguistic content is salient, while the paralinguistic information such as +speaker identity and acoustic details should be removed. However, existing +methods for extracting fine-grained intermediate representations from speech +suffer from issues of excessive redundancy and dimension explosion. +Additionally, existing contrastive learning methods in the audio field focus on +extracting global descriptive information for downstream audio classification +tasks, making them unsuitable for TTS, VC, and ASR tasks. To address these +issues, we propose a method named Contrastive Phoneme-Speech Pretraining +(CPSP), which uses three encoders, one decoder, and contrastive learning to +bring phoneme and speech into a joint multimodal space, learning how to connect +phoneme and speech at the frame level. The CPSP model is trained on 210k speech +and phoneme text pairs, achieving minimally-supervised TTS, VC, and ASR. The +proposed CPSP method offers a promising solution for fine-grained generation +and recognition downstream tasks in speech processing. We provide a website +with audio samples. + +
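+
+ At its core, the contrastive objective that pulls paired phoneme and speech representations together is a symmetric InfoNCE loss; the sketch below uses random features in place of the actual CPSP encoders and ignores the frame-level alignment details.
+
+ # Symmetric InfoNCE sketch for paired phoneme/speech embeddings.
+ import torch
+ import torch.nn.functional as F
+
+ def contrastive_loss(phoneme_emb, speech_emb, temperature=0.07):
+     p = F.normalize(phoneme_emb, dim=-1)
+     s = F.normalize(speech_emb, dim=-1)
+     logits = p @ s.T / temperature            # pairwise similarities
+     targets = torch.arange(len(p))            # the i-th phoneme matches the i-th speech clip
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.T, targets))
+
+ loss = contrastive_loss(torch.randn(8, 256), torch.randn(8, 256))
+ print(float(loss))   # in real training this would backpropagate into both encoders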
+
+
+
+
+ + ☆ Satisfiability Checking of Multi-Variable TPTL with Unilateral Intervals + Is PSPACE-Complete + + +
+ We investigate the decidability of the ${0,\infty}$ fragment of Timed +Propositional Temporal Logic (TPTL). We show that the satisfiability checking +of TPTL$^{0,\infty}$ is PSPACE-complete. Moreover, even its 1-variable fragment +(1-TPTL$^{0,\infty}$) is strictly more expressive than Metric Interval Temporal +Logic (MITL), for which satisfiability checking is EXPSPACE-complete. Hence, we +have a strictly more expressive logic with computationally easier +satisfiability checking. To the best of our knowledge, TPTL$^{0,\infty}$ is the +first multi-variable fragment of TPTL for which satisfiability checking is +decidable without imposing any bounds/restrictions on the timed words (e.g., +bounded variability, bounded time, etc.). The membership in PSPACE is obtained +by a reduction to the emptiness checking problem for a new "non-punctual" +subclass of Alternating Timed Automata with multiple clocks called Unilateral +Very Weak Alternating Timed Automata (VWATA$^{0,\infty}$), which we prove to be +in PSPACE. We show this by constructing a simulation-equivalent +non-deterministic timed automaton whose number of clocks is polynomial in the +size of the given VWATA$^{0,\infty}$. +
+
+ comment: Accepted in Concur 2023 +
+
+
+
+
+ + ☆ BatchPrompt: Accomplish more with less + + +
+ Many LLMs are trained to perform zero-shot or few-shot inference using +instruction-based prompts. Crafting prompts for these LLMs typically requires +the user to provide a detailed task description, examples of context and +completion, and a single example of context for inference. This regular prompt +baseline is referred to as SinglePrompt in this paper. However, for NLP tasks +where each data point for inference is not necessarily lengthy, the token count +for instructions and few-shot examples in the prompt may be considerably larger +than that of the data point, resulting in lower token-resource utilization +compared with encoder-based models like fine-tuned BERT. This cost-efficiency +issue, affecting inference speed and compute budget, counteracts the many +benefits LLMs have to offer. This paper aims to alleviate the preceding problem +by batching multiple data points into a single prompt, a prompting strategy we +refer to as BatchPrompt. This strategy increases the density of data points, +which in turn leads to improved token utilization. Applying BatchPrompt +naively, however, is very challenging due to significant performance +degradation, as observed in our experiments. We also noticed varying inference +outcomes for the same data point appearing in different positions within a +prompt. To address the quality issue while maintaining high token-resource +utilization, we introduce Batch Permutation and Ensembling for BatchPrompt, a +simple way that recovers labeling quality through majority votes from data +points placed in varying positions in a batch, at the price of more token usage. +To counterbalance the additional token usage caused by the voting process, we +further propose Self-reflection-guided EArly Stopping, which can terminate the +voting process early for data points the LLM confidently handles. +
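+
+ The batching-and-voting idea can be made concrete with a short sketch: several data points are packed into one prompt, the batch is re-sent under a few random permutations, and each data point keeps the majority label across permutations. The llm function below is a stub standing in for any instruction-following model, and the early-stopping refinement is omitted.
+
+ # Sketch of BatchPrompt-style batching with permutation-and-vote.
+ import random
+ from collections import Counter
+
+ def llm(prompt: str) -> str:
+     # Placeholder: pretend the model labels every numbered item "positive".
+     items = [l for l in prompt.splitlines() if l[:1].isdigit()]
+     return "\n".join("positive" for _ in items)
+
+ def batch_prompt(sentences, votes=3):
+     tally = {s: Counter() for s in sentences}
+     for _ in range(votes):                           # Batch Permutation and Ensembling
+         order = random.sample(sentences, len(sentences))
+         prompt = ("Label each sentence as positive or negative:\n" +
+                   "\n".join(f"{i + 1}. {s}" for i, s in enumerate(order)))
+         for s, label in zip(order, llm(prompt).splitlines()):
+             tally[s][label.strip()] += 1
+     return {s: c.most_common(1)[0][0] for s, c in tally.items()}   # majority vote
+
+ print(batch_prompt(["Great movie!", "Terrible plot.", "Loved the score."]))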
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Long-Term Memorability On Advertisements + + +
+ Marketers spend billions of dollars on advertisements, but to what end? At +purchase time, if customers cannot recognize a brand for which they saw an ad, +the money spent on the ad is essentially wasted. Despite its importance in +marketing, until now, there has been no study on the memorability of ads in the +ML literature. Most studies have been conducted on short-term recall (<5 mins) +on specific content types like object and action videos. On the other hand, the +advertising industry only cares about long-term memorability (a few hours or +longer), and advertisements are almost always highly multimodal, depicting a +story through its different modalities (text, images, and videos). With this +motivation, we conduct the first large-scale memorability study consisting of +1203 participants and 2205 ads covering 276 brands. Running statistical tests +over different participant subpopulations and ad-types, we find many +interesting insights into what makes an ad memorable - both content and human +factors. For example, we find that brands that use commercials with fast-moving +scenes are more memorable than those with slower scenes (p=8e-10) and +that people who use ad-blockers remember fewer ads than those who +don't (p=5e-3). Further, with the motivation of simulating the memorability of +marketing materials for a particular audience, ultimately helping create one, +we present a novel model, Sharingan, trained to leverage the real-world knowledge +of LLMs and the visual knowledge of visual encoders to predict the memorability of +a piece of content. We test our model on all the prominent memorability datasets in the +literature (both images and videos) and achieve state-of-the-art results across all of +them. We conduct extensive ablation studies across memory types, modality, +brand, and architectural choices to find insights into what drives memory. +
+
+
+
+
+ + ☆ When Do Discourse Markers Affect Computational Sentence Understanding? + + +
+ The capabilities and use cases of automatic natural language processing (NLP) +have grown significantly over the last few years. While much work has been +devoted to understanding how humans deal with discourse connectives, this +phenomenon is understudied in computational systems. Therefore, it is important +to put NLP models under the microscope and examine whether they can adequately +comprehend, process, and reason within the complexity of natural language. In +this chapter, we introduce the main mechanisms behind automatic sentence +processing systems step by step and then focus on evaluating discourse +connective processing. We assess nine popular systems in their ability to +understand English discourse connectives and analyze how context and language +understanding tasks affect their connective comprehension. The results show +that NLP systems do not process all discourse connectives equally well and that +the computational processing complexity of different connective kinds is not +always consistently in line with the presumed complexity order found in human +processing. In addition, while humans tend to be influenced during +reading but not necessarily in their final comprehension +performance, discourse connectives have a significant impact on the final +accuracy of NLP systems. The richer the knowledge of connectives a system learns, +the more negative the effect inappropriate connectives have on it. This suggests +that the correct explicitation of discourse connectives is important for +computational natural language processing. +
+
+ comment: Chapter 7 of Discourse Markers in Interaction, published in Trends in + Linguistics. Studies and Monographs +
+
+
+
+
+ + ☆ Large Content And Behavior Models To Understand, Simulate, And Optimize + Content And Behavior + + +
+ Shannon, in his seminal paper introducing information theory, divided +communication into three levels: technical, semantic, and effectiveness. While +the technical level is concerned with accurate reconstruction of transmitted +symbols, the semantic and effectiveness levels deal with the inferred meaning +and its effect on the receiver. Thanks to telecommunications, work on the first-level +problem has produced great advances like the internet. Large Language Models +(LLMs) make some progress towards the second goal, but the third level +remains largely untouched. The third problem deals with predicting and +optimizing communication for desired receiver behavior. LLMs, while showing +wide generalization capabilities across a wide range of tasks, are unable to +solve this. One reason for the underperformance could be a lack of +"behavior tokens" in LLMs' training corpora. Behavior tokens define receiver +behavior over a communication, such as shares, likes, clicks, purchases, +retweets, etc. While preprocessing data for LLM training, behavior tokens are +often removed from the corpora as noise. Therefore, in this paper, we make some +initial progress towards reintroducing behavior tokens in LLM training. The +trained models, besides showing performance similar to LLMs on content +understanding tasks, show generalization capabilities on behavior simulation, +content simulation, behavior understanding, and behavior domain adaptation. +Using a wide range of tasks on two corpora, we show results on all these +capabilities. We call these models Large Content and Behavior Models (LCBMs). +Further, to spur more research on LCBMs, we release our new Content Behavior +Corpus (CBC), a repository containing communicator, message, and corresponding +receiver behavior. +
+
+
+
+
+ + ☆ Comparative Topic Modeling for Determinants of Divergent Report Results + Applied to Macular Degeneration Studies + + +
+ Topic modeling and text mining are subsets of Natural Language Processing +with relevance for conducting meta-analysis (MA) and systematic review (SR). +For evidence synthesis, the above NLP methods are conventionally used for +topic-specific literature searches or extracting values from reports to +automate essential phases of SR and MA. Instead, this work proposes a +comparative topic modeling approach to analyze reports of contradictory results +on the same general research question. Specifically, the objective is to find +topics exhibiting distinct associations with significant results for an outcome +of interest by ranking them according to their proportional occurrence and +consistency of distribution across reports of significant results. The proposed +method was tested on broad-scope studies addressing whether supplemental +nutritional compounds significantly benefit macular degeneration (MD). Eight +compounds were identified as having a particular association with reports of +significant results for benefitting MD. Six of these were further supported in +terms of effectiveness upon conducting a follow-up literature search for +validation (omega-3 fatty acids, copper, zeaxanthin, lutein, zinc, and +nitrates). The two not supported by the follow-up literature search (niacin and +molybdenum) also had the lowest scores under the proposed method's ranking +system, suggesting that the proposed method's score for a given topic is a +viable proxy for its degree of association with the outcome of interest. These +results underpin the proposed method's potential to add specificity in +understanding effects from broad-scope reports, elucidate topics of interest +for future research, and guide evidence synthesis in a systematic and scalable +way. +
+
+
+
+
+ + ☆ Enhancing the vocal range of single-speaker singing voice synthesis with + melody-unsupervised pre-training + + +
+ The single-speaker singing voice synthesis (SVS) usually underperforms at +pitch values that are out of the singer's vocal range or associated with +limited training samples. Based on our previous work, this work proposes a +melody-unsupervised multi-speaker pre-training method conducted on a +multi-singer dataset to enhance the vocal range of the single-speaker, while +not degrading the timbre similarity. This pre-training method can be deployed +to a large-scale multi-singer dataset, which only contains audio-and-lyrics +pairs without phonemic timing information and pitch annotation. Specifically, +in the pre-training step, we design a phoneme predictor to produce the +frame-level phoneme probability vectors as the phonemic timing information and +a speaker encoder to model the timbre variations of different singers, and +directly estimate the frame-level f0 values from the audio to provide the pitch +information. These pre-trained model parameters are delivered into the +fine-tuning step as prior knowledge to enhance the single speaker's vocal +range. Moreover, this work also contributes to improving the sound quality and +rhythm naturalness of the synthesized singing voices. It is the first to +introduce a differentiable duration regulator to improve the rhythm naturalness +of the synthesized voice, and a bi-directional flow model to improve the sound +quality. Experimental results verify that the proposed SVS system outperforms +the baseline on both sound quality and naturalness. + +
+
+
+
+
+ + ☆ RLAIF: Scaling Reinforcement Learning from Human Feedback with AI + Feedback + + +
+ Reinforcement learning from human feedback (RLHF) is effective at aligning +large language models (LLMs) to human preferences, but gathering high quality +human preference labels is a key bottleneck. We conduct a head-to-head +comparison of RLHF vs. RL from AI Feedback (RLAIF) - a technique where +preferences are labeled by an off-the-shelf LLM in lieu of humans, and we find +that they result in similar improvements. On the task of summarization, human +evaluators prefer generations from both RLAIF and RLHF over a baseline +supervised fine-tuned model in ~70% of cases. Furthermore, when asked to rate +RLAIF vs. RLHF summaries, humans prefer both at equal rates. These results +suggest that RLAIF can yield human-level performance, offering a potential +solution to the scalability limitations of RLHF. + +
+
+
+
+
+ + ☆ Why do universal adversarial attacks work on large language models?: + Geometry might be the answer + + +
+ Transformer based large language models with emergent capabilities are +becoming increasingly ubiquitous in society. However, the task of understanding +and interpreting their internal workings, in the context of adversarial +attacks, remains largely unsolved. Gradient-based universal adversarial attacks +have been shown to be highly effective on large language models and potentially +dangerous due to their input-agnostic nature. This work presents a novel +geometric perspective explaining universal adversarial attacks on large +language models. By attacking the 117M parameter GPT-2 model, we find evidence +indicating that universal adversarial triggers could be embedding vectors which +merely approximate the semantic information in their adversarial training +region. This hypothesis is supported by white-box model analysis comprising +dimensionality reduction and similarity measurement of hidden representations. +We believe this new geometric perspective on the underlying mechanism driving +universal attacks could help us gain deeper insight into the internal workings +and failure modes of LLMs, thus enabling their mitigation. + +
+
+ comment: 2nd AdvML Frontiers Workshop at 40th International Conference on + Machine Learning, Honolulu, Hawaii, USA, 2023 +
+
+
+
+
+ + ☆ Detecting Suicidality in Arabic Tweets Using Machine Learning and Deep + Learning Techniques + + +
+ Social media platforms have revolutionized traditional communication
+techniques by enabling people globally to connect instantaneously, openly, and
+frequently. People use social media to share personal stories and express
+their opinions. Negative emotions such as thoughts of death, self-harm, and
+hardship are commonly expressed on social media, particularly among younger
+generations. As a result, using social media to detect suicidal thoughts will
+help provide proper intervention that will ultimately deter others from
+self-harm and committing suicide and stop the spread of suicidal ideation on
+social media. To investigate the ability to detect suicidal thoughts in Arabic
+tweets automatically, we developed a novel Arabic suicidal tweets dataset,
+examined several machine learning models, including Naïve Bayes, Support
+Vector Machine, K-Nearest Neighbor, Random Forest, and XGBoost, trained on
+word frequency and word embedding features, and investigated the ability of
+pre-trained deep learning models, AraBert, AraELECTRA, and AraGPT2, to
+identify suicidal thoughts in Arabic tweets. The results indicate that SVM and
+RF models trained on character n-gram features provided the best performance
+among the machine learning models, with 86% accuracy and an F1 score of 79%.
+The results of the deep learning models show that the AraBert model
+outperforms other machine and deep learning models, achieving an accuracy of
+91% and an F1-score of 88%, which significantly improves the detection of
+suicidal ideation in the Arabic tweets dataset. To the best of our knowledge,
+this is the first study to develop an Arabic suicidality detection dataset
+from Twitter and to use deep-learning approaches in detecting suicidality in
+Arabic posts.
+
+&#13;
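+ For readers who want to reproduce the flavor of the best classical baseline
+reported above (an SVM over character n-gram features), a minimal scikit-learn
+sketch with placeholder data might look as follows; it is illustrative only
+and not the authors' code.
+```python
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.pipeline import make_pipeline
+from sklearn.svm import LinearSVC
+
+# Hypothetical toy data; the paper uses a purpose-built Arabic tweet dataset.
+tweets = ["example tweet one", "example tweet two"]
+labels = [0, 1]  # 0 = non-suicidal, 1 = suicidal
+
+model = make_pipeline(
+    TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5)),  # character n-grams
+    LinearSVC(),
+)
+model.fit(tweets, labels)
+print(model.predict(["another example tweet"]))
+```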
+
+
+
+
+ + ☆ NeuroSurgeon: A Toolkit for Subnetwork Analysis + + +
+ Despite recent advances in the field of explainability, much remains unknown
+about the algorithms that neural networks learn to represent. Recent work has
+attempted to understand trained models by decomposing them into functional
+circuits (Csordás et al., 2020; Lepori et al., 2023). To advance this
+research, we developed NeuroSurgeon, a Python library that can be used to
+discover and manipulate subnetworks within models in the Huggingface
+Transformers library (Wolf et al., 2019). NeuroSurgeon is freely available at
+https://github.com/mlepori1/NeuroSurgeon.
+
+&#13;
+
+
+
+
+ + ☆ FactLLaMA: Optimizing Instruction-Following Language Models with + External Knowledge for Automated Fact-Checking SC 2023 + + +
+ Automatic fact-checking plays a crucial role in combating the spread of +misinformation. Large Language Models (LLMs) and Instruction-Following +variants, such as InstructGPT and Alpaca, have shown remarkable performance in +various natural language processing tasks. However, their knowledge may not +always be up-to-date or sufficient, potentially leading to inaccuracies in +fact-checking. To address this limitation, we propose combining the power of +instruction-following language models with external evidence retrieval to +enhance fact-checking performance. Our approach involves leveraging search +engines to retrieve relevant evidence for a given input claim. This external +evidence serves as valuable supplementary information to augment the knowledge +of the pretrained language model. Then, we instruct-tune an open-sourced +language model, called LLaMA, using this evidence, enabling it to predict the +veracity of the input claim more accurately. To evaluate our method, we +conducted experiments on two widely used fact-checking datasets: RAWFC and +LIAR. The results demonstrate that our approach achieves state-of-the-art +performance in fact-checking tasks. By integrating external evidence, we bridge +the gap between the model's knowledge and the most up-to-date and sufficient +context available, leading to improved fact-checking outcomes. Our findings +have implications for combating misinformation and promoting the dissemination +of accurate information on online platforms. Our released materials are +accessible at: https://thcheung.github.io/factllama. + +
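+ The evidence-augmented prompting idea described above can be sketched roughly
+as below; `search` and `generate` are hypothetical stand-ins for a
+search-engine API and the instruct-tuned model, and the label set is only
+illustrative.
+```python
+def search(claim: str, k: int = 3) -> list:
+    """Hypothetical search-engine call returning k evidence snippets."""
+    raise NotImplementedError
+
+def generate(prompt: str) -> str:
+    """Hypothetical call to the instruction-tuned model."""
+    raise NotImplementedError
+
+def check_claim(claim: str) -> str:
+    evidence = "\n".join(f"- {snippet}" for snippet in search(claim))
+    prompt = (
+        "Given the evidence below, classify the claim (e.g., true, half-true, false).\n"
+        f"Claim: {claim}\nEvidence:\n{evidence}\nVerdict:"
+    )
+    return generate(prompt).strip()
+```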
+
+ comment: Accepted in APSIPA ASC 2023 +
+
+
+
+
+ + ☆ ALJP: An Arabic Legal Judgment Prediction in Personal Status Cases Using + Machine Learning Models + + +
+ Legal Judgment Prediction (LJP) aims to predict judgment outcomes based on
+case descriptions. Several researchers have developed techniques to assist
+potential clients by predicting the outcome in the legal profession. However,
+none of the proposed techniques were implemented in Arabic, and only a few
+attempts were implemented in English, Chinese, and Hindi. In this paper, we
+develop a system that utilizes deep learning (DL) and natural language
+processing (NLP) techniques to predict the judgment outcome from Arabic case
+scripts, especially in cases of custody and annulment of marriage. This system
+will assist judges and attorneys in improving their work and time efficiency
+while reducing sentencing disparity. In addition, it will help litigants,
+lawyers, and law students analyze the probable outcomes of any given case
+before trial. We use different machine learning and deep learning models, such
+as Support Vector Machine (SVM), Logistic Regression (LR), Long Short-Term
+Memory (LSTM), and Bidirectional Long Short-Term Memory (BiLSTM), with
+representation techniques such as TF-IDF and word2vec on the developed
+dataset. Experimental results demonstrate that compared with the five baseline
+methods, the SVM model with word2vec and LR with TF-IDF achieve the highest
+accuracies of 88% and 78% in predicting the judgment on custody cases and
+annulment of marriage, respectively. Furthermore, the LR and SVM with word2vec
+and the BiLSTM model with TF-IDF achieved the highest accuracies of 88% and
+69% in predicting the probability of outcomes on custody cases and annulment
+of marriage, respectively.
+
+&#13;
+
+
+
+
+ + ☆ Publicly Shareable Clinical Large Language Model Built on Synthetic + Clinical Notes + + +
+ The development of large language models tailored for handling patients' +clinical notes is often hindered by the limited accessibility and usability of +these notes due to strict privacy regulations. To address these challenges, we +first create synthetic large-scale clinical notes using publicly available case +reports extracted from biomedical literature. We then use these synthetic notes +to train our specialized clinical large language model, Asclepius. While +Asclepius is trained on synthetic data, we assess its potential performance in +real-world applications by evaluating it using real clinical notes. We +benchmark Asclepius against several other large language models, including +GPT-3.5-turbo and other open-source alternatives. To further validate our +approach using synthetic notes, we also compare Asclepius with its variants +trained on real clinical notes. Our findings convincingly demonstrate that +synthetic clinical notes can serve as viable substitutes for real ones when +constructing high-performing clinical language models. This conclusion is +supported by detailed evaluations conducted by both GPT-4 and medical +professionals. All resources including weights, codes, and data used in the +development of Asclepius are made publicly accessible for future research. + +
+
+ comment: https://github.com/starmpcc/Asclepius +
+
+
+
+
+ + ☆ Image Hijacking: Adversarial Images can Control Generative Models at + Runtime + + +
+ Are foundation models secure from malicious actors? In this work, we focus on
+the image input to a vision-language model (VLM). We discover image hijacks,
+adversarial images that control generative models at runtime. We introduce
+Behavior Matching, a general method for creating image hijacks, and we use it
+to explore three types of attacks. Specific string attacks generate arbitrary
+output of the adversary's choosing. Leak context attacks leak information from
+the context window into the output. Jailbreak attacks circumvent a model's
+safety training. We study these attacks against LLaVA-2, a state-of-the-art VLM
+based on CLIP and LLaMA-2, and find that all our attack types have above a 90%
+success rate. Moreover, our attacks are automated and require only small image
+perturbations. These findings raise serious concerns about the security of
+foundation models. If image hijacks are as difficult to defend against as
+adversarial examples in CIFAR-10, then it might be many years before a solution
+is found -- if it even exists.
+
+&#13;
+
+ comment: Code is available at https://github.com/euanong/image-hijacks +
+
+
+
+
+ + ☆ JoTR: A Joint Transformer and Reinforcement Learning Framework for + Dialog Policy Learning + + +
+ Dialogue policy learning (DPL) is a crucial component of dialogue modelling.
+Its primary role is to determine the appropriate abstract response, commonly
+referred to as the "dialogue action". Traditional DPL methodologies have
+treated this as a sequential decision problem, using pre-defined action
+candidates extracted from a corpus. However, these incomplete candidates can
+significantly limit the diversity of responses and pose challenges when dealing
+with edge cases, which are scenarios that occur only at extreme operating
+parameters. To address these limitations, we introduce a novel framework, JoTR.
+This framework is unique as it leverages a text-to-text Transformer-based model
+to generate flexible dialogue actions. Unlike traditional methods, JoTR
+formulates a word-level policy that allows for a more dynamic and adaptable
+dialogue action generation, without the need for any action templates. This
+setting enhances the diversity of responses and improves the system's ability
+to handle edge cases effectively. In addition, JoTR employs reinforcement
+learning with a reward-shaping mechanism to efficiently finetune the word-level
+dialogue policy, which allows the model to learn from its interactions,
+improving its performance over time. We conducted an extensive evaluation of
+JoTR, which shows that it achieves state-of-the-art performance on two
+benchmark dialogue modelling tasks, as assessed by both user simulators and
+human evaluators.
+
+&#13;
+
+ comment: Our code, models and other related resources are publicly available + at https://github.com/KwanWaiChung/JoTR +
+
+
+
+
+ + ☆ The FruitShell French synthesis system at the Blizzard 2023 Challenge + + +
+ This paper presents a French text-to-speech synthesis system for the Blizzard +Challenge 2023. The challenge consists of two tasks: generating high-quality +speech from female speakers and generating speech that closely resembles +specific individuals. Regarding the competition data, we conducted a screening +process to remove missing or erroneous text data. We organized all symbols +except for phonemes and eliminated symbols that had no pronunciation or zero +duration. Additionally, we added word boundary and start/end symbols to the +text, which we have found to improve speech quality based on our previous +experience. For the Spoke task, we performed data augmentation according to the +competition rules. We used an open-source G2P model to transcribe the French +texts into phonemes. As the G2P model uses the International Phonetic Alphabet +(IPA), we applied the same transcription process to the provided competition +data for standardization. However, due to compiler limitations in recognizing +special symbols from the IPA chart, we followed the rules to convert all +phonemes into the phonetic scheme used in the competition data. Finally, we +resampled all competition audio to a uniform sampling rate of 16 kHz. We +employed a VITS-based acoustic model with the hifigan vocoder. For the Spoke +task, we trained a multi-speaker model and incorporated speaker information +into the duration predictor, vocoder, and flow layers of the model. The +evaluation results of our system showed a quality MOS score of 3.6 for the Hub +task and 3.4 for the Spoke task, placing our system at an average level among +all participating teams. + +
+
+
+
+
+ + ☆ Towards Addressing the Misalignment of Object Proposal Evaluation for + Vision-Language Tasks via Semantic Grounding WACV 2024 + + +
+ Object proposal generation serves as a standard pre-processing step in +Vision-Language (VL) tasks (image captioning, visual question answering, etc.). +The performance of object proposals generated for VL tasks is currently +evaluated across all available annotations, a protocol that we show is +misaligned - higher scores do not necessarily correspond to improved +performance on downstream VL tasks. Our work serves as a study of this +phenomenon and explores the effectiveness of semantic grounding to mitigate its +effects. To this end, we propose evaluating object proposals against only a +subset of available annotations, selected by thresholding an annotation +importance score. Importance of object annotations to VL tasks is quantified by +extracting relevant semantic information from text describing the image. We +show that our method is consistent and demonstrates greatly improved alignment +with annotations selected by image captioning metrics and human annotation when +compared against existing techniques. Lastly, we compare current detectors used +in the Scene Graph Generation (SGG) benchmark as a use case, which serves as an +example of when traditional object proposal evaluation techniques are +misaligned. + +
+
+ comment: Accepted to WACV 2024 (Round 1) +
+
+
+
+
+ + ☆ Large Language Models for Semantic Monitoring of Corporate Disclosures: + A Case Study on Korea's Top 50 KOSPI Companies + + +
+ In the rapidly advancing domain of artificial intelligence, state-of-the-art +language models such as OpenAI's GPT-3.5-turbo and GPT-4 offer unprecedented +opportunities for automating complex tasks. This research paper delves into the +capabilities of these models for semantically analyzing corporate disclosures +in the Korean context, specifically for timely disclosure. The study focuses on +the top 50 publicly traded companies listed on the Korean KOSPI, based on +market capitalization, and scrutinizes their monthly disclosure summaries over +a period of 17 months. Each summary was assigned a sentiment rating on a scale +ranging from 1(very negative) to 5(very positive). To gauge the effectiveness +of the language models, their sentiment ratings were compared with those +generated by human experts. Our findings reveal a notable performance disparity +between GPT-3.5-turbo and GPT-4, with the latter demonstrating significant +accuracy in human evaluation tests. The Spearman correlation coefficient was +registered at 0.61, while the simple concordance rate was recorded at 0.82. +This research contributes valuable insights into the evaluative characteristics +of GPT models, thereby laying the groundwork for future innovations in the +field of automated semantic monitoring. + +
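+ The two agreement statistics quoted above can be computed as in the following
+SciPy sketch; the rating vectors are made-up placeholders, not the study's
+data.
+```python
+from scipy.stats import spearmanr
+
+human = [3, 4, 2, 5, 1, 4, 3]   # hypothetical human sentiment ratings (1-5)
+model = [3, 5, 2, 4, 1, 4, 2]   # hypothetical GPT-4 ratings for the same items
+
+rho, _ = spearmanr(human, model)
+concordance = sum(h == m for h, m in zip(human, model)) / len(human)
+print(f"Spearman rho = {rho:.2f}, simple concordance = {concordance:.2f}")
+```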
+
+
+
+
+ + ☆ Exploring the law of text geographic information + + +
+ Textual geographic information is indispensable and heavily relied upon in
+practical applications. The absence of a clear distribution poses challenges in
+effectively harnessing geographic information, thereby driving our quest for
+exploration. We contend that geographic information is influenced by human
+behavior, cognition, expression, and thought processes, and given our intuitive
+understanding of natural systems, we hypothesize its conformity to the Gamma
+distribution. Through rigorous experiments on a diverse range of 24 datasets
+encompassing different languages and types, we have substantiated this
+hypothesis, unearthing the underlying regularities governing the dimensions of
+quantity, length, and distance in geographic information. Furthermore,
+theoretical analyses and comparisons with Gaussian distributions and Zipf's law
+have refuted the contingency of these laws. Significantly, we have estimated
+the upper bounds of human utilization of geographic information, pointing
+towards the existence of uncharted territories. We also provide guidance for
+geographic information extraction, and we hope this work helps lift the veil on
+textual geographic information and reveal its true nature.
+
+&#13;
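+ A minimal sketch of testing a Gamma-distribution hypothesis on one dimension
+(e.g., mention counts per document) is shown below; the counts are
+placeholders and the procedure is only indicative of the kind of fit the paper
+performs.
+```python
+import numpy as np
+from scipy import stats
+
+# Hypothetical counts of geographic mentions per document.
+counts = np.array([1, 2, 2, 3, 3, 3, 4, 5, 5, 7, 8, 12], dtype=float)
+
+shape, loc, scale = stats.gamma.fit(counts, floc=0)  # fix location at 0
+ks_stat, p_value = stats.kstest(counts, "gamma", args=(shape, loc, scale))
+print(f"shape={shape:.2f}, scale={scale:.2f}, KS p-value={p_value:.3f}")
+```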
+
+ comment: IPM +
+
+
+
+
+ + ☆ Will Sentiment Analysis Need Subculture? A New Data Augmentation + Approach + + +
+ The renowned proverb that "The pen is mightier than the sword" underscores
+the formidable influence wielded by text expressions in shaping sentiments.
+Indeed, well-crafted writing can deeply resonate within cultures, conveying
+profound sentiments. Nowadays, the omnipresence of the Internet has fostered a
+subculture that congregates around the contemporary milieu. The subculture
+artfully articulates the intricacies of human feelings by ardently pursuing the
+allure of novelty, a fact that cannot be disregarded in sentiment analysis.
+This paper strives to enrich data through the lens of subculture, to address
+the shortage of training data faced by sentiment analysis. To this end, a new
+approach of subculture-based data augmentation (SCDA) is proposed, which
+engenders six enhanced texts for each training text by leveraging the creation
+of six diverse subculture expression generators. The extensive experiments
+attest to the effectiveness and potential of SCDA. The results also shed light
+on the phenomenon that disparate subculture expressions elicit varying degrees
+of sentiment stimulation. Moreover, an intriguing conjecture arises, suggesting
+the linear reversibility of certain subculture expressions. It is our fervent
+aspiration that this study serves as a catalyst in fostering heightened
+perceptiveness towards the tapestry of information, sentiment and culture,
+thereby enriching our collective understanding.
+
+&#13;
+
+ comment: JASIST +
+
+
+
+
+ + ☆ Efficient RLHF: Reducing the Memory Usage of PPO + + +
+ Reinforcement Learning with Human Feedback (RLHF) has revolutionized language
+modeling by aligning models with human preferences. However, the RL stage,
+Proximal Policy Optimization (PPO), requires over 3x the memory of Supervised
+Fine-Tuning (SFT), making it infeasible to use for most practitioners. To
+address this issue, we present a comprehensive analysis of the memory usage,
+performance, and training time of memory-saving techniques for PPO. We
+introduce Hydra-RLHF by first integrating the SFT and Reward models and then
+dynamically turning LoRA "off" during training. Our experiments show: 1. Using
+LoRA during PPO reduces its memory usage to be smaller than SFT while improving
+alignment across four public benchmarks, and 2. Hydra-PPO reduces the latency
+per sample of LoRA-PPO by up to 65% while maintaining its performance. Our
+results demonstrate that Hydra-PPO is a simple and promising solution for
+enabling more widespread usage of RLHF.
+
+&#13;
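+ The "LoRA off" mechanism mentioned above can be illustrated with a
+self-contained toy layer whose low-rank update is toggled at run time; this is
+a sketch of the general idea, not the Hydra-RLHF implementation.
+```python
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    def __init__(self, d_in, d_out, r=8, alpha=16):
+        super().__init__()
+        self.base = nn.Linear(d_in, d_out)
+        self.base.weight.requires_grad_(False)   # frozen pretrained weight
+        self.lora_a = nn.Parameter(torch.zeros(r, d_in))
+        self.lora_b = nn.Parameter(torch.zeros(d_out, r))
+        nn.init.normal_(self.lora_a, std=0.02)
+        self.scaling = alpha / r
+        self.lora_enabled = True                 # toggled dynamically
+
+    def forward(self, x):
+        out = self.base(x)
+        if self.lora_enabled:
+            out = out + (x @ self.lora_a.T @ self.lora_b.T) * self.scaling
+        return out
+
+layer = LoRALinear(16, 16)
+layer.lora_enabled = False   # e.g., when the frozen reference behavior is needed
+_ = layer(torch.randn(2, 16))
+```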
+
+
+
+
+ + ☆ Let the Models Respond: Interpreting Language Model Detoxification + Through the Lens of Prompt Dependence + + +
+ Due to language models' propensity to generate toxic or hateful responses, +several techniques were developed to align model generations with users' +preferences. Despite the effectiveness of such methods in improving the safety +of model interactions, their impact on models' internal processes is still +poorly understood. In this work, we apply popular detoxification approaches to +several language models and quantify their impact on the resulting models' +prompt dependence using feature attribution methods. We evaluate the +effectiveness of counter-narrative fine-tuning and compare it with +reinforcement learning-driven detoxification, observing differences in prompt +reliance between the two methods despite their similar detoxification +performances. + +
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Language-Conditioned Change-point Detection to Identify Sub-Tasks in + Robotics Domains + + +
+ In this work, we present an approach to identify sub-tasks within a +demonstrated robot trajectory using language instructions. We identify these +sub-tasks using language provided during demonstrations as guidance to identify +sub-segments of a longer robot trajectory. Given a sequence of natural language +instructions and a long trajectory consisting of image frames and discrete +actions, we want to map an instruction to a smaller fragment of the trajectory. +Unlike previous instruction following works which directly learn the mapping +from language to a policy, we propose a language-conditioned change-point +detection method to identify sub-tasks in a problem. Our approach learns the +relationship between constituent segments of a long language command and +corresponding constituent segments of a trajectory. These constituent +trajectory segments can be used to learn subtasks or sub-goals for planning or +options as demonstrated by previous related work. Our insight in this work is +that the language-conditioned robot change-point detection problem is similar +to the existing video moment retrieval works used to identify sub-segments +within online videos. Through extensive experimentation, we demonstrate a +$1.78_{\pm 0.82}\%$ improvement over a baseline approach in accurately +identifying sub-tasks within a trajectory using our proposed method. Moreover, +we present a comprehensive study investigating sample complexity requirements +on learning this mapping, between language and trajectory sub-segments, to +understand if the video retrieval-based methods are realistic in real robot +scenarios. + +
+
+ comment: 9 Pages, 13 figures, Accepted paper at the RSS 2023 Workshop on + Articulate Robots: Utilizing Language for Robot Learning +
+
+
+
+
+ + ☆ Contextual Biasing of Named-Entities with Large Language Models ICASSP 2024 + + +
+ This paper studies contextual biasing with Large Language Models (LLMs), +where during second-pass rescoring additional contextual information is +provided to a LLM to boost Automatic Speech Recognition (ASR) performance. We +propose to leverage prompts for a LLM without fine tuning during rescoring +which incorporate a biasing list and few-shot examples to serve as additional +information when calculating the score for the hypothesis. In addition to +few-shot prompt learning, we propose multi-task training of the LLM to predict +both the entity class and the next token. To improve the efficiency for +contextual biasing and to avoid exceeding LLMs' maximum sequence lengths, we +propose dynamic prompting, where we select the most likely class using the +class tag prediction, and only use entities in this class as contexts for next +token prediction. Word Error Rate (WER) evaluation is performed on i) an +internal calling, messaging, and dictation dataset, and ii) the SLUE-Voxpopuli +dataset. Results indicate that biasing lists and few-shot examples can achieve +17.8% and 9.6% relative improvement compared to first pass ASR, and that +multi-task training and dynamic prompting can achieve 20.0% and 11.3% relative +WER improvement, respectively. + +
+
+ comment: 5 pages, 4 figures. Conference: ICASSP 2024 +
+
+
+
+
+ + ☆ Taken out of context: On measuring situational awareness in LLMs + + +
+ We aim to better understand the emergence of `situational awareness' in large +language models (LLMs). A model is situationally aware if it's aware that it's +a model and can recognize whether it's currently in testing or deployment. +Today's LLMs are tested for safety and alignment before they are deployed. An +LLM could exploit situational awareness to achieve a high score on safety +tests, while taking harmful actions after deployment. Situational awareness may +emerge unexpectedly as a byproduct of model scaling. One way to better foresee +this emergence is to run scaling experiments on abilities necessary for +situational awareness. As such an ability, we propose `out-of-context +reasoning' (in contrast to in-context learning). We study out-of-context +reasoning experimentally. First, we finetune an LLM on a description of a test +while providing no examples or demonstrations. At test time, we assess whether +the model can pass the test. To our surprise, we find that LLMs succeed on this +out-of-context reasoning task. Their success is sensitive to the training setup +and only works when we apply data augmentation. For both GPT-3 and LLaMA-1, +performance improves with model size. These findings offer a foundation for +further empirical study, towards predicting and potentially controlling the +emergence of situational awareness in LLMs. Code is available at: +https://github.com/AsaCooperStickland/situational-awareness-evals. + +
+
+
+
+
+ + ♻ ☆ LEVER: Learning to Verify Language-to-Code Generation with Execution ICML'23 + + +
+ The advent of large language models trained on code (code LLMs) has led to
+significant progress in language-to-code generation. State-of-the-art
+approaches in this area combine LLM decoding with sample pruning and reranking
+using test cases or heuristics based on the execution results. However, it is
+challenging to obtain test cases for many real-world language-to-code
+applications, and heuristics cannot adequately capture the semantic features of
+the execution results, such as data type and value range, which often indicate
+the correctness of the program. In this work, we propose LEVER, a simple
+approach to improve language-to-code generation by learning to verify the
+generated programs with their execution results. Specifically, we train
+verifiers to determine whether a program sampled from the LLMs is correct or
+not based on the natural language input, the program itself and its execution
+results. The sampled programs are reranked by combining the verification score
+with the LLM generation probability, and marginalizing over programs with the
+same execution results. On four datasets across the domains of table QA, math
+QA and basic Python programming, LEVER consistently improves over the base code
+LLMs (4.6% to 10.9% with code-davinci-002) and achieves new state-of-the-art
+results on all of them.
+
+&#13;
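+ The reranking step described above (combine the verifier score with the
+generation probability, then marginalize over programs sharing an execution
+result) can be sketched as follows; the sample schema is an assumption made
+for illustration.
+```python
+import math
+from collections import defaultdict
+
+def rerank(samples):
+    """samples: list of dicts with keys 'program', 'logprob', 'verifier_prob',
+    'exec_result' (assumed schema for this sketch)."""
+    score_by_result = defaultdict(float)
+    for s in samples:
+        joint = math.exp(s["logprob"]) * s["verifier_prob"]
+        score_by_result[s["exec_result"]] += joint  # marginalize over programs
+    best_result = max(score_by_result, key=score_by_result.get)
+    # return any program whose execution matches the top-scoring result
+    return next(s["program"] for s in samples if s["exec_result"] == best_result)
+```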
+
+ comment: ICML'23; code available at https://github.com/niansong1996/lever +
+
+
+
+
+ + ♻ ☆ Activation Addition: Steering Language Models Without Optimization + + +
+ Reliably controlling the behavior of large language models is a pressing open +problem. Existing methods include supervised finetuning, reinforcement learning +from human feedback, prompt engineering, and guided decoding. We instead +investigate activation engineering: modifying activations at inference time to +predictably alter model behavior. In particular, we bias the forward pass with +an added 'steering vector' implicitly specified through natural language. + Unlike past work which learned these steering vectors, our Activation +Addition (ActAdd) method computes them by taking the activation differences +that result from pairs of prompts. We demonstrate ActAdd on GPT-2 on +OpenWebText and ConceptNet. Our inference-time approach yields control over +high-level properties of output and preserves off-target model performance. It +involves far less compute and implementation effort than finetuning, allows +users to provide natural language specifications, and its overhead scales +naturally with model size. + +
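+ A conceptual sketch of the activation-difference idea is given below;
+`get_activation` is a hypothetical helper (e.g., a forward hook on a chosen
+transformer block), and the coefficient is illustrative.
+```python
+import numpy as np
+
+def get_activation(prompt: str, layer: int) -> np.ndarray:
+    """Hypothetical helper: residual-stream activation at `layer` for `prompt`."""
+    raise NotImplementedError
+
+def steering_vector(pos: str, neg: str, layer: int, coeff: float = 5.0) -> np.ndarray:
+    # difference of activations for a contrastive prompt pair, scaled by coeff
+    return coeff * (get_activation(pos, layer) - get_activation(neg, layer))
+
+# At generation time, the vector is added back into the same layer, e.g.:
+#   h[layer] += steering_vector("Love", "Hate", layer)
+```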
+
+
+
+
+ + ♻ ☆ C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue + Evaluation ACL2023 + + +
+ Existing reference-free turn-level evaluation metrics for chatbots +inadequately capture the interaction between the user and the system. +Consequently, they often correlate poorly with human evaluations. To address +this issue, we propose a novel model-agnostic approach that leverages +Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level +interaction between the system and the user based on a given evaluation +dimension. Experimental results on the widely used FED dialogue evaluation +dataset demonstrate that our approach significantly improves the correlation +with human judgment compared with existing evaluation systems. By replacing the +negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve +a relative 62.6% higher Spearman correlation on average for the FED evaluation +metric. Our code is publicly available at https://github.com/renll/C-PMI. + +
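+ For concreteness, the generic form of conditional pointwise mutual information
+is shown below; the paper's turn-level instantiation (what plays the roles of
+x, y, and the conditioning context) may differ from this textbook definition.
+```python
+import math
+
+def conditional_pmi(p_xy_given_c: float, p_x_given_c: float, p_y_given_c: float) -> float:
+    """PMI(x; y | c) = log p(x, y | c) - log p(x | c) - log p(y | c)."""
+    return math.log(p_xy_given_c) - math.log(p_x_given_c) - math.log(p_y_given_c)
+
+# Positive when x and y co-occur more often than chance given the context c.
+print(conditional_pmi(0.2, 0.4, 0.3))
+```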
+
+ comment: Published at ACL2023 DialDoc Workshop; Updated Results +
+
+
+
+
+ + ♻ ☆ Lingua Manga: A Generic Large Language Model Centric System for Data + Curation VLDB 2023 + + +
+ Data curation is a wide-ranging area which contains many critical but +time-consuming data processing tasks. However, the diversity of such tasks +makes it challenging to develop a general-purpose data curation system. To +address this issue, we present Lingua Manga, a user-friendly and versatile +system that utilizes pre-trained large language models. Lingua Manga offers +automatic optimization for achieving high performance and label efficiency +while facilitating flexible and rapid development. Through three example +applications with distinct objectives and users of varying levels of technical +proficiency, we demonstrate that Lingua Manga can effectively assist both +skilled programmers and low-code or even no-code users in addressing data +curation challenges. + +
+
+ comment: 4 pages, 6 figures, VLDB 2023 Demo paper +
+
+
+
+
+ + ♻ ☆ Learning to Prompt in the Classroom to Understand AI Limits: A pilot + study + + +
+ Artificial intelligence's (AI) progress holds great promise in tackling +pressing societal concerns such as health and climate. Large Language Models +(LLM) and the derived chatbots, like ChatGPT, have highly improved the natural +language processing capabilities of AI systems allowing them to process an +unprecedented amount of unstructured data. However, the ensuing excitement has +led to negative sentiments, even as AI methods demonstrate remarkable +contributions (e.g. in health and genetics). A key factor contributing to this +sentiment is the misleading perception that LLMs can effortlessly provide +solutions across domains, ignoring their limitations such as hallucinations and +reasoning constraints. Acknowledging AI fallibility is crucial to address the +impact of dogmatic overconfidence in possibly erroneous suggestions generated +by LLMs. At the same time, it can reduce fear and other negative attitudes +toward AI. This necessitates comprehensive AI literacy interventions that +educate the public about LLM constraints and effective usage techniques, i.e +prompting strategies. With this aim, a pilot educational intervention was +performed in a high school with 21 students. It involved presenting high-level +concepts about intelligence, AI, and LLMs, followed by practical exercises +involving ChatGPT in creating natural educational conversations and applying +established prompting strategies. Encouraging preliminary results emerged, +including high appreciation of the activity, improved interaction quality with +the LLM, reduced negative AI sentiments, and a better grasp of limitations, +specifically unreliability, limited understanding of commands leading to +unsatisfactory responses, and limited presentation flexibility. Our aim is to +explore AI acceptance factors and refine this approach for more controlled +future studies. + +
+
+ comment: Accepted for AIXIA 2023 22nd International Conference of the Italian + Association for Artificial Intelligence 6 - 9 Nov, 2023, Rome, Italy +
+
+
+
+
+ + ♻ ☆ Ontology Enrichment from Texts: A Biomedical Dataset for Concept + Discovery and Placement CIKM 2023 + + +
+ Mentions of new concepts appear regularly in texts and require automated +approaches to harvest and place them into Knowledge Bases (KB), e.g., +ontologies and taxonomies. Existing datasets suffer from three issues, (i) +mostly assuming that a new concept is pre-discovered and cannot support +out-of-KB mention discovery; (ii) only using the concept label as the input +along with the KB and thus lacking the contexts of a concept label; and (iii) +mostly focusing on concept placement w.r.t a taxonomy of atomic concepts, +instead of complex concepts, i.e., with logical operators. To address these +issues, we propose a new benchmark, adapting MedMentions dataset (PubMed +abstracts) with SNOMED CT versions in 2014 and 2017 under the Diseases +sub-category and the broader categories of Clinical finding, Procedure, and +Pharmaceutical / biologic product. We provide usage on the evaluation with the +dataset for out-of-KB mention discovery and concept placement, adapting recent +Large Language Model based methods. + +
+
+ comment: 5 pages, 1 figure, accepted for CIKM 2023. The dataset, data + construction scripts, and baseline implementation are available at + https://zenodo.org/record/8228005 (Zenodo) and + https://github.com/KRR-Oxford/OET (GitHub) +
+
+
+
+
+ + ♻ ☆ Constructing Holistic Measures for Social Biases in Masked Language + Models + + +
+ Masked Language Models (MLMs) have been successful in many natural language
+processing tasks. However, real-world stereotype biases are likely to be
+reflected in MLMs due to their learning from large text corpora. Most of the
+evaluation metrics proposed in the past adopt different masking strategies,
+designed with the log-likelihood of MLMs. They lack holistic considerations
+such as the variance of stereotype-bias and anti-stereotype-bias samples. In
+this paper, the log-likelihoods of stereotype-bias and anti-stereotype-bias
+samples output by MLMs are modeled as Gaussian distributions. Two evaluation
+metrics, the Kullback-Leibler Divergence Score (KLDivS) and the Jensen-Shannon
+Divergence Score (JSDivS), are proposed to evaluate social biases in MLMs. The
+experimental results on the public datasets StereoSet and CrowS-Pairs
+demonstrate that KLDivS and JSDivS are more stable and interpretable compared
+to the metrics proposed in the past.
+
+&#13;
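+ The divergences underlying the two scores can be made concrete as below: the
+KL divergence between two Gaussians has a closed form, while JS is typically
+approximated; the exact scoring details of KLDivS/JSDivS follow the paper, not
+this sketch.
+```python
+import math
+import random
+
+def kl_gauss(mu1, s1, mu2, s2):
+    """Closed-form KL( N(mu1, s1^2) || N(mu2, s2^2) )."""
+    return math.log(s2 / s1) + (s1 ** 2 + (mu1 - mu2) ** 2) / (2 * s2 ** 2) - 0.5
+
+def _pdf(x, mu, s):
+    return math.exp(-((x - mu) ** 2) / (2 * s * s)) / (s * math.sqrt(2 * math.pi))
+
+def js_gauss_mc(mu1, s1, mu2, s2, n=100000, seed=0):
+    """Monte-Carlo Jensen-Shannon divergence (no closed form for Gaussians)."""
+    random.seed(seed)
+    kl_pm = kl_qm = 0.0
+    for _ in range(n):
+        xp, xq = random.gauss(mu1, s1), random.gauss(mu2, s2)
+        mp = 0.5 * (_pdf(xp, mu1, s1) + _pdf(xp, mu2, s2))
+        mq = 0.5 * (_pdf(xq, mu1, s1) + _pdf(xq, mu2, s2))
+        kl_pm += math.log(_pdf(xp, mu1, s1) / mp)
+        kl_qm += math.log(_pdf(xq, mu2, s2) / mq)
+    return 0.5 * (kl_pm + kl_qm) / n
+
+# e.g., Gaussians fitted to stereotype vs. anti-stereotype log-likelihoods
+print(kl_gauss(0.0, 1.0, 0.5, 1.2), js_gauss_mc(0.0, 1.0, 0.5, 1.2))
+```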
+
+ comment: We need to change the methodology in the paper appropriately cause us + to change the title of the paper, so we need to withdraw it and subsequently + resubmit the new version +
+
+
+
+
+ + ♻ ☆ CLIPAG: Towards Generator-Free Text-to-Image Generation + + +
+ Perceptually Aligned Gradients (PAG) refer to an intriguing property observed +in robust image classification models, wherein their input gradients align with +human perception and pose semantic meanings. While this phenomenon has gained +significant research attention, it was solely studied in the context of +unimodal vision-only architectures. In this work, we extend the study of PAG to +Vision-Language architectures, which form the foundations for diverse +image-text tasks and applications. Through an adversarial robustification +finetuning of CLIP, we demonstrate that robust Vision-Language models exhibit +PAG in contrast to their vanilla counterparts. This work reveals the merits of +CLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we +show that seamlessly integrating CLIPAG in a "plug-n-play" manner leads to +substantial improvements in vision-language generative applications. +Furthermore, leveraging its PAG property, CLIPAG enables text-to-image +generation without any generative model, which typically requires huge +generators. + +
+
+
+
+
+ + ♻ ☆ Minimally-Supervised Speech Synthesis with Conditional Diffusion Model + and Language Model: A Comparative Study of Semantic Coding + + +
+ Recently, there has been a growing interest in text-to-speech (TTS) methods +that can be trained with minimal supervision by combining two types of discrete +speech representations and using two sequence-to-sequence tasks to decouple +TTS. However, existing methods suffer from three problems: the high +dimensionality and waveform distortion of discrete speech representations, the +prosodic averaging problem caused by the duration prediction model in +non-autoregressive frameworks, and the information redundancy and dimension +explosion problems of existing semantic encoding methods. To address these +problems, three progressive methods are proposed. First, we propose +Diff-LM-Speech, an autoregressive structure consisting of a language model and +diffusion models, which models the semantic embedding into the mel-spectrogram +based on a diffusion model to achieve higher audio quality. We also introduce a +prompt encoder structure based on a variational autoencoder and a prosody +bottleneck to improve prompt representation ability. Second, we propose +Tetra-Diff-Speech, a non-autoregressive structure consisting of four diffusion +model-based modules that design a duration diffusion model to achieve diverse +prosodic expressions. Finally, we propose Tri-Diff-Speech, a non-autoregressive +structure consisting of three diffusion model-based modules that verify the +non-necessity of existing semantic encoding models and achieve the best +results. Experimental results show that our proposed methods outperform +baseline methods. We provide a website with audio samples. + +
+
+
+
+
+ + ♻ ☆ Speculative Decoding with Big Little Decoder + + +
+ The recent emergence of Large Language Models based on the Transformer
+architecture has enabled dramatic advancements in the field of Natural Language
+Processing. However, these models have long inference latency, which limits
+their deployment and makes them prohibitively expensive for various real-time
+applications. The inference latency is further exacerbated by autoregressive
+generative tasks, as models need to run iteratively to generate tokens
+sequentially without leveraging token-level parallelization. To address this,
+we propose Big Little Decoder (BiLD), a framework that can improve inference
+efficiency and latency for a wide range of text generation applications. The
+BiLD framework contains two models with different sizes that collaboratively
+generate text. The small model runs autoregressively to generate text with a
+low inference cost, and the large model is only invoked occasionally to refine
+the small model's inaccurate predictions in a non-autoregressive manner. To
+coordinate the small and large models, BiLD introduces two simple yet effective
+policies: (1) the fallback policy that determines when to hand control over to
+the large model; and (2) the rollback policy that determines when the large
+model needs to correct the small model's inaccurate predictions. To evaluate
+our framework across different tasks and models, we apply BiLD to various text
+generation scenarios encompassing machine translation on IWSLT 2017 De-En and
+WMT 2014 De-En, and summarization on XSUM and CNN/DailyMail. On an NVIDIA T4
+GPU, our framework achieves a speedup of up to 2.12x with minimal generation
+quality degradation. Furthermore, our framework is fully plug-and-play and can
+be applied without any modifications in the training process or model
+architecture. Our code is open-sourced.
+
+&#13;
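+ A much-simplified, serial rendering of the fallback and rollback policies is
+sketched below (the actual framework refines a drafted span non-autoregressively
+and can rewind several tokens); the helper functions and thresholds are
+hypothetical.
+```python
+def small_step(tokens):
+    """Hypothetical: (next_token, probability) from the small model."""
+    raise NotImplementedError
+
+def large_step(tokens):
+    """Hypothetical: (next_token, probability) from the large model."""
+    raise NotImplementedError
+
+def large_prob_of(tokens, token):
+    """Hypothetical: probability the large model assigns to `token`."""
+    raise NotImplementedError
+
+def generate(prompt_tokens, max_new=64, fallback_th=0.5, rollback_th=0.1):
+    tokens = list(prompt_tokens)
+    for _ in range(max_new):
+        tok, p_small = small_step(tokens)              # cheap draft
+        if p_small < fallback_th and large_prob_of(tokens, tok) < rollback_th:
+            tok, _ = large_step(tokens)                # large model corrects the draft
+        tokens.append(tok)
+    return tokens
+```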
+
+
+
+
+ + ♻ ☆ Domain-Agnostic Molecular Generation with Self-feedback + + +
+ The generation of molecules with desired properties has gained tremendous +popularity, revolutionizing the way scientists design molecular structures and +providing valuable support for chemical and drug design. However, despite the +potential of language models in molecule generation, they face numerous +challenges such as the generation of syntactically or chemically flawed +molecules, narrow domain focus, and limitations in creating diverse and +directionally feasible molecules due to a dearth of annotated data or external +molecular databases. To this end, we introduce MolGen, a pre-trained molecular +language model tailored specifically for molecule generation. MolGen acquires +intrinsic structural and grammatical insights by reconstructing over 100 +million molecular SELFIES, while facilitating knowledge transfer between +different domains through domain-agnostic molecular prefix tuning. Moreover, we +present a self-feedback paradigm that inspires the pre-trained model to align +with the ultimate goal of producing molecules with desirable properties. +Extensive experiments on well-known benchmarks confirm MolGen's optimization +capabilities, encompassing penalized logP, QED, and molecular docking +properties. Further analysis shows that MolGen can accurately capture molecule +distributions, implicitly learn their structural characteristics, and +efficiently explore chemical space. The pre-trained model, codes, and datasets +are publicly available for future research at https://github.com/zjunlp/MolGen. + +
+
+ comment: Work in progress. Add results of binding affinity +
+
+
+
+
+ + ♻ ☆ ComCLIP: Training-Free Compositional Image and Text Matching + + +
+ Contrastive Language-Image Pretraining (CLIP) has demonstrated great
+zero-shot performance for matching images and text. However, it is still
+challenging to adapt vision-language pretrained models like CLIP to
+compositional image and text matching -- a more challenging image and text
+matching task requiring the model's understanding of compositional word
+concepts and visual components. Towards better compositional generalization in
+zero-shot image and text matching, in this paper, we study the problem from a
+causal perspective: the erroneous semantics of individual entities are
+essentially confounders that cause the matching failure. Therefore, we propose
+a novel training-free compositional CLIP model (ComCLIP). ComCLIP disentangles
+input images into subjects, objects, and action sub-images and composes CLIP's
+vision encoder and text encoder to perform evolving matching over compositional
+text embedding and sub-image embeddings. In this way, ComCLIP can mitigate
+spurious correlations introduced by the pretrained CLIP models and dynamically
+evaluate the importance of each component. Experiments on four compositional
+image-text matching datasets: SVO, ComVG, Winoground, and VL-checklist, and two
+general image-text retrieval datasets: Flickr30K and MSCOCO demonstrate the
+effectiveness of our plug-and-play method, which boosts the zero-shot inference
+ability of CLIP, SLIP, and BLIP2 even without further training or fine-tuning.
+
+&#13;
+
+
+
+
+ + ♻ ☆ Psy-LLM: Scaling up Global Mental Health Psychological Services with + AI-based Large Language Models + + +
+ The demand for psychological counselling has grown significantly in recent
+years, particularly with the global outbreak of COVID-19, which has heightened
+the need for timely and professional mental health support. Online
+psychological counselling has emerged as the predominant mode of providing
+services in response to this demand. In this study, we propose the Psy-LLM
+framework, an AI-based assistive tool leveraging Large Language Models (LLMs)
+for question-answering in psychological consultation settings to ease the
+demand on mental health professionals. Our framework combines pre-trained LLMs
+with real-world professional Q&A from psychologists and extensively crawled
+psychological articles. The Psy-LLM framework serves as a front-end tool for
+healthcare professionals, allowing them to provide immediate responses and
+mindfulness activities to alleviate patient stress. Additionally, it functions
+as a screening tool to identify urgent cases requiring further assistance. We
+evaluated the framework using intrinsic metrics, such as perplexity, and
+extrinsic evaluation metrics, with human participant assessments of response
+helpfulness, fluency, relevance, and logic. The results demonstrate the
+effectiveness of the Psy-LLM framework in generating coherent and relevant
+answers to psychological questions. This article discusses the potential and
+limitations of using large language models to enhance mental health support
+through AI technologies.
+
+&#13;
+
+
+
+
+ + ♻ ☆ Breaking Language Barriers: A Question Answering Dataset for Hindi and + Marathi + + +
+ The recent advances in deep-learning have led to the development of highly +sophisticated systems with an unquenchable appetite for data. On the other +hand, building good deep-learning models for low-resource languages remains a +challenging task. This paper focuses on developing a Question Answering dataset +for two such languages- Hindi and Marathi. Despite Hindi being the 3rd most +spoken language worldwide, with 345 million speakers, and Marathi being the +11th most spoken language globally, with 83.2 million speakers, both languages +face limited resources for building efficient Question Answering systems. To +tackle the challenge of data scarcity, we have developed a novel approach for +translating the SQuAD 2.0 dataset into Hindi and Marathi. We release the +largest Question-Answering dataset available for these languages, with each +dataset containing 28,000 samples. We evaluate the dataset on various +architectures and release the best-performing models for both Hindi and +Marathi, which will facilitate further research in these languages. Leveraging +similarity tools, our method holds the potential to create datasets in diverse +languages, thereby enhancing the understanding of natural language across +varied linguistic contexts. Our fine-tuned models, code, and dataset will be +made publicly available. + +
+
+
+
+
+ + ♻ ☆ A Zipf's Law-based Text Generation Approach for Addressing Imbalance in + Entity Extraction + + +
+ Entity extraction is critical in the intelligent advancement across diverse +domains. Nevertheless, a challenge to its effectiveness arises from the data +imbalance. This paper proposes a novel approach by viewing the issue through +the quantitative information, recognizing that entities exhibit certain levels +of commonality while others are scarce, which can be reflected in the +quantifiable distribution of words. The Zipf's Law emerges as a well-suited +adoption, and to transition from words to entities, words within the documents +are classified as common and rare ones. Subsequently, sentences are classified +into common and rare ones, and are further processed by text generation models +accordingly. Rare entities within the generated sentences are then labeled +using human-designed rules, serving as a supplement to the raw dataset, thereby +mitigating the imbalance problem. The study presents a case of extracting +entities from technical documents, and experimental results from two datasets +prove the effectiveness of the proposed method. Furthermore, the significance +of Zipf's law in driving the progress of AI is discussed, broadening the reach +and coverage of Informetrics. This paper presents a successful demonstration of +extending Informetrics to interface with AI through Zipf's Law. + +
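+ The frequency-driven common/rare split that starts the pipeline above can be
+illustrated with a toy corpus as follows; the threshold and data are made up.
+```python
+from collections import Counter
+
+corpus = ["the pump seal failed", "the pump started", "cavitation damaged the impeller"]
+freq = Counter(w for sent in corpus for w in sent.split())
+
+# Zipf-style split: the top-ranked words are "common", the long tail is "rare".
+ranked = [w for w, _ in freq.most_common()]
+common = set(ranked[: len(ranked) // 3])
+
+def is_rare_sentence(sentence: str) -> bool:
+    """A sentence is 'rare' if it contains any word outside the common set."""
+    return any(w not in common for w in sentence.split())
+
+# Rare sentences would be routed to the text generation / relabeling step.
+print([s for s in corpus if is_rare_sentence(s)])
+```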
+
+ comment: Journal of Informetrics +
+
+
+
+
+ + ♻ ☆ A New Multifractal-based Deep Learning Model for Text Mining + + +
+ In this world full of uncertainty, where the fabric of existence weaves
+patterns of complexity, multifractals emerge as beacons of insight, illuminating
+these patterns. As we delve into the realm of text mining that underpins various
+natural language processing applications and powers a range of intelligent
+services, we recognize that behind the veil of text lies a manifestation of
+human thought and cognition, intricately intertwined with the complexities.
+Building upon the foundation of perceiving text as a complex system, this study
+embarks on a journey to unravel the hidden treasures within, armed with the
+proposed multifractal method that deciphers the multifractal attributes
+embedded within the text landscape. This endeavor culminates in the birth of
+our novel model, which also harnesses the power of the proposed activation
+function to facilitate nonlinear information transmission within its neural
+network architecture. The success of experiments anchored in real-world
+technical reports, covering the extraction of technical terms and the
+classification of hazard events, stands as a testament to our endeavors. This
+research venture not only expands our understanding of text mining but also
+opens new horizons for knowledge discovery across various domains.
+
+&#13;
+
+
+
+
+ + ♻ ☆ Analyzing Dataset Annotation Quality Management in the Wild + + +
+ Data quality is crucial for training accurate, unbiased, and trustworthy +machine learning models and their correct evaluation. Recent works, however, +have shown that even popular datasets used to train and evaluate +state-of-the-art models contain a non-negligible amount of erroneous +annotations, bias or annotation artifacts. There exist best practices and +guidelines regarding annotation projects. But to the best of our knowledge, no +large-scale analysis has been performed as of yet on how quality management is +actually conducted when creating natural language datasets and whether these +recommendations are followed. Therefore, we first survey and summarize +recommended quality management practices for dataset creation as described in +the literature and provide suggestions on how to apply them. Then, we compile a +corpus of 591 scientific publications introducing text datasets and annotate it +for quality-related aspects, such as annotator management, agreement, +adjudication or data validation. Using these annotations, we then analyze how +quality management is conducted in practice. We find that a majority of the +annotated publications apply good or very good quality management. However, we +deem the effort of 30% of the works as only subpar. Our analysis also shows +common errors, especially with using inter-annotator agreement and computing +annotation error rates. + +
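+ As an example of the agreement statistics surveyed above, Cohen's kappa
+between two annotators can be computed as follows (toy labels, scikit-learn).
+```python
+from sklearn.metrics import cohen_kappa_score
+
+annotator_a = ["pos", "neg", "neg", "pos", "neu", "pos"]
+annotator_b = ["pos", "neg", "pos", "pos", "neu", "neg"]
+
+kappa = cohen_kappa_score(annotator_a, annotator_b)
+print(f"Cohen's kappa = {kappa:.2f}")  # chance-corrected agreement in [-1, 1]
+```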
+
+
+
+
+ + ♻ ☆ Reveal the Unknown: Out-of-Knowledge-Base Mention Discovery with Entity + Linking CIKM 2023 + + +
+ Discovering entity mentions that are out of a Knowledge Base (KB) from texts +plays a critical role in KB maintenance, but has not yet been fully explored. +The current methods are mostly limited to the simple threshold-based approach +and feature-based classification, and the datasets for evaluation are +relatively rare. We propose BLINKout, a new BERT-based Entity Linking (EL) +method which can identify mentions that do not have corresponding KB entities +by matching them to a special NIL entity. To better utilize BERT, we propose +new techniques including NIL entity representation and classification, with +synonym enhancement. We also apply KB Pruning and Versioning strategies to +automatically construct out-of-KB datasets from common in-KB EL datasets. +Results on five datasets of clinical notes, biomedical publications, and +Wikipedia articles in various domains show the advantages of BLINKout over +existing methods to identify out-of-KB mentions for the medical ontologies, +UMLS, SNOMED CT, and the general KB, WikiData. + +
+
+ comment: 11 pages, 3 figures, accepted for CIKM 2023 +
+
+
+
+
+ + ♻ ☆ SCALE: Scaling up the Complexity for Advanced Language Model Evaluation + + +
+ Recent strides in Large Language Models (LLMs) have saturated many NLP
+benchmarks (even professional domain-specific ones), emphasizing the need for
+novel, more challenging benchmarks to properly assess LLM capabilities. In this
+paper, we introduce a novel NLP benchmark that poses challenges to current LLMs
+across four key dimensions: processing long documents (up to 50K tokens),
+utilizing domain-specific knowledge (embodied in legal texts), multilingual
+understanding (covering five languages), and multitasking (comprising legal
+document-to-document Information Retrieval, Court View Generation, Leading
+Decision Summarization, Citation Extraction, and eight challenging Text
+Classification tasks). Our benchmark comprises diverse legal NLP datasets from
+the Swiss legal system, allowing for a comprehensive study of the underlying
+Non-English, inherently multilingual, federal legal system. Despite recent
+advances, efficiently processing long documents for intense review/analysis
+tasks remains an open challenge for language models. Also, comprehensive,
+domain-specific benchmarks requiring high expertise to develop are rare, as are
+multilingual benchmarks. This scarcity underscores our contribution's value,
+considering most public models are trained predominantly on English corpora,
+while other languages remain understudied, particularly for practical
+domain-specific NLP tasks. Our benchmark allows for testing and advancing
+state-of-the-art LLMs. As part of our study, we evaluate several pre-trained
+multilingual language models on our benchmark to establish strong baselines as
+a point of reference. Despite the large size of our datasets (tens to hundreds
+of thousands of examples), existing publicly available models struggle with
+most tasks, even after in-domain pretraining. We publish all resources
+(benchmark suite, pre-trained models, code) under a fully permissive open CC
+BY-SA license.
+
+&#13;
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 87 + +
+
+
+ + ☆ OpenIns3D: Snap and Lookup for 3D Open-vocabulary Instance Segmentation + + +
+ Current 3D open-vocabulary scene understanding methods mostly utilize +well-aligned 2D images as the bridge to learn 3D features with language. +However, applying these approaches becomes challenging in scenarios where 2D +images are absent. In this work, we introduce a completely new pipeline, +namely, OpenIns3D, which requires no 2D image inputs, for 3D open-vocabulary +scene understanding at the instance level. The OpenIns3D framework employs a +"Mask-Snap-Lookup" scheme. The "Mask" module learns class-agnostic mask +proposals in 3D point clouds. The "Snap" module generates synthetic scene-level +images at multiple scales and leverages 2D vision language models to extract +interesting objects. The "Lookup" module searches through the outcomes of +"Snap" with the help of Mask2Pixel maps, which contain the precise +correspondence between 3D masks and synthetic images, to assign category names +to the proposed masks. This 2D input-free, easy-to-train, and flexible approach +achieved state-of-the-art results on a wide range of indoor and outdoor +datasets with a large margin. Furthermore, OpenIns3D allows for effortless +switching of 2D detectors without re-training. When integrated with +state-of-the-art 2D open-world models such as ODISE and GroundingDINO, superb +results are observed on open-vocabulary instance segmentation. When integrated +with LLM-powered 2D models like LISA, it demonstrates a remarkable capacity to +process highly complex text queries, including those that require intricate +reasoning and world knowledge. The code and model will be made publicly +available. + +
+
+
+
+
+ + ☆ Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D + Understanding, Generation, and Instruction Following + + +
+ We introduce Point-Bind, a 3D multi-modality model aligning point clouds with +2D image, language, audio, and video. Guided by ImageBind, we construct a joint +embedding space between 3D and multi-modalities, enabling many promising +applications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D +open-world understanding. On top of this, we further present Point-LLM, the +first 3D large language model (LLM) following 3D multi-modal instructions. By +parameter-efficient fine-tuning techniques, Point-LLM injects the semantics of +Point-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction +data, but exhibits superior 3D and multi-modal question-answering capacity. We +hope our work may cast a light on the community for extending 3D point clouds +to multi-modality applications. Code is available at +https://github.com/ZiyuGuo99/Point-Bind_Point-LLM. + +
+
+ comment: Work in progress. Code is available at + https://github.com/ZiyuGuo99/Point-Bind_Point-LLM +
+
+
+
+
+ + ☆ Iterative Multi-granular Image Editing using Diffusion Models + + +
+ Recent advances in text-guided image synthesis have dramatically changed how
+creative professionals generate artistic and aesthetically pleasing visual
+assets. To fully support such creative endeavors, the process should possess
+the ability to: 1) iteratively edit the generations and 2) control the spatial
+reach of desired changes (global, local or anything in between). We formalize
+this pragmatic problem setting as Iterative Multi-granular Editing. While there
+has been substantial progress with diffusion-based models for image synthesis
+and editing, they are all one-shot (i.e., no iterative editing capabilities)
+and do not naturally yield multi-granular control (i.e., covering the full
+spectrum of local-to-global edits). To overcome these drawbacks, we propose
+EMILIE: Iterative Multi-granular Image Editor. EMILIE introduces a novel latent
+iteration strategy, which re-purposes a pre-trained diffusion model to
+facilitate iterative editing. This is complemented by a gradient control
+operation for multi-granular control. We introduce a new benchmark dataset to
+evaluate our newly proposed setting. We conduct exhaustive quantitative and
+qualitative evaluation against recent state-of-the-art approaches adapted to
+our task, to bring out the mettle of EMILIE. We hope our work will attract
+attention to this newly identified, pragmatic problem setting.
+
+&#13;
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ CityDreamer: Compositional Generative Model of Unbounded 3D Cities + + +
+ In recent years, extensive research has focused on 3D natural scene +generation, but the domain of 3D city generation has not received as much +exploration. This is due to the greater challenges posed by 3D city generation, +mainly because humans are more sensitive to structural distortions in urban +environments. Additionally, generating 3D cities is more complex than 3D +natural scenes since buildings, as objects of the same class, exhibit a wider +range of appearances compared to the relatively consistent appearance of +objects like trees in natural scenes. To address these challenges, we propose +CityDreamer, a compositional generative model designed specifically for +unbounded 3D cities, which separates the generation of building instances from +other background objects, such as roads, green lands, and water areas, into +distinct modules. Furthermore, we construct two datasets, OSM and GoogleEarth, +containing a vast amount of real-world city imagery to enhance the realism of +the generated 3D cities both in their layouts and appearances. Through +extensive experiments, CityDreamer has proven its superiority over +state-of-the-art methods in generating a wide range of lifelike 3D cities. + +
+
+ comment: Project page: https://haozhexie.com/project/city-dreamer +
+
+
+
+
+ + ☆ Time Series Analysis of Urban Liveability + + +
+ In this paper we explore deep learning models to monitor longitudinal +liveability changes in Dutch cities at the neighbourhood level. Our liveability +reference data is defined by a country-wise yearly survey based on a set of +indicators combined into a liveability score, the Leefbaarometer. We pair this +reference data with yearly-available high-resolution aerial images, which +creates yearly timesteps at which liveability can be monitored. We deploy a +convolutional neural network trained on an aerial image from 2016 and the +Leefbaarometer score to predict liveability at new timesteps 2012 and 2020. The +results in a city used for training (Amsterdam) and one never seen during +training (Eindhoven) show some trends which are difficult to interpret, +especially in light of the differences in image acquisitions at the different +time steps. This demonstrates the complexity of liveability monitoring across +time periods and the necessity for more sophisticated methods compensating for +changes unrelated to liveability dynamics. + +
+
+ comment: Accepted at JURSE 2023 +
+
+
+
+
+ + ☆ Discrete Morphological Neural Networks + + +
+ A classical approach to designing binary image operators is Mathematical +Morphology (MM). We propose the Discrete Morphological Neural Networks (DMNN) +for binary image analysis to represent W-operators and estimate them via +machine learning. A DMNN architecture, which is represented by a Morphological +Computational Graph, is designed as in the classical heuristic design of +morphological operators, in which the designer should combine a set of MM +operators and Boolean operations based on prior information and theoretical +knowledge. Then, once the architecture is fixed, instead of adjusting its +parameters (i.e., structural elements or maximal intervals) by hand, we propose +a lattice gradient descent algorithm (LGDA) to train these parameters based on +a sample of input and output images under the usual machine learning approach. +We also propose a stochastic version of the LGDA that is more efficient and +scalable and can obtain small errors in practical problems. The class +represented by a DMNN can be quite general or specialized according to expected +properties of the target operator, i.e., prior information, and the semantics +expressed by algebraic properties of classes of operators is a distinguishing feature +relative to other methods. The main contribution of this paper is the merger of +the two main paradigms for designing morphological operators: classical +heuristic design and automatic design via machine learning, thus reconciling +classical heuristic morphological operator design with machine learning. We +apply the DMNN to recognize the boundary of digits with noise, and we discuss +many topics for future research. + +
+
+
+
+
+ + ☆ Mechanism of feature learning in convolutional neural networks + + +
+ Understanding the mechanism of how convolutional neural networks learn +features from image data is a fundamental problem in machine learning and +computer vision. In this work, we identify such a mechanism. We posit the +Convolutional Neural Feature Ansatz, which states that covariances of filters +in any convolutional layer are proportional to the average gradient outer +product (AGOP) taken with respect to patches of the input to that layer. We +present extensive empirical evidence for our ansatz, including identifying high +correlation between covariances of filters and patch-based AGOPs for +convolutional layers in standard neural architectures, such as AlexNet, VGG, +and ResNets pre-trained on ImageNet. We also provide supporting theoretical +evidence. We then demonstrate the generality of our result by using the +patch-based AGOP to enable deep feature learning in convolutional kernel +machines. We refer to the resulting algorithm as (Deep) ConvRFM and show that +our algorithm recovers similar features to deep convolutional networks +including the notable emergence of edge detectors. Moreover, we find that Deep +ConvRFM overcomes previously identified limitations of convolutional kernels, +such as their inability to adapt to local signals in images and, as a result, +leads to sizable performance improvement over fixed convolutional kernels. + +
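+ As a rough illustration of the stated ansatz (the covariance of a layer's filters is
+approximately proportional to the patch-based AGOP), the following minimal PyTorch
+sketch is hypothetical and not the authors' code: it builds a tiny convolutional
+network, forms the first-layer filter covariance, rewrites the first convolution as a
+matrix product over unfolded patches so per-patch gradients can be taken, and compares
+the two matrices by correlation.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+# Toy setup (illustrative only): compare first-layer filter covariance with the
+# patch-based average gradient outer product (AGOP).
+torch.manual_seed(0)
+model = nn.Sequential(
+    nn.Conv2d(3, 8, 3, padding=1), nn.ReLU(),
+    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 10),
+)
+x = torch.randn(16, 3, 32, 32)
+
+# Covariance of the first-layer filters: W^T W with W of shape (8, 27).
+W = model[0].weight.reshape(8, -1)
+filter_cov = (W.T @ W).detach()
+
+# Patch-based AGOP: treat unfolded 3x3 patches as the variables, rewrite the first
+# convolution as a matmul over patches, and differentiate the network output.
+patches = F.unfold(x, kernel_size=3, padding=1).transpose(1, 2)  # (16, 1024, 27)
+patches = patches.detach().requires_grad_(True)
+feat = (patches @ W.T + model[0].bias).transpose(1, 2).reshape(16, 8, 32, 32)
+out = model[1:](feat).sum()
+g = torch.autograd.grad(out, patches)[0]                         # (16, 1024, 27)
+agop = torch.einsum("bli,blj->ij", g, g) / (16 * 1024)
+
+# The ansatz predicts proportionality after training; here we simply report the
+# correlation of the two 27x27 matrices at random initialization.
+corr = torch.corrcoef(torch.stack([filter_cov.flatten(), agop.flatten()]))[0, 1]
+print(f"correlation(filter covariance, patch AGOP) = {corr:.3f}")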
+
+
+
+
+ + ☆ Amyloid-Beta Axial Plane PET Synthesis from Structural MRI: An Image + Translation Approach for Screening Alzheimer's Disease + + +
+ In this work, an image translation model is implemented to produce synthetic +amyloid-beta PET images from structural MRI that are quantitatively accurate. +Image pairs of amyloid-beta PET and structural MRI were used to train the +model. We found that the synthetic PET images could be produced with a high +degree of similarity to truth in terms of shape and contrast, with overall high SSIM +and PSNR. This work demonstrates that performing structural to quantitative +image translation is feasible and enables access to amyloid-beta information +from MRI alone. + +
+
+ comment: Abstract submitted and presented to the International Society of + Magnetic Resonance in Medicine (ISMRM 2023), Toronto, Canada +
+
+
+
+
+ + ☆ Impact of Image Context for Single Deep Learning Face Morphing Attack + Detection + + +
+ The increase in security concerns due to technological advancements has led +to the popularity of biometric approaches that utilize physiological or +behavioral characteristics for enhanced recognition. Face recognition systems +(FRSs) have become prevalent, but they are still vulnerable to image +manipulation techniques such as face morphing attacks. This study investigates +the impact of the alignment settings of input images on deep learning face +morphing detection performance. We analyze the interconnections between the +face contour and image context and suggest optimal alignment conditions for +face morphing detection. + +
+
+ comment: 6 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Trust your Good Friends: Source-free Domain Adaptation by Reciprocal + Neighborhood Clustering + + +
+ Domain adaptation (DA) aims to alleviate the domain shift between source +domain and target domain. Most DA methods require access to the source data, +but often that is not possible (e.g. due to data privacy or intellectual +property). In this paper, we address the challenging source-free domain +adaptation (SFDA) problem, where the source pretrained model is adapted to the +target domain in the absence of source data. Our method is based on the +observation that target data, which might not align with the source domain +classifier, still forms clear clusters. We capture this intrinsic structure by +defining local affinity of the target data, and encourage label consistency +among data with high local affinity. We observe that higher affinity should be +assigned to reciprocal neighbors. To aggregate information with more context, +we consider expanded neighborhoods with small affinity values. Furthermore, we +consider the density around each target sample, which can alleviate the +negative impact of potential outliers. In the experimental results we verify +that the inherent structure of the target features is an important source of +information for domain adaptation. We demonstrate that this local structure can +be efficiently captured by considering the local neighbors, the reciprocal +neighbors, and the expanded neighborhood. Finally, we achieve state-of-the-art +performance on several 2D image and 3D point cloud recognition datasets. + +
+
+ comment: Accepted by IEEE TPAMI, extended version of conference paper + arXiv:2110.04202 +
+
+
+
+
+ + ☆ SQLdepth: Generalizable Self-Supervised Fine-Structured Monocular Depth + Estimation + + +
+ Recently, self-supervised monocular depth estimation has gained popularity +with numerous applications in autonomous driving and robotics. However, +existing solutions primarily seek to estimate depth from immediate visual +features, and struggle to recover fine-grained scene details with limited +generalization. In this paper, we introduce SQLdepth, a novel approach that can +effectively learn fine-grained scene structures from motion. In SQLdepth, we +propose a novel Self Query Layer (SQL) to build a self-cost volume and infer +depth from it, rather than inferring depth from feature maps. The self-cost +volume implicitly captures the intrinsic geometry of the scene within a single +frame. Each individual slice of the volume signifies the relative distances +between points and objects within a latent space. Ultimately, this volume is +compressed to the depth map via a novel decoding approach. Experimental results +on KITTI and Cityscapes show that our method attains remarkable +state-of-the-art performance (AbsRel = $0.082$ on KITTI, $0.052$ on KITTI with +improved ground-truth and $0.106$ on Cityscapes), achieving $9.9\%$, $5.5\%$ and +$4.5\%$ error reductions over the previous best. In addition, our approach +showcases reduced training complexity, computational efficiency, improved +generalization, and the ability to recover fine-grained scene details. +Moreover, the self-supervised pre-trained and metric fine-tuned SQLdepth can +surpass existing supervised methods by significant margins (AbsRel = $0.043$, +$14\%$ error reduction). The self-matching-oriented relative distance querying in +SQL improves the robustness and zero-shot generalization capability of +SQLdepth. Code and the pre-trained weights are publicly available at +\href{https://github.com/hisfog/SQLdepth-Impl}{https://github.com/hisfog/SQLdepth-Impl}. + +
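+ The self-cost volume described above can be pictured with a short, hypothetical
+PyTorch sketch (assumed details, not the released SQLdepth code): a set of learned
+latent queries is compared against every pixel feature, each slice of the resulting
+volume holds the similarities for one query, and a small head compresses the volume
+into a depth map.
+import torch
+import torch.nn as nn
+
+class SelfQueryLayerSketch(nn.Module):
+    """Toy self-cost volume: Q learned queries compared against per-pixel features."""
+    def __init__(self, channels: int = 64, num_queries: int = 32):
+        super().__init__()
+        self.queries = nn.Parameter(torch.randn(num_queries, channels))
+        self.decode = nn.Sequential(            # compress the volume to one depth channel
+            nn.Conv2d(num_queries, 32, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(32, 1, 1), nn.Softplus(), # positive depth values
+        )
+
+    def forward(self, feat: torch.Tensor) -> torch.Tensor:
+        b, c, h, w = feat.shape
+        flat = feat.flatten(2)                                   # (B, C, H*W)
+        # Each slice of the volume holds one query's similarity to all pixels.
+        volume = torch.einsum("qc,bcn->bqn", self.queries, flat).reshape(b, -1, h, w)
+        return self.decode(volume.softmax(dim=1))                # (B, 1, H, W)
+
+depth = SelfQueryLayerSketch()(torch.randn(2, 64, 48, 160))      # encoder features
+print(depth.shape)                                               # torch.Size([2, 1, 48, 160])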
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ A Machine Vision Method for Correction of Eccentric Error: Based on + Adaptive Enhancement Algorithm + + +
+ In the procedure of surface defect detection for large-aperture aspherical +optical elements, it is of vital significance to adjust the optical axis of the +element to be coaxial with the mechanical spin axis accurately. Therefore, a +machine vision method for eccentric error correction is proposed in this paper. +Focusing on the severe defocus blur of the reference crosshair image caused by the +imaging characteristic of the aspherical optical element, which may lead to the +failure of correction, an Adaptive Enhancement Algorithm (AEA) is proposed to +strengthen the crosshair image. AEA consists of the existing Guided Filter Dark +Channel Dehazing Algorithm (GFA) and the proposed lightweight Multi-scale Densely +Connected Network (MDC-Net). The enhancement effect of GFA is excellent but +time-consuming, while the enhancement effect of MDC-Net is slightly inferior but +achieved in real time. As AEA will be executed dozens of times during each +correction procedure, its real-time performance is very important. Therefore, +by setting the empirical threshold of the definition evaluation function SMD2, GFA +and MDC-Net are respectively applied to highly and slightly blurred crosshair +images so as to ensure the enhancement effect while saving as much time as +possible. AEA shows robust runtime performance, taking +an average of 0.2721 s for GFA and 0.0963 s for MDC-Net on +ten 200x200-pixel Region of Interest (ROI) images with different degrees +of blur. The eccentricity error can be reduced to within 10 um by our +method. + +
+
+
+
+
+ + ☆ Multi-stage Deep Learning Artifact Reduction for Computed Tomography + + +
+ In Computed Tomography (CT), an image of the interior structure of an object +is computed from a set of acquired projection images. The quality of these +reconstructed images is essential for accurate analysis, but this quality can +be degraded by a variety of imaging artifacts. To improve reconstruction +quality, the acquired projection images are often processed by a pipeline +consisting of multiple artifact-removal steps applied in various image domains +(e.g., outlier removal on projection images and denoising of reconstruction +images). These artifact-removal methods exploit the fact that certain artifacts +are easier to remove in a certain domain compared with other domains. + Recently, deep learning methods have shown promising results for artifact +removal for CT images. However, most existing deep learning methods for CT are +applied as a post-processing method after reconstruction. Therefore, artifacts +that are relatively difficult to remove in the reconstruction domain may not be +effectively removed by these methods. As an alternative, we propose a +multi-stage deep learning method for artifact removal, in which neural networks +are applied to several domains, similar to a classical CT processing pipeline. +We show that the neural networks can be effectively trained in succession, +resulting in easy-to-use and computationally efficient training. Experiments on +both simulated and real-world experimental datasets show that our method is +effective in reducing artifacts and superior to deep learning-based +post-processing. + +
+
+
+
+
+ + ☆ Asymmetric double-winged multi-view clustering network for exploring + Diverse and Consistent Information + + +
+ In unsupervised scenarios, deep contrastive multi-view clustering (DCMVC) is +becoming a hot research topic; it aims to mine the potential relationships +between different views. Most existing DCMVC algorithms focus on exploring the +consistency information for the deep semantic features, while ignoring the +diverse information on shallow features. To fill this gap, in this paper we propose a novel +multi-view clustering network, termed CodingNet, to explore the diverse and +consistent information simultaneously. Specifically, instead of +utilizing the conventional auto-encoder, we design an asymmetric structure +network to extract shallow and deep features separately. Then, by aligning the +similarity matrix on the shallow feature to the zero matrix, we ensure the +diversity for the shallow features, thus offering a better description of +multi-view data. Moreover, we propose a dual contrastive mechanism that +maintains consistency for deep features at both view-feature and pseudo-label +levels. Our framework's efficacy is validated through extensive experiments on +six widely used benchmark datasets, outperforming most state-of-the-art +multi-view clustering algorithms. + +
+
+
+
+
+ + ☆ General and Practical Tuning Method for Off-the-Shelf Graph-Based Index: + SISAP Indexing Challenge Report by Team UTokyo + + +
+ Despite the efficacy of graph-based algorithms for Approximate Nearest +Neighbor (ANN) searches, the optimal tuning of such systems remains unclear. +This study introduces a method to tune the performance of off-the-shelf +graph-based indexes, focusing on the dimension of vectors, database size, and +entry points of graph traversal. We utilize a black-box optimization algorithm +to perform integrated tuning to meet the required levels of recall and Queries +Per Second (QPS). We applied our approach to Task A of the SISAP 2023 Indexing +Challenge and got second place in the 10M and 30M tracks. It improves +performance substantially compared to brute force methods. This research offers +a universally applicable tuning method for graph-based indexes, extending +beyond the specific conditions of the competition to broader uses. + +
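+ To make the tuning loop concrete, here is a small, hypothetical sketch using hnswlib,
+with plain random search standing in for the black-box optimizer; the synthetic data,
+parameter ranges, and recall target are illustrative and not the competition settings.
+import time
+import numpy as np
+import hnswlib
+
+# Illustrative tuning loop: maximize QPS subject to a recall@10 constraint.
+rng = np.random.default_rng(0)
+base = rng.random((20000, 64), dtype=np.float32)
+queries = rng.random((200, 64), dtype=np.float32)
+
+# Brute-force ground truth for measuring recall.
+d2 = (queries**2).sum(1, keepdims=True) + (base**2).sum(1) - 2.0 * queries @ base.T
+gt = np.argsort(d2, axis=1)[:, :10]
+
+best = None
+for _ in range(10):                          # a black-box optimizer would go here
+    M, efc, efs = (int(rng.integers(8, 48)), int(rng.integers(64, 256)),
+                   int(rng.integers(16, 256)))
+    index = hnswlib.Index(space="l2", dim=64)
+    index.init_index(max_elements=len(base), M=M, ef_construction=efc)
+    index.add_items(base)
+    index.set_ef(efs)                        # entry/search breadth at query time
+    t0 = time.perf_counter()
+    labels, _ = index.knn_query(queries, k=10)
+    qps = len(queries) / (time.perf_counter() - t0)
+    recall = np.mean([len(set(l) & set(g)) / 10.0 for l, g in zip(labels, gt)])
+    if recall >= 0.9 and (best is None or qps > best[0]):
+        best = (qps, recall, {"M": M, "ef_construction": efc, "ef": efs})
+print(best)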
+
+ comment: Accepted paper on 2nd place solution of SISAP 2023 Indexing Challenge + Task A +
+
+
+
+
+ + ☆ An Improved Encoder-Decoder Framework for Food Energy Estimation + + +
+ Dietary assessment is essential to maintaining a healthy lifestyle. Automatic +image-based dietary assessment is a growing field of research due to the +increasing prevalence of image capturing devices (e.g. mobile phones). In this +work, we estimate food energy from a single monocular image, a difficult task +due to the limited and hard-to-extract energy information present in an +image. To do so, we employ an improved encoder-decoder framework for energy +estimation; the encoder transforms the image into a representation embedded +with food energy information in an easier-to-extract format, which the decoder +then extracts the energy information from. To implement our method, we compile +a high-quality food image dataset verified by registered dietitians containing +eating scene images, food-item segmentation masks, and ground truth calorie +values. Our method improves upon previous caloric estimation methods by over +10\% and 30 kCal in terms of MAPE and MAE, respectively. + +
+
+ comment: Accepted for Madima'23 in ACM Multimedia +
+
+
+
+
+ + ☆ A Theoretical and Practical Framework for Evaluating Uncertainty + Calibration in Object Detection + + +
+ The proliferation of Deep Neural Networks has resulted in machine learning +systems becoming increasingly present in various real-world applications. +Consequently, there is a growing demand for highly reliable models in these +domains, making the problem of uncertainty calibration pivotal when +considering the future of deep learning. This is especially true when +considering object detection systems, which are commonly deployed in +safety-critical applications such as autonomous driving and robotics. For this +reason, this work presents a novel theoretical and practical framework to +evaluate object detection systems in the context of uncertainty calibration. +The robustness of the proposed uncertainty calibration metrics is shown through +a series of representative experiments. Code for the proposed uncertainty +calibration metrics is available at: +https://github.com/pedrormconde/Uncertainty_Calibration_Object_Detection. + +
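+ For readers unfamiliar with calibration metrics, the snippet below is a generic,
+hypothetical example of a binned calibration error over detection confidences, where a
+detection counts as correct if it matched a ground-truth box (e.g., at IoU >= 0.5); it
+is background only and not the framework proposed in the paper.
+import numpy as np
+
+def detection_ece(confidences: np.ndarray, is_correct: np.ndarray, n_bins: int = 10) -> float:
+    """Binned expected calibration error over a set of detections."""
+    bins = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(bins[:-1], bins[1:]):
+        mask = (confidences > lo) & (confidences <= hi)
+        if mask.any():
+            # Gap between mean confidence and empirical precision in this bin,
+            # weighted by the fraction of detections falling in the bin.
+            gap = abs(confidences[mask].mean() - is_correct[mask].mean())
+            ece += mask.mean() * gap
+    return float(ece)
+
+# Toy usage with synthetic, systematically over-confident detections.
+rng = np.random.default_rng(0)
+conf = rng.random(1000)
+correct = (rng.random(1000) < 0.8 * conf).astype(float)
+print(f"ECE = {detection_ece(conf, correct):.3f}")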
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ dacl10k: Benchmark for Semantic Bridge Damage Segmentation + + +
+ Reliably identifying reinforced concrete defects (RCDs) plays a crucial role +in assessing the structural integrity, traffic safety, and long-term durability +of concrete bridges, which represent the most common bridge type worldwide. +Nevertheless, available datasets for the recognition of RCDs are small in terms +of size and class variety, which questions their usability in real-world +scenarios and their role as a benchmark. Our contribution to this problem is +"dacl10k", an exceptionally diverse RCD dataset for multi-label semantic +segmentation comprising 9,920 images deriving from real-world bridge +inspections. dacl10k distinguishes 12 damage classes as well as 6 bridge +components that play a key role in building assessment and in recommending +actions, such as restoration works, traffic load limitations or bridge +closures. In addition, we examine baseline models for dacl10k which are +subsequently evaluated. The best model achieves a mean intersection-over-union +of 0.42 on the test set. dacl10k, along with our baselines, will be openly +accessible to researchers and practitioners, representing the currently largest +dataset in terms of number of images and class diversity for semantic +segmentation in the bridge inspection domain. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ☆ Unsupervised bias discovery in medical image segmentation MICCAI + + +
+ It has recently been shown that deep learning models for anatomical +segmentation in medical images can exhibit biases against certain +sub-populations defined in terms of protected attributes like sex or ethnicity. +In this context, auditing the fairness of deep segmentation models becomes crucial. +However, such an audit process generally requires access to ground-truth +segmentation masks for the target population, which may not always be +available, especially when going from development to deployment. Here we +propose a new method to anticipate model biases in biomedical image +segmentation in the absence of ground-truth annotations. Our unsupervised bias +discovery method leverages the reverse classification accuracy framework to +estimate segmentation quality. Through numerical experiments in synthetic and +realistic scenarios we show how our method is able to successfully anticipate +fairness issues in the absence of ground-truth labels, constituting a novel and +valuable tool in this field. + +
+
+ comment: Accepted for publication at FAIMI 2023 (Fairness of AI in Medical + Imaging) at MICCAI +
+
+
+
+
+ + ☆ Improving the matching of deformable objects by learning to detect + keypoints + + +
+ We propose a novel learned keypoint detection method to increase the number +of correct matches for the task of non-rigid image correspondence. By +leveraging true correspondences acquired by matching annotated image pairs with +a specified descriptor extractor, we train an end-to-end convolutional neural +network (CNN) to find keypoint locations that are more appropriate to the +considered descriptor. For that, we apply geometric and photometric warpings to +images to generate a supervisory signal, allowing the optimization of the +detector. Experiments demonstrate that our method enhances the Mean Matching +Accuracy of numerous descriptors when used in conjunction with our detection +method, while outperforming the state-of-the-art keypoint detectors on real +images of non-rigid objects by 20 p.p. We also apply our method on the complex +real-world task of object retrieval where our detector performs on par with the +finest keypoint detectors currently available for this task. The source code +and trained models are publicly available at +https://github.com/verlab/LearningToDetect_PRL_2023 + +
+
+ comment: This is the accepted version of the paper to appear at Pattern + Recognition Letters (PRL). The final journal version will be available at + https://doi.org/10.1016/j.patrec.2023.08.012 +
+
+
+
+
+ + ☆ Selective Scene Text Removal + + +
+ Scene text removal (STR) is the image transformation task to remove text +regions in scene images. The conventional STR methods remove all scene text. +This means that the existing methods cannot select text to be removed. In this +paper, we propose a novel task setting named selective scene text removal +(SSTR) that removes only target words specified by the user. Although SSTR is a +more complex task than STR, the proposed multi-module structure enables +efficient training for SSTR. Experimental results show that the proposed method +can remove target words as expected. + +
+
+ comment: 12 pages, 8 figures, Accepted at the 34th British Machine Vision + Conference +
+
+
+
+
+ + ☆ Fine-grained Recognition with Learnable Semantic Data Augmentation + + +
+ Fine-grained image recognition is a longstanding computer vision challenge +that focuses on differentiating objects belonging to multiple subordinate +categories within the same meta-category. Since images belonging to the same +meta-category usually share similar visual appearances, mining discriminative +visual cues is the key to distinguishing fine-grained categories. Although +commonly used image-level data augmentation techniques have achieved great +success in generic image classification problems, they are rarely applied in +fine-grained scenarios, because their random editing-region behavior is prone +to destroy the discriminative visual cues residing in the subtle regions. In +this paper, we propose diversifying the training data at the feature-level to +alleviate the discriminative region loss problem. Specifically, we produce +diversified augmented samples by translating image features along semantically +meaningful directions. The semantic directions are estimated with a covariance +prediction network, which predicts a sample-wise covariance matrix to adapt to +the large intra-class variation inherent in fine-grained images. Furthermore, +the covariance prediction network is jointly optimized with the classification +network in a meta-learning manner to alleviate the degenerate solution problem. +Experiments on four competitive fine-grained recognition benchmarks +(CUB-200-2011, Stanford Cars, FGVC Aircrafts, NABirds) demonstrate that our +method significantly improves the generalization performance on several popular +classification networks (e.g., ResNets, DenseNets, EfficientNets, RegNets and +ViT). Combined with a recently proposed method, our semantic data augmentation +approach achieves state-of-the-art performance on the CUB-200-2011 dataset. The +source code will be released. + +
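+ A rough, hypothetical sketch of the feature-level augmentation idea follows, with
+assumed simplifications (a diagonal covariance and explicit sampling rather than the
+authors' meta-learned implementation):
+import torch
+import torch.nn as nn
+
+class SemanticAugmentSketch(nn.Module):
+    """Translate features along directions sampled from a predicted covariance."""
+    def __init__(self, feat_dim: int = 256, num_classes: int = 200, strength: float = 0.5):
+        super().__init__()
+        # Predicts a per-sample diagonal covariance over feature dimensions.
+        self.cov_net = nn.Sequential(nn.Linear(feat_dim, feat_dim), nn.Softplus())
+        self.classifier = nn.Linear(feat_dim, num_classes)
+        self.strength = strength
+
+    def forward(self, feats: torch.Tensor) -> torch.Tensor:
+        if self.training:
+            var = self.cov_net(feats)
+            # Semantic translation: perturb features along sampled directions
+            # before classification, instead of editing image pixels.
+            feats = feats + self.strength * var.sqrt() * torch.randn_like(feats)
+        return self.classifier(feats)
+
+logits = SemanticAugmentSketch()(torch.randn(8, 256))   # backbone features would go here
+print(logits.shape)                                     # torch.Size([8, 200])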
+
+
+
+
+ + ☆ VideoGen: A Reference-Guided Latent Diffusion Approach for High + Definition Text-to-Video Generation + + +
+ In this paper, we present VideoGen, a text-to-video generation approach, +which can generate a high-definition video with high frame fidelity and strong +temporal consistency using reference-guided latent diffusion. We leverage an +off-the-shelf text-to-image generation model, e.g., Stable Diffusion, to +generate an image with high content quality from the text prompt, as a +reference image to guide video generation. Then, we introduce an efficient +cascaded latent diffusion module conditioned on both the reference image and +the text prompt, for generating latent video representations, followed by a +flow-based temporal upsampling step to improve the temporal resolution. +Finally, we map latent video representations into a high-definition video +through an enhanced video decoder. During training, we use the first frame of a +ground-truth video as the reference image for training the cascaded latent +diffusion module. The main characteristics of our approach include: the reference +image generated by the text-to-image model improves the visual fidelity; using +it as the condition makes the diffusion model focus more on learning the video +dynamics; and the video decoder is trained over unlabeled video data, thus +benefiting from high-quality, easily available videos. VideoGen sets a new +state-of-the-art in text-to-video generation in terms of both qualitative and +quantitative evaluation. + +
+
+ comment: 8pages, 8figures +
+
+
+
+
+ + ☆ Dense Voxel 3D Reconstruction Using a Monocular Event Camera + + +
+ Event cameras are sensors inspired by biological systems that specialize in +capturing changes in brightness. These emerging cameras offer many advantages +over conventional frame-based cameras, including high dynamic range, high frame +rates, and extremely low power consumption. Due to these advantages, event +cameras have increasingly been adapted in various fields, such as frame +interpolation, semantic segmentation, odometry, and SLAM. However, their +application in 3D reconstruction for VR applications is underexplored. Previous +methods in this field mainly focused on 3D reconstruction through depth map +estimation. Methods that produce dense 3D reconstruction generally require +multiple cameras, while methods that utilize a single event camera can only +produce a semi-dense result. Other single-camera methods that can produce dense +3D reconstruction rely on creating a pipeline that either incorporates the +aforementioned methods or other existing Structure from Motion (SfM) or +Multi-view Stereo (MVS) methods. In this paper, we propose a novel approach for +solving dense 3D reconstruction using only a single event camera. To the best +of our knowledge, our work is the first attempt in this regard. Our preliminary +results demonstrate that the proposed method can produce visually +distinguishable dense 3D reconstructions directly without requiring pipelines +like those used by existing methods. Additionally, we have created a synthetic +dataset with $39,739$ object scans using an event camera simulator. This +dataset will help accelerate other relevant research in this field. + +
+
+
+
+
+ + ☆ Long-Term Memorability On Advertisements + + +
+ Marketers spend billions of dollars on advertisements but to what end? At +purchase time, if customers cannot recognize a brand for which they saw an ad, +the money spent on the ad is essentially wasted. Despite its importance in +marketing, until now, there has been no study on the memorability of ads in the +ML literature. Most studies have been conducted on short-term recall (<5 mins) +on specific content types like object and action videos. On the other hand, the +advertising industry only cares about long-term memorability (a few hours or +longer), and advertisements are almost always highly multimodal, depicting a +story through its different modalities (text, images, and videos). With this +motivation, we conduct the first large-scale memorability study consisting of +1203 participants and 2205 ads covering 276 brands. Running statistical tests +over different participant subpopulations and ad-types, we find many +interesting insights into what makes an ad memorable - both content and human +factors. For example, we find that brands which use commercials with +fast-moving scenes are more memorable than those with slower scenes (p=8e-10) and +that people who use ad-blockers remember fewer ads than those who +don't (p=5e-3). Further, with the motivation of simulating the memorability of +marketing materials for a particular audience, ultimately helping create one, +we present a novel model, Sharingan, trained to leverage real-world knowledge +of LLMs and visual knowledge of visual encoders to predict the memorability of +a piece of content. We test our model on all the prominent memorability datasets in +the literature (both images and videos) and achieve state of the art across all of +them. We conduct extensive ablation studies across memory types, modality, +brand, and architectural choices to find insights into what drives memory. + +
+
+
+
+
+ + ☆ On the Localization of Ultrasound Image Slices within Point Distribution + Models MICCAI 2023 + + +
+ Thyroid disorders are most commonly diagnosed using high-resolution +Ultrasound (US). Longitudinal nodule tracking is a pivotal diagnostic protocol +for monitoring changes in pathological thyroid morphology. This task, however, +imposes a substantial cognitive load on clinicians due to the inherent +challenge of maintaining a mental 3D reconstruction of the organ. We thus +present a framework for automated US image slice localization within a 3D shape +representation to ease how such sonographic diagnoses are carried out. Our +proposed method learns a common latent embedding space between US image patches +and the 3D surface of an individual's thyroid shape, or a statistical +aggregation in the form of a statistical shape model (SSM), via contrastive +metric learning. Using cross-modality registration and Procrustes analysis, we +leverage features from our model to register US slices to a 3D mesh +representation of the thyroid shape. We demonstrate that our multi-modal +registration framework can localize images on the 3D surface topology of a +patient-specific organ and the mean shape of an SSM. Experimental results +indicate slice positions can be predicted within an average of 1.2 mm of the +ground-truth slice location on the patient-specific 3D anatomy and 4.6 mm on +the SSM, exemplifying its usefulness for slice localization during sonographic +acquisitions. Code is publicly available: +\href{https://github.com/vuenc/slice-to-shape}{https://github.com/vuenc/slice-to-shape} + +
+
+ comment: ShapeMI Workshop @ MICCAI 2023; 12 pages 2 figures +
+
+
+
+
+ + ☆ Large Content And Behavior Models To Understand, Simulate, And Optimize + Content And Behavior + + +
+ Shannon, in his seminal paper introducing information theory, divided +communication into three levels: technical, semantic, and effectiveness. While +the technical level is concerned with accurate reconstruction of transmitted +symbols, the semantic and effectiveness levels deal with the inferred meaning +and its effect on the receiver. Thanks to telecommunications, the first level +problem has produced great advances like the internet. Large Language Models +(LLMs) make some progress towards the second goal, but the third level still +remains largely untouched. The third problem deals with predicting and +optimizing communication for desired receiver behavior. LLMs, while showing +strong generalization capabilities across a wide range of tasks, are unable to +solve this problem. One reason for the underperformance could be a lack of +"behavior tokens" in LLMs' training corpora. Behavior tokens define receiver +behavior over a communication, such as shares, likes, clicks, purchases, +retweets, etc. While preprocessing data for LLM training, behavior tokens are +often removed from the corpora as noise. Therefore, in this paper, we make some +initial progress towards reintroducing behavior tokens in LLM training. The +trained models, other than showing similar performance to LLMs on content +understanding tasks, show generalization capabilities on behavior simulation, +content simulation, behavior understanding, and behavior domain adaptation. +Using a wide range of tasks on two corpora, we show results on all these +capabilities. We call these models Large Content and Behavior Models (LCBMs). +Further, to spur more research on LCBMs, we release our new Content Behavior +Corpus (CBC), a repository containing communicator, message, and corresponding +receiver behavior. + +
+
+
+
+
+ + ☆ How You Split Matters: Data Leakage and Subject Characteristics Studies + in Longitudinal Brain MRI Analysis MICCAI + + +
+ Deep learning models have revolutionized the field of medical image analysis, +offering significant promise for improved diagnostics and patient care. +However, their performance can be misleadingly optimistic due to a hidden +pitfall called 'data leakage'. In this study, we investigate data leakage in 3D +medical imaging, specifically using 3D Convolutional Neural Networks (CNNs) for +brain MRI analysis. While 3D CNNs appear less prone to leakage than 2D +counterparts, improper data splitting during cross-validation (CV) can still +pose issues, especially with longitudinal imaging data containing repeated +scans from the same subject. We explore the impact of different data splitting +strategies on model performance for longitudinal brain MRI analysis and +identify potential data leakage concerns. GradCAM visualization helps reveal +shortcuts in CNN models caused by identity confounding, where the model learns +to identify subjects along with diagnostic features. Our findings, consistent +with prior research, underscore the importance of subject-wise splitting and +evaluating our model further on hold-out data from different subjects to ensure +the integrity and reliability of deep learning models in medical image +analysis. + +
+
+ comment: submitted to MICCAI FAIMI 2023 +
+
+
+
+
+ + ☆ MuraNet: Multi-task Floor Plan Recognition with Relation Attention ICDAR 2023 + + +
+ The recognition of information in floor plan data requires the use of +detection and segmentation models. However, relying on several single-task +models can result in ineffective utilization of relevant information when there +are multiple tasks present simultaneously. To address this challenge, we +introduce MuraNet, an attention-based multi-task model for segmentation and +detection tasks in floor plan data. In MuraNet, we adopt a unified encoder +called MURA as the backbone with two separated branches: an enhanced +segmentation decoder branch and a decoupled detection head branch based on +YOLOX, for segmentation and detection tasks respectively. The architecture of +MuraNet is designed to leverage the fact that walls, doors, and windows usually +constitute the primary structure of a floor plan's architecture. By jointly +training the model on both detection and segmentation tasks, we believe MuraNet +can effectively extract and utilize relevant features for both tasks. Our +experiments on the CubiCasa5k public dataset show that MuraNet improves +convergence speed during training compared to single-task models like U-Net and +YOLOv3. Moreover, we observe improvements in the average AP and IoU in +detection and segmentation tasks, respectively. Our ablation experiments +demonstrate that the attention-based unified backbone of MuraNet achieves +better feature extraction in floor plan recognition tasks, and the use of +decoupled multi-head branches for different tasks further improves model +performance. We believe that our proposed MuraNet model can address the +disadvantages of single-task models and improve the accuracy and efficiency of +floor plan data recognition. + +
+
+ comment: Document Analysis and Recognition - ICDAR 2023 Workshops. ICDAR 2023. + Lecture Notes in Computer Science, vol 14193. Springer, Cham +
+
+
+
+
+ + ☆ Towards Contrastive Learning in Music Video Domain + + +
+ Contrastive learning is a powerful way of learning multimodal representations +across various domains such as image-caption retrieval and audio-visual +representation learning. In this work, we investigate if these findings +generalize to the domain of music videos. Specifically, we create a dual +encoder for the audio and video modalities and train it using a bidirectional +contrastive loss. For the experiments, we use an industry dataset containing +550,000 music videos as well as the public Million Song Dataset, and evaluate +the quality of learned representations on the downstream tasks of music tagging +and genre classification. Our results indicate that pre-trained networks +without contrastive fine-tuning outperform our contrastive learning approach +when evaluated on both tasks. To gain a better understanding of the reasons +contrastive learning was not successful for music videos, we perform a +qualitative analysis of the learned representations, revealing why contrastive +learning might have difficulties uniting embeddings from two modalities. Based +on these findings, we outline possible directions for future work. To +facilitate the reproducibility of our results, we share our code and the +pre-trained model. + +
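+ The bidirectional contrastive loss can be written compactly; the snippet below is a
+minimal sketch with placeholder embedding sizes and temperature, not the paper's exact
+configuration.
+import torch
+import torch.nn.functional as F
+
+def bidirectional_contrastive_loss(audio_emb: torch.Tensor,
+                                   video_emb: torch.Tensor,
+                                   temperature: float = 0.07) -> torch.Tensor:
+    """Symmetric InfoNCE: matching audio/video pairs lie on the diagonal."""
+    a = F.normalize(audio_emb, dim=-1)
+    v = F.normalize(video_emb, dim=-1)
+    logits = a @ v.t() / temperature                 # (B, B) similarity matrix
+    targets = torch.arange(len(a), device=a.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+
+# Toy usage with embeddings from hypothetical audio and video encoders.
+loss = bidirectional_contrastive_loss(torch.randn(32, 512), torch.randn(32, 512))
+print(loss.item())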
+
+ comment: 6 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Robust Point Cloud Processing through Positional Embedding + + +
+ End-to-end trained per-point embeddings are an essential ingredient of any +state-of-the-art 3D point cloud processing such as detection or alignment. +Methods like PointNet, or the more recent point cloud transformer -- and its +variants -- all employ learned per-point embeddings. Despite impressive +performance, such approaches are sensitive to out-of-distribution (OOD) noise +and outliers. In this paper, we explore the role of an analytical per-point +embedding based on the criterion of bandwidth. The concept of bandwidth enables +us to draw connections with an alternate per-point embedding -- positional +embedding, particularly random Fourier features. We present compelling robust +results across downstream tasks such as point cloud classification and +registration with several categories of OOD noise. + +
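+ As a concrete picture of an analytical, positional-embedding-style per-point feature,
+here is a small random Fourier feature sketch; the bandwidth argument is the knob the
+abstract alludes to, and all values are illustrative rather than taken from the paper.
+import numpy as np
+
+def random_fourier_point_embedding(points: np.ndarray, dim: int = 256,
+                                   bandwidth: float = 1.0, seed: int = 0) -> np.ndarray:
+    """Map (N, 3) xyz points to (N, dim) features with fixed random projections.
+
+    Lower bandwidth gives smoother embeddings that are less sensitive to
+    perturbations and outliers; higher bandwidth gives sharper, more local features.
+    """
+    rng = np.random.default_rng(seed)
+    B = rng.normal(0.0, bandwidth, size=(points.shape[1], dim // 2))  # (3, dim/2)
+    proj = 2.0 * np.pi * points @ B
+    return np.concatenate([np.sin(proj), np.cos(proj)], axis=-1) / np.sqrt(dim // 2)
+
+cloud = np.random.default_rng(1).normal(size=(1024, 3))
+print(random_fourier_point_embedding(cloud).shape)                   # (1024, 256)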
+
+ comment: 18 pages, 13 figures, 5 tables +
+
+
+
+
+ + ☆ Human trajectory prediction using LSTM with Attention mechanism + + +
+ In this paper, we propose a human trajectory prediction model that combines a +Long Short-Term Memory (LSTM) network with an attention mechanism. To do that, +we use attention scores to determine which parts of the input data the model +should focus on when making predictions. Attention scores are calculated for +each input feature, with a higher score indicating the greater significance of +that feature in predicting the output. Initially, these scores are determined +for the target human position, velocity, and their neighboring individual's +positions and velocities. By using attention scores, our model can prioritize +the most relevant information in the input data and make more accurate +predictions. We extract attention scores from our attention mechanism and +integrate them into the trajectory prediction module to predict human future +trajectories. To achieve this, we introduce a new neural layer that processes +attention scores after extracting them and concatenates them with positional +information. We evaluate our approach on the publicly available ETH and UCY +datasets and measure its performance using the final displacement error (FDE) +and average displacement error (ADE) metrics. We show that our modified +algorithm performs better than the Social LSTM in predicting the future +trajectory of pedestrians in crowded spaces. Specifically, our model achieves +an improvement of 6.2% in ADE and 6.3% in FDE compared to the Social LSTM +results in the literature. + +
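+ A simplified, hypothetical sketch of the described pipeline follows (dimensions and
+the way scores are concatenated are assumptions, not the authors' code): per-feature
+attention scores are computed over the target's and neighbours' states, concatenated
+with the positional input, and fed to an LSTM that regresses future offsets.
+import torch
+import torch.nn as nn
+
+class AttentionLSTMPredictor(nn.Module):
+    """Toy trajectory predictor: per-feature attention scores + LSTM decoder."""
+    def __init__(self, feat_dim: int = 4, max_neighbors: int = 4,
+                 hidden: int = 64, horizon: int = 12):
+        super().__init__()
+        in_dim = feat_dim * (1 + max_neighbors)          # target + neighbour states
+        self.score = nn.Sequential(nn.Linear(in_dim, in_dim), nn.Softmax(dim=-1))
+        self.lstm = nn.LSTM(2 * in_dim, hidden, batch_first=True)  # scores || input
+        self.head = nn.Linear(hidden, 2 * horizon)
+        self.horizon = horizon
+
+    def forward(self, obs: torch.Tensor) -> torch.Tensor:
+        # obs: (B, T_obs, in_dim) positions/velocities of the target and neighbours.
+        attn = self.score(obs)                           # per-feature attention scores
+        h, _ = self.lstm(torch.cat([obs, attn * obs], dim=-1))
+        # Decode the last hidden state into future (x, y) offsets.
+        return self.head(h[:, -1]).reshape(-1, self.horizon, 2)
+
+pred = AttentionLSTMPredictor()(torch.randn(8, 8, 20))   # 8 steps, target + 4 neighbours
+print(pred.shape)                                        # torch.Size([8, 12, 2])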
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ ARFA: An Asymmetric Receptive Field Autoencoder Model for Spatiotemporal + Prediction + + +
+ Spatiotemporal prediction aims to generate future sequences by paradigms +learned from historical contexts. It holds significant importance in numerous +domains, including traffic flow prediction and weather forecasting. However, +existing methods face challenges in handling spatiotemporal correlations, as +they commonly adopt encoder and decoder architectures with identical receptive +fields, which adversely affects prediction accuracy. This paper proposes an +Asymmetric Receptive Field Autoencoder (ARFA) model to address this issue. +Specifically, we design corresponding sizes of receptive field modules tailored +to the distinct functionalities of the encoder and decoder. In the encoder, we +introduce a large kernel module for global spatiotemporal feature extraction. +In the decoder, we develop a small kernel module for local spatiotemporal +information reconstruction. To address the scarcity of meteorological +prediction data, we constructed the RainBench, a large-scale radar echo dataset +specific to the unique precipitation characteristics of inland regions in China +for precipitation prediction. Experimental results demonstrate that ARFA +achieves consistent state-of-the-art performance on two mainstream +spatiotemporal prediction datasets and our RainBench dataset, affirming the +effectiveness of our approach. This work not only explores a novel method from +the perspective of receptive fields but also provides data support for +precipitation prediction, thereby advancing future research in spatiotemporal +prediction. + +
+
+ comment: 0 pages, 5 figures +
+
+
+
+
+ + ☆ Fusing Monocular Images and Sparse IMU Signals for Real-time Human + Motion Capture SIGGRAPH + + +
+ Either RGB images or inertial signals have been used for the task of motion +capture (mocap), but combining them is a new and interesting topic. We +believe that the combination is complementary and able to solve the inherent +difficulties of using one modality input, including occlusions, extreme +lighting/texture, and out-of-view for visual mocap and global drifts for +inertial mocap. To this end, we propose a method that fuses monocular images +and sparse IMUs for real-time human motion capture. Our method contains a dual +coordinate strategy to fully explore the IMU signals with different goals in +motion capture. To be specific, besides one branch transforming the IMU signals +to the camera coordinate system to combine with the image information, there is +another branch to learn from the IMU signals in the body root coordinate system +to better estimate body poses. Furthermore, a hidden state feedback mechanism +is proposed for both branches to compensate for their own drawbacks in +extreme input cases. Thus our method can easily switch between the two kinds of +signals or combine them in different cases to achieve a robust mocap. Quantitative and qualitative results demonstrate that by delicately +designing the fusion method, our technique significantly outperforms the +state-of-the-art vision, IMU, and combined methods on both global orientation +and local pose estimation. Our codes are available for research at +https://shaohua-pan.github.io/robustcap-page/. + +
+
+ comment: Accepted by SIGGRAPH ASIA 2023. Project page: + https://shaohua-pan.github.io/robustcap-page/ +
+
+
+
+
+ + ☆ Efficient Surrogate Models for Materials Science Simulations: Machine + Learning-based Prediction of Microstructure Properties + + +
+ Determining, understanding, and predicting the so-called structure-property +relation is an important task in many scientific disciplines, such as +chemistry, biology, meteorology, physics, engineering, and materials science. +Structure refers to the spatial distribution of, e.g., substances, material, or +matter in general, while property is a resulting characteristic that usually +depends in a non-trivial way on spatial details of the structure. +Traditionally, forward simulation models have been used for such tasks. +Recently, several machine learning algorithms have been applied in these +scientific fields to enhance and accelerate simulation models or as surrogate +models. In this work, we develop and investigate the applications of six +machine learning techniques based on two different datasets from the domain of +materials science: data from a two-dimensional Ising model for predicting the +formation of magnetic domains and data representing the evolution of dual-phase +microstructures from the Cahn-Hilliard model. We analyze the accuracy and +robustness of all models and elucidate the reasons for the differences in their +performances. The impact of including domain knowledge through tailored +features is studied, and general recommendations based on the availability and +quality of training data are derived from this. + +
+
+
+
+
+ + ☆ Fine-Grained Spatiotemporal Motion Alignment for Contrastive Video + Representation Learning ACM MM 2023 + + +
+ As the most essential property in a video, motion information is critical to +a robust and generalized video representation. To inject motion dynamics, +recent works have adopted frame difference as the source of motion information +in video contrastive learning, considering the trade-off between quality and +cost. However, existing works align motion features at the instance level, +which suffers from spatial and temporal weak alignment across modalities. In +this paper, we present a \textbf{Fi}ne-grained \textbf{M}otion +\textbf{A}lignment (FIMA) framework, capable of introducing well-aligned and +significant motion information. Specifically, we first develop a dense +contrastive learning framework in the spatiotemporal domain to generate +pixel-level motion supervision. Then, we design a motion decoder and a +foreground sampling strategy to eliminate the weak alignments in terms of time +and space. Moreover, a frame-level motion contrastive loss is presented to +improve the temporal diversity of the motion features. Extensive experiments +demonstrate that the representations learned by FIMA possess great +motion-awareness capabilities and achieve state-of-the-art or competitive +results on downstream tasks across UCF101, HMDB51, and Diving48 datasets. Code +is available at \url{https://github.com/ZMHH-H/FIMA}. + +
+
+ comment: ACM MM 2023 Camera Ready +
+
+
+
+
+ + ☆ Fast Diffusion EM: a diffusion model for blind inverse problems with + application to deconvolution + + +
+ Using diffusion models to solve inverse problems is a growing field of +research. Current methods assume the degradation to be known and provide +impressive results in terms of restoration quality and diversity. In this work, +we leverage the efficiency of those models to jointly estimate the restored +image and unknown parameters of the degradation model. In particular, we +designed an algorithm based on the well-known Expectation-Maximization (EM) +estimation method and diffusion models. Our method alternates between +approximating the expected log-likelihood of the inverse problem using samples +drawn from a diffusion model and a maximization step to estimate unknown model +parameters. For the maximization step, we also introduce a novel blur kernel +regularization based on a Plug \& Play denoiser. Diffusion models are slow to +run, so we provide a fast version of our algorithm. Extensive experiments on +blind image deblurring demonstrate the effectiveness of our method when +compared to other state-of-the-art approaches. + +
+
+
+
+
+ + ☆ SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse + Satellite Images SP + + +
+ Digital surface model generation using traditional multi-view stereo matching +(MVS) performs poorly over non-Lambertian surfaces, with asynchronous +acquisitions, or at discontinuities. Neural radiance fields (NeRF) offer a new +paradigm for reconstructing surface geometries using continuous volumetric +representation. NeRF is self-supervised, does not require ground truth geometry +for training, and provides an elegant way to include in its representation +physical parameters about the scene, thus potentially remedying the challenging +scenarios where MVS fails. However, NeRF and its variants require many views to +produce convincing scene geometries, which are rarely available in earth observation satellite +imaging. In this paper we present SparseSat-NeRF (SpS-NeRF) - an +extension of Sat-NeRF adapted to sparse satellite views. SpS-NeRF employs dense +depth supervision guided by a cross-correlation similarity metric provided by +traditional semi-global MVS matching. We demonstrate the effectiveness of our +approach on stereo and tri-stereo Pleiades 1B/WorldView-3 images, and compare +against NeRF and Sat-NeRF. The code is available at +https://github.com/LulinZhang/SpS-NeRF + +
+
+ comment: ISPRS Annals 2023 +
+
+
+
+
+ + ☆ Application of Machine Learning in Melanoma Detection and the + Identification of 'Ugly Duckling' and Suspicious Naevi: A Review + + +
+ Skin lesions known as naevi exhibit diverse characteristics such as size, +shape, and colouration. The concept of an "Ugly Duckling Naevus" comes into +play when monitoring for melanoma, referring to a lesion with distinctive +features that sets it apart from other lesions in the vicinity. As lesions +within the same individual typically share similarities and follow a +predictable pattern, an ugly duckling naevus stands out as unusual and may +indicate the presence of a cancerous melanoma. Computer-aided diagnosis (CAD) +has become a significant player in the research and development field, as it +combines machine learning techniques with a variety of patient analysis +methods. Its aim is to increase accuracy and simplify decision-making, all +while responding to the shortage of specialized professionals. These automated +systems are especially important in skin cancer diagnosis where specialist +availability is limited. As a result, their use could lead to life-saving +benefits and cost reductions within healthcare. Given the drastic change in +survival when comparing early stage to late-stage melanoma, early detection is +vital for effective treatment and patient outcomes. Machine learning (ML) and +deep learning (DL) techniques have gained popularity in skin cancer +classification, effectively addressing challenges, and providing results +equivalent to that of specialists. This article extensively covers modern +Machine Learning and Deep Learning algorithms for detecting melanoma and +suspicious naevi. It begins with general information on skin cancer and +different types of naevi, then introduces AI, ML, DL, and CAD. The article then +discusses the successful applications of various ML techniques like +convolutional neural networks (CNN) for melanoma detection compared to +dermatologists' performance. Lastly, it examines ML methods for UD naevus +detection and identifying suspicious naevi. + +
+
+
+
+
+ + ☆ Interpretable Medical Imagery Diagnosis with Self-Attentive + Transformers: A Review of Explainable AI for Health Care + + +
+ Recent advancements in artificial intelligence (AI) have facilitated its +widespread adoption in primary medical services, addressing the demand-supply +imbalance in healthcare. Vision Transformers (ViT) have emerged as +state-of-the-art computer vision models, benefiting from self-attention +modules. However, compared to traditional machine-learning approaches, +deep-learning models are complex and are often treated as a "black box" that +can cause uncertainty regarding how they operate. Explainable Artificial +Intelligence (XAI) refers to methods that explain and interpret machine +learning models' inner workings and how they come to decisions, which is +especially important in the medical domain to guide the healthcare +decision-making process. This review summarises recent ViT advancements and +interpretative approaches to understanding the decision-making process of ViT, +enabling transparency in medical diagnosis applications. + +
+
+
+
+
+ + ☆ DiffuGen: Adaptable Approach for Generating Labeled Image Datasets using + Stable Diffusion Models + + +
+ Generating high-quality labeled image datasets is crucial for training +accurate and robust machine learning models in the field of computer vision. +However, the process of manually labeling real images is often time-consuming +and costly. To address these challenges associated with dataset generation, we +introduce "DiffuGen," a simple and adaptable approach that harnesses the power +of stable diffusion models to create labeled image datasets efficiently. By +leveraging stable diffusion models, our approach not only ensures the quality +of generated datasets but also provides a versatile solution for label +generation. In this paper, we present the methodology behind DiffuGen, which +combines the capabilities of diffusion models with two distinct labeling +techniques: unsupervised and supervised. Distinctively, DiffuGen employs prompt +templating for adaptable image generation and textual inversion to enhance +diffusion model capabilities. + +
+
+
+
+
+ + ☆ Object-Centric Multiple Object Tracking ICCV 2023 + + +
+ Unsupervised object-centric learning methods allow the partitioning of scenes +into entities without additional localization information and are excellent +candidates for reducing the annotation burden of multiple-object tracking (MOT) +pipelines. Unfortunately, they lack two key properties: objects are often split +into parts and are not consistently tracked over time. In fact, +state-of-the-art models achieve pixel-level accuracy and temporal consistency +by relying on supervised object detection with additional ID labels for the +association through time. This paper proposes a video object-centric model for +MOT. It consists of an index-merge module that adapts the object-centric slots +into detection outputs and an object memory module that builds complete object +prototypes to handle occlusions. Benefiting from object-centric learning, we +require only sparse detection labels (0%-6.25%) for object localization and +feature binding. Relying on our self-supervised +Expectation-Maximization-inspired loss for object association, our approach +requires no ID labels. Our experiments significantly narrow the gap between the +existing object-centric model and the fully supervised state-of-the-art and +outperform several unsupervised trackers. + +
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ☆ What Makes Good Open-Vocabulary Detector: A Disassembling Perspective + + +
+ Open-vocabulary detection (OVD) is a new object detection paradigm, aiming to +localize and recognize unseen objects defined by an unbounded vocabulary. This +is challenging since traditional detectors can only learn from pre-defined +categories and thus fail to detect and localize objects outside the pre-defined +vocabulary. To handle the challenge, OVD leverages pre-trained cross-modal VLMs, +such as CLIP and ALIGN. Previous works mainly focus on the open vocabulary +classification part, with less attention on the localization part. We argue +that for a good OVD detector, both classification and localization should be +studied in parallel for the novel object categories. We show in this work that +improving localization as well as cross-modal classification complement each +other, and compose a good OVD detector jointly. We analyze three families of +OVD methods with different design emphases. We first propose a vanilla +method, i.e., cropping a bounding box obtained by a localizer and resizing it +for CLIP. We next introduce another approach, which combines a standard +two-stage object detector with CLIP. A two-stage object detector includes a +visual backbone, a region proposal network (RPN), and a region of interest +(RoI) head. We decouple RPN and ROI head (DRR) and use RoIAlign to extract +meaningful features. In this case, resizing objects is avoided. To further +accelerate the training time and reduce the model parameters, we couple RPN and +ROI head (CRR) as the third approach. We conduct extensive experiments on these +three types of approaches in different settings. On the OVD-COCO benchmark, DRR +obtains the best performance and achieves 35.8 Novel AP$_{50}$, an absolute 2.8 +gain over the previous state-of-the-art (SOTA). For OVD-LVIS, DRR surpasses the +previous SOTA by 1.9 AP$_{50}$ in rare categories. We also provide an object +detection dataset called PID and provide a baseline on PID. + +
+
+
+
+
+ + ☆ Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation ICCV'23 + + +
+ Facial sketch synthesis (FSS) aims to generate a vivid sketch portrait from a
+given facial photo. Existing FSS methods merely rely on 2D representations of
+facial semantics or appearance. However, professional human artists usually use
+outlines or shadings to convey 3D geometry. Thus, facial 3D geometry (e.g., a
+depth map) is extremely important for FSS. Moreover, different artists may use
+diverse drawing techniques and create multiple styles of sketches, but the
+style is globally consistent within a sketch. Inspired by such observations, in
+this paper, we propose a novel Human-Inspired Dynamic Adaptation (HIDA) method.
+Specifically, we propose to dynamically modulate neuron activations based on a
+joint consideration of both facial 3D geometry and 2D appearance, as well as
+globally consistent style control. Besides, we use deformable convolutions at
+coarse scales to align deep features, for generating abstract and distinct
+outlines. Experiments show that HIDA can generate high-quality sketches in
+multiple styles, and significantly outperforms previous methods across a wide
+range of challenging faces. Moreover, HIDA allows precise style control of the
+synthesized sketch, and generalizes well to natural scenes and other artistic
+styles. Our code and results have been released online at:
+https://github.com/AiArt-HDU/HIDA.
+
+
+
+ comment: To appear on ICCV'23 +
+
+
+
+
+ + ☆ Towards Addressing the Misalignment of Object Proposal Evaluation for + Vision-Language Tasks via Semantic Grounding WACV 2024 + + +
+ Object proposal generation serves as a standard pre-processing step in +Vision-Language (VL) tasks (image captioning, visual question answering, etc.). +The performance of object proposals generated for VL tasks is currently +evaluated across all available annotations, a protocol that we show is +misaligned - higher scores do not necessarily correspond to improved +performance on downstream VL tasks. Our work serves as a study of this +phenomenon and explores the effectiveness of semantic grounding to mitigate its +effects. To this end, we propose evaluating object proposals against only a +subset of available annotations, selected by thresholding an annotation +importance score. Importance of object annotations to VL tasks is quantified by +extracting relevant semantic information from text describing the image. We +show that our method is consistent and demonstrates greatly improved alignment +with annotations selected by image captioning metrics and human annotation when +compared against existing techniques. Lastly, we compare current detectors used +in the Scene Graph Generation (SGG) benchmark as a use case, which serves as an +example of when traditional object proposal evaluation techniques are +misaligned. + +
+
+ comment: Accepted to WACV 2024 (Round 1) +
+
+
+
+
+ + ☆ Gap and Overlap Detection in Automated Fiber Placement + + +
+ The identification and correction of manufacturing defects, particularly gaps +and overlaps, are crucial for ensuring high-quality composite parts produced +through Automated Fiber Placement (AFP). These imperfections are the most +commonly observed issues that can significantly impact the overall quality of +the composite parts. Manual inspection is both time-consuming and +labor-intensive, making it an inefficient approach. To overcome this challenge, +the implementation of an automated defect detection system serves as the +optimal solution. In this paper, we introduce a novel method that uses an +Optical Coherence Tomography (OCT) sensor and computer vision techniques to +detect and locate gaps and overlaps in composite parts. Our approach involves +generating a depth map image of the composite surface that highlights the +elevation of composite tapes (or tows) on the surface. By detecting the +boundaries of each tow, our algorithm can compare consecutive tows and identify +gaps or overlaps that may exist between them. Any gaps or overlaps exceeding a +predefined tolerance threshold are considered manufacturing defects. To +evaluate the performance of our approach, we compare the detected defects with +the ground truth annotated by experts. The results demonstrate a high level of +accuracy and efficiency in gap and overlap segmentation. + +
+
+
+
+
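+
+ To make the seam-comparison step above concrete, here is a toy numpy sketch
+that compares the boundaries of two adjacent tows row by row and flags gaps or
+overlaps beyond a tolerance. It assumes tow boundaries have already been
+extracted from the OCT depth map as per-row edge positions; the variable names,
+synthetic data, and pixel tolerance are illustrative, not the paper's values.
+
+# Toy sketch: flag gaps/overlaps between two adjacent tows from boundary arrays.
+import numpy as np
+
+def classify_seams(right_edge_of_tow, left_edge_of_next_tow, tol_px=3):
+    """Per-row spacing between adjacent tows (positive = gap, negative = overlap)."""
+    spacing = left_edge_of_next_tow - right_edge_of_tow
+    return spacing, spacing > tol_px, spacing < -tol_px
+
+rows = 100
+right_edge = np.full(rows, 50.0)            # right boundary of tow i (pixels)
+left_edge_next = np.full(rows, 52.0)        # left boundary of tow i+1 (pixels)
+left_edge_next[40:60] = 58.0                # a local gap
+left_edge_next[80:90] = 44.0                # a local overlap
+
+spacing, gaps, overlaps = classify_seams(right_edge, left_edge_next)
+print(f"gap rows: {gaps.sum()}, overlap rows: {overlaps.sum()}")
+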
+ + ☆ Diffusion Model with Clustering-based Conditioning for Food Image + Generation + + +
+ Image-based dietary assessment serves as an efficient and accurate solution +for recording and analyzing nutrition intake using eating occasion images as +input. Deep learning-based techniques are commonly used to perform image +analysis such as food classification, segmentation, and portion size +estimation, which rely on large amounts of food images with annotations for +training. However, such data dependency poses significant barriers to +real-world applications, because acquiring a substantial, diverse, and balanced +set of food images can be challenging. One potential solution is to use +synthetic food images for data augmentation. Although existing work has +explored the use of generative adversarial networks (GAN) based structures for +generation, the quality of synthetic food images still remains subpar. In +addition, while diffusion-based generative models have shown promising results +for general image generation tasks, the generation of food images can be +challenging due to the substantial intra-class variance. In this paper, we +investigate the generation of synthetic food images based on the conditional +diffusion model and propose an effective clustering-based training framework, +named ClusDiff, for generating high-quality and representative food images. The +proposed method is evaluated on the Food-101 dataset and shows improved +performance when compared with existing image generation works. We also +demonstrate that the synthetic food images generated by ClusDiff can help +address the severe class imbalance issue in long-tailed food classification +using the VFN-LT dataset. + +
+
+ comment: Accepted for 31st ACM International Conference on Multimedia: 8th + International Workshop on Multimedia Assisted Dietary Management (MADiMa + 2023) +
+
+
+
+
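+
+ The clustering-based conditioning described in the ClusDiff abstract above can
+be pictured with the short sketch below: per-image feature embeddings are
+clustered, and each image's cluster id is used as a class-style condition for a
+conditional diffusion model. The embedding source, cluster count, and the
+conditional sampler are placeholders and assumptions, not the paper's
+implementation.
+
+# Sketch: derive pseudo-class labels by clustering image embeddings; these
+# labels then condition a class-conditional diffusion model (not shown).
+import numpy as np
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(0)
+embeddings = rng.normal(size=(1000, 512))     # stand-in for per-image features
+
+kmeans = KMeans(n_clusters=20, n_init=10, random_state=0).fit(embeddings)
+cluster_ids = kmeans.labels_                  # one pseudo-class per image
+
+# Training pairs become (image, cluster_id); sampling a new image amounts to
+# picking a cluster id and running the conditional sampler for that id.
+print(np.bincount(cluster_ids))
+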
+ + ☆ DARC: Distribution-Aware Re-Coloring Model for Generalizable Nucleus + Segmentation MICCAI 2023 + + +
+ Nucleus segmentation is usually the first step in pathological image analysis
+tasks. Generalizable nucleus segmentation refers to the problem of training a
+segmentation model that is robust to domain gaps between the source and target
+domains. The domain gaps are usually believed to be caused by the varied image
+acquisition conditions, e.g., different scanners, tissues, or staining
+protocols. In this paper, we argue that domain gaps can also be caused by
+different foreground (nucleus)-background ratios, as this ratio significantly
+affects feature statistics that are critical to normalization layers. We
+propose a Distribution-Aware Re-Coloring (DARC) model that handles the above
+challenges from two perspectives. First, we introduce a re-coloring method that
+relieves dramatic image color variations between different domains. Second, we
+propose a new instance normalization method that is robust to the variation in
+foreground-background ratios. We evaluate the proposed methods on two H$\&$E
+stained image datasets, named CoNSeP and CPM17, and two IHC stained image
+datasets, called DeepLIIF and BC-DeepLIIF. Extensive experimental results
+justify the effectiveness of our proposed DARC model. Codes are available at
+\url{https://github.com/csccsccsccsc/DARC}
+
+
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ☆ Vision-aided nonlinear control framework for shake table tests + + +
+ The structural response under earthquake excitations can be simulated by
+scaled-down or full-scale model shake table tests. In this paper, adaptive
+control theory is used as a nonlinear shake table control algorithm that
+accounts for the inherent nonlinearity of the shake table system and the
+Control-Structural Interaction (CSI) effect, which linear controllers, such as
+the Proportional-Integral-Derivative (PID) controller, cannot consider. The
+specimen mass is treated as an unknown variation, and this unknown parameter is
+replaced by an estimated value in the proposed control framework. The signal
+generated by the control law of the adaptive control method is implemented by a
+loop-shaping controller. To verify the stability and feasibility of the
+proposed control framework, a simulation of a bare shake table and experiments
+with a bare shake table and with a two-story frame were carried out. Earthquake
+recordings were randomly selected from the Pacific Earthquake Engineering
+Research Center (PEER) database. The simulation and experimental results show
+that the proposed control framework can be effectively used in shake table
+control.
+
+
+
+ comment: 10 pages, 7 figures, accepted in the Canadian Conference - Pacific + Conference on Earthquake Engineering 2023, Vancouver, British Columbia +
+
+
+
+
+ + ♻ ☆ Contrastive Image Synthesis and Self-supervised Feature Adaptation for + Cross-Modality Biomedical Image Segmentation + + +
+ This work presents a novel framework CISFA (Contrastive Image Synthesis and
+Self-supervised Feature Adaptation) that builds on image domain translation and
+unsupervised feature adaptation for cross-modality biomedical image
+segmentation. Different from existing works, we use a one-sided generative
+model and add a weighted patch-wise contrastive loss between sampled patches of
+the input image and the corresponding synthetic image, which serves as a shape
+constraint. Moreover, we notice that the generated images and input images
+share similar structural information but are in different modalities. As such,
+we enforce contrastive losses on the generated images and the input images to
+train the encoder of a segmentation model to minimize the discrepancy between
+paired images in the learned embedding space. Compared with existing works that
+rely on adversarial learning for feature adaptation, this approach enables the
+encoder to learn domain-independent features in a more explicit way. We
+extensively evaluate our methods on segmentation tasks containing CT and MRI
+images for abdominal cavities and whole hearts. Experimental results show that
+the proposed framework not only outputs synthetic images with less distortion
+of organ shapes, but also outperforms state-of-the-art domain adaptation
+methods by a large margin.
+
+
+
+
+
+
+ + ♻ ☆ SSD-MonoDETR: Supervised Scale-aware Deformable Transformer for + Monocular 3D Object Detection + + +
+ Transformer-based methods have demonstrated superior performance for
+monocular 3D object detection recently, which aims at predicting 3D attributes
+from a single 2D image. Most existing transformer-based methods leverage both
+visual and depth representations to explore valuable query points on objects,
+and the quality of the learned query points has a great impact on detection
+accuracy. Unfortunately, existing unsupervised attention mechanisms in
+transformers are prone to generating low-quality query features due to
+inaccurate receptive fields, especially on hard objects. To tackle this
+problem, this paper proposes a novel "Supervised Scale-aware Deformable
+Attention" (SSDA) for monocular 3D object detection. Specifically, SSDA presets
+several masks with different scales and utilizes depth and visual features to
+adaptively learn a scale-aware filter for object query augmentation. By
+imposing scale awareness, SSDA can accurately predict the receptive field of an
+object query to support robust query feature generation. In addition, SSDA is
+supervised with a Weighted Scale Matching (WSM) loss for scale prediction,
+which yields more confident results than unsupervised attention mechanisms.
+Extensive experiments on the KITTI and Waymo Open datasets demonstrate that
+SSDA significantly improves the detection accuracy, especially on moderate and
+hard objects, yielding state-of-the-art performance compared to existing
+approaches. Our code will be made publicly available at
+https://github.com/mikasa3lili/SSD-MonoDETR.
+
+
+
+ comment: Accepted to IEEE Transactions on Intelligent Vehicles (T-IV). Code + will be made publicly available at + https://github.com/mikasa3lili/SSD-MonoDETR +
+
+
+
+
+ + ♻ ☆ M^2UNet: MetaFormer Multi-scale Upsampling Network for Polyp + Segmentation + + +
+ Polyp segmentation has recently garnered significant attention, and multiple
+methods have been formulated to achieve commendable outcomes. However, these
+techniques often struggle with the complex polyp foreground and its surrounding
+regions because of the nature of the convolution operation. Besides, most
+existing methods fail to exploit the potential information from multiple
+decoder stages. To address this challenge, we suggest combining MetaFormer,
+introduced as a baseline for integrating CNN and Transformer, with the UNet
+framework and incorporating our Multi-scale Upsampling block (MU). This simple
+module makes it possible to combine multi-level information by exploring
+multiple receptive field paths of the shallow decoder stage and then adding
+them to the higher stage to aggregate a better feature representation, which is
+essential in medical image segmentation. Taken together, we propose the
+MetaFormer Multi-scale Upsampling Network (M$^2$UNet) for the polyp
+segmentation task. Extensive experiments on five benchmark datasets demonstrate
+that our method achieves competitive performance compared with several previous
+methods.
+
+
+
+
+
+
+ + ♻ ☆ Mapping the landscape of histomorphological cancer phenotypes using + self-supervised learning on unlabeled, unannotated pathology slides + + +
+ Definitive cancer diagnosis and management depend upon the extraction of
+information from microscopy images by pathologists. These images contain
+complex information requiring time-consuming expert human interpretation that
+is prone to human bias. Supervised deep learning approaches have proven
+powerful for classification tasks, but they are inherently limited by the cost
+and quality of annotations used for training these models. To address this
+limitation of supervised methods, we developed Histomorphological Phenotype
+Learning (HPL), a fully self-supervised methodology that requires no expert
+labels or annotations and operates via the automatic discovery of
+discriminatory image features in small image tiles. Tiles are grouped into
+morphologically similar clusters which constitute a library of
+histomorphological phenotypes, revealing trajectories from benign to malignant
+tissue via inflammatory and reactive phenotypes. These clusters have distinct
+features which can be identified using orthogonal methods, linking histologic,
+molecular and clinical phenotypes. Applied to lung cancer tissues, we show that
+they align closely with patient survival, with histopathologically recognised
+tumor types and growth patterns, and with transcriptomic measures of
+immunophenotype. We then demonstrate that these properties are maintained in a
+multi-cancer study. These results show the clusters represent recurrent host
+responses and modes of tumor growth emerging under natural selection. Code,
+pre-trained models, learned embeddings, and documentation are available to the
+community at
+https://github.com/AdalbertoCq/Histomorphological-Phenotype-Learning
+
+
+
+
+
+
+ + ♻ ☆ Clutter Detection and Removal in 3D Scenes with View-Consistent + Inpainting ICCV 2023 + + +
+ Removing clutter from scenes is essential in many applications, ranging from +privacy-concerned content filtering to data augmentation. In this work, we +present an automatic system that removes clutter from 3D scenes and inpaints +with coherent geometry and texture. We propose techniques for its two key +components: 3D segmentation from shared properties and 3D inpainting, both of +which are important problems. The definition of 3D scene clutter +(frequently-moving objects) is not well captured by commonly-studied object +categories in computer vision. To tackle the lack of well-defined clutter +annotations, we group noisy fine-grained labels, leverage virtual rendering, +and impose an instance-level area-sensitive loss. Once clutter is removed, we +inpaint geometry and texture in the resulting holes by merging inpainted RGB-D +images. This requires novel voting and pruning strategies that guarantee +multi-view consistency across individually inpainted images for mesh +reconstruction. Experiments on ScanNet and Matterport dataset show that our +method outperforms baselines for clutter segmentation and 3D inpainting, both +visually and quantitatively. + +
+
+ comment: 18 pages. ICCV 2023. Project page: + https://weify627.github.io/clutter/ +
+
+
+
+
+ + ♻ ☆ Making a Case for 3D Convolutions for Object Segmentation in Videos BMVC '20 + + +
+ The task of object segmentation in videos is usually accomplished by
+processing appearance and motion information separately using standard 2D
+convolutional networks, followed by a learned fusion of the two sources of
+information. On the other hand, 3D convolutional networks have been
+successfully applied for video classification tasks, but have not been
+leveraged as effectively for problems involving dense per-pixel interpretation
+of videos, lagging behind their 2D convolutional counterparts in performance.
+In this work, we show that 3D CNNs can be effectively applied to dense video
+prediction tasks such as salient object segmentation. We propose a simple yet
+effective encoder-decoder network architecture consisting entirely of 3D
+convolutions that can be trained end-to-end using a standard cross-entropy
+loss. To this end, we leverage an efficient 3D encoder, and propose a 3D
+decoder architecture that comprises novel 3D Global Convolution layers and 3D
+Refinement modules. Our approach outperforms the existing state of the art by a
+large margin on the DAVIS'16 Unsupervised, FBMS, and ViSal benchmarks in
+addition to being faster, thus showing that our architecture can efficiently
+learn expressive spatio-temporal features and produce high-quality video
+segmentation masks. We have made our code and trained models publicly available
+at https://github.com/sabarim/3DC-Seg.
+
+
+
+ comment: BMVC '20 +
+
+
+
+
+ + ♻ ☆ One Object at a Time: Accurate and Robust Structure From Motion for + Robots IROS + + +
+ A gaze-fixating robot perceives distance to the fixated object and relative +positions of surrounding objects immediately, accurately, and robustly. We show +how fixation, which is the act of looking at one object while moving, exploits +regularities in the geometry of 3D space to obtain this information. These +regularities introduce rotation-translation couplings that are not commonly +used in structure from motion. To validate, we use a Franka Emika Robot with an +RGB camera. We a) find that error in distance estimate is less than 5 mm at a +distance of 15 cm, and b) show how relative position can be used to find +obstacles under challenging scenarios. We combine accurate distance estimates +and obstacle information into a reactive robot behavior that is able to pick up +objects of unknown size, while impeded by unforeseen obstacles. Project page: +https://oxidification.com/p/one-object-at-a-time/ . + +
+
+ comment: v3: Add link to project page v2: Update DOI v1: Accepted at 2022 + IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ Self-Supervised Representation Learning with Cross-Context Learning + between Global and Hypercolumn Features + + +
+ Whilst contrastive learning yields powerful representations by matching +different augmented views of the same instance, it lacks the ability to capture +the similarities between different instances. One popular way to address this +limitation is by learning global features (after the global pooling) to capture +inter-instance relationships based on knowledge distillation, where the global +features of the teacher are used to guide the learning of the global features +of the student. Inspired by cross-modality learning, we extend this existing +framework that only learns from global features by encouraging the global +features and intermediate layer features to learn from each other. This leads +to our novel self-supervised framework: cross-context learning between global +and hypercolumn features (CGH), that enforces the consistency of instance +relations between low- and high-level semantics. Specifically, we stack the +intermediate feature maps to construct a hypercolumn representation so that we +can measure instance relations using two contexts (hypercolumn and global +feature) separately, and then use the relations of one context to guide the +learning of the other. This cross-context learning allows the model to learn +from the differences between the two contexts. The experimental results on +linear classification and downstream tasks show that our method outperforms the +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ STEm-Seg: Spatio-temporal Embeddings for Instance Segmentation in Videos ECCV 2020 + + +
+ Existing methods for instance segmentation in videos typically involve +multi-stage pipelines that follow the tracking-by-detection paradigm and model +a video clip as a sequence of images. Multiple networks are used to detect +objects in individual frames, and then associate these detections over time. +Hence, these methods are often non-end-to-end trainable and highly tailored to +specific tasks. In this paper, we propose a different approach that is +well-suited to a variety of tasks involving instance segmentation in videos. In +particular, we model a video clip as a single 3D spatio-temporal volume, and +propose a novel approach that segments and tracks instances across space and +time in a single stage. Our problem formulation is centered around the idea of +spatio-temporal embeddings which are trained to cluster pixels belonging to a +specific object instance over an entire video clip. To this end, we introduce +(i) novel mixing functions that enhance the feature representation of +spatio-temporal embeddings, and (ii) a single-stage, proposal-free network that +can reason about temporal context. Our network is trained end-to-end to learn +spatio-temporal embeddings as well as parameters required to cluster these +embeddings, thus simplifying inference. Our method achieves state-of-the-art +results across multiple datasets and tasks. Code and models are available at +https://github.com/sabarim/STEm-Seg. + +
+
+ comment: ECCV 2020 28 pages, 6 figures +
+
+
+
+
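+
+ For intuition about the clustering objective behind spatio-temporal embeddings,
+the toy PyTorch sketch below pulls the embeddings of all pixels belonging to
+the same instance toward their clip-wide mean. This is only a generic "pull"
+term under stated assumptions; the paper's full formulation also involves
+variance/seediness terms and learned clustering parameters.
+
+# Toy pull-to-centre loss for spatio-temporal instance embeddings.
+# emb: (T, H, W, D) per-pixel embeddings; inst: (T, H, W) instance ids (0 = background).
+import torch
+
+def pull_loss(emb: torch.Tensor, inst: torch.Tensor) -> torch.Tensor:
+    loss, count = emb.new_zeros(()), 0
+    for k in inst.unique():
+        if k == 0:
+            continue
+        pix = emb[inst == k]                  # all pixels of instance k over the clip
+        center = pix.mean(dim=0)
+        loss = loss + ((pix - center) ** 2).sum(dim=1).mean()
+        count += 1
+    return loss / max(count, 1)
+
+emb = torch.randn(4, 8, 8, 16, requires_grad=True)    # T=4 frames
+inst = torch.randint(0, 3, (4, 8, 8))
+pull_loss(emb, inst).backward()
+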
+ + ♻ ☆ CLIPAG: Towards Generator-Free Text-to-Image Generation + + +
+ Perceptually Aligned Gradients (PAG) refer to an intriguing property observed
+in robust image classification models, wherein their input gradients align with
+human perception and carry semantic meaning. While this phenomenon has gained
+significant research attention, it has so far been studied solely in the
+context of unimodal vision-only architectures. In this work, we extend the
+study of PAG to Vision-Language architectures, which form the foundations for
+diverse image-text tasks and applications. Through adversarially robust
+fine-tuning of CLIP, we demonstrate that robust Vision-Language models exhibit
+PAG in contrast to their vanilla counterparts. This work reveals the merits of
+CLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we
+show that seamlessly integrating CLIPAG in a "plug-n-play" manner leads to
+substantial improvements in vision-language generative applications.
+Furthermore, leveraging its PAG property, CLIPAG enables text-to-image
+generation without any generative model, a task that typically requires huge
+generators.
+
+
+
+
+
+
+ + ♻ ☆ Sparse resultant based minimal solvers in computer vision and their + connection with the action matrix + + +
+ Many computer vision applications require robust and efficient estimation of +camera geometry from a minimal number of input data measurements, i.e., solving +minimal problems in a RANSAC framework. Minimal problems are usually formulated +as complex systems of sparse polynomials. The systems usually are +overdetermined and consist of polynomials with algebraically constrained +coefficients. Most state-of-the-art efficient polynomial solvers are based on +the action matrix method that has been automated and highly optimized in recent +years. On the other hand, the alternative theory of sparse resultants and +Newton polytopes has been less successful for generating efficient solvers, +primarily because the polytopes do not respect the constraints on the +coefficients. Therefore, in this paper, we propose a simple iterative scheme to +test various subsets of the Newton polytopes and search for the most efficient +solver. Moreover, we propose to use an extra polynomial with a special form to +further improve the solver efficiency via a Schur complement computation. We +show that for some camera geometry problems our extra polynomial-based method +leads to smaller and more stable solvers than the state-of-the-art Grobner +basis-based solvers. The proposed method can be fully automated and +incorporated into existing tools for automatic generation of efficient +polynomial solvers. It provides a competitive alternative to popular Grobner +basis-based methods for minimal problems in computer vision. We also study the +conditions under which the minimal solvers generated by the state-of-the-art +action matrix-based methods and the proposed extra polynomial resultant-based +method, are equivalent. Specifically we consider a step-by-step comparison +between the approaches based on the action matrix and the sparse resultant, +followed by a set of substitutions, which would lead to equivalent minimal +solvers. + +
+
+ comment: arXiv admin note: text overlap with arXiv:1912.10268 +
+
+
+
+
+ + ♻ ☆ Viewset Diffusion: (0-)Image-Conditioned 3D Generative Models from 2D + Data + + +
+ We present Viewset Diffusion, a diffusion-based generator that outputs 3D
+objects while only using multi-view 2D data for supervision. We note that there
+exists a one-to-one mapping between viewsets, i.e., collections of several 2D
+views of an object, and 3D models. Hence, we train a diffusion model to
+generate viewsets, but design the neural network generator to reconstruct
+internally corresponding 3D models, thus generating those too. We fit a
+diffusion model to a large number of viewsets for a given category of objects.
+The resulting generator can be conditioned on zero, one or more input views.
+Conditioned on a single view, it performs 3D reconstruction accounting for the
+ambiguity of the task and allowing multiple solutions compatible with the input
+to be sampled. The model performs reconstruction efficiently, in a feed-forward
+manner, and is trained using only rendering losses, with as few as three views
+per viewset. Project page: szymanowiczs.github.io/viewset-diffusion.
+
+
+
+ comment: International Conference on Computer Vision 2023 +
+
+
+
+
+ + ♻ ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ♻ ☆ LaserMix for Semi-Supervised LiDAR Semantic Segmentation CVPR 2023 + + +
+ Densely annotating LiDAR point clouds is costly, which restrains the +scalability of fully-supervised learning methods. In this work, we study the +underexplored semi-supervised learning (SSL) in LiDAR segmentation. Our core +idea is to leverage the strong spatial cues of LiDAR point clouds to better +exploit unlabeled data. We propose LaserMix to mix laser beams from different +LiDAR scans, and then encourage the model to make consistent and confident +predictions before and after mixing. Our framework has three appealing +properties: 1) Generic: LaserMix is agnostic to LiDAR representations (e.g., +range view and voxel), and hence our SSL framework can be universally applied. +2) Statistically grounded: We provide a detailed analysis to theoretically +explain the applicability of the proposed framework. 3) Effective: +Comprehensive experimental analysis on popular LiDAR segmentation datasets +(nuScenes, SemanticKITTI, and ScribbleKITTI) demonstrates our effectiveness and +superiority. Notably, we achieve competitive results over fully-supervised +counterparts with 2x to 5x fewer labels and improve the supervised-only +baseline significantly by 10.8% on average. We hope this concise yet +high-performing framework could facilitate future research in semi-supervised +LiDAR segmentation. Code is publicly available. + +
+
+ comment: CVPR 2023 (Highlight); 27 pages, 11 figures, 12 tables; Code at + https://github.com/ldkong1205/LaserMix +
+
+
+
+
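+
+ A rough numpy illustration of the beam-mixing idea from the LaserMix abstract
+above: points of two LiDAR scans are partitioned into inclination-angle bands
+and alternating bands are taken from each scan. The band count and per-scan
+binning are simplifications; the consistency-training loop around the mixing is
+omitted.
+
+# Sketch: mix two LiDAR point clouds by alternating inclination-angle bands.
+import numpy as np
+
+def laser_mix(points_a: np.ndarray, points_b: np.ndarray, n_bands: int = 6):
+    """points_*: (N, 3+) arrays with x, y, z in the first three columns."""
+    def band_index(pts):
+        inclination = np.arctan2(pts[:, 2], np.linalg.norm(pts[:, :2], axis=1))
+        edges = np.linspace(inclination.min(), inclination.max() + 1e-6, n_bands + 1)
+        return np.digitize(inclination, edges) - 1
+
+    ia, ib = band_index(points_a), band_index(points_b)
+    keep_a = points_a[ia % 2 == 0]            # even bands from scan A
+    keep_b = points_b[ib % 2 == 1]            # odd bands from scan B
+    return np.concatenate([keep_a, keep_b], axis=0)
+
+mixed = laser_mix(np.random.randn(5000, 4), np.random.randn(5000, 4))
+print(mixed.shape)
+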
+ + ♻ ☆ StyleHumanCLIP: Text-guided Garment Manipulation for StyleGAN-Human + + +
+ This paper tackles text-guided control of StyleGAN for editing garments in
+full-body human images. Existing StyleGAN-based methods struggle to handle the
+rich diversity of garments, body shapes, and poses. We propose a framework for
+text-guided full-body human image synthesis via an attention-based latent code
+mapper, which enables more disentangled control of StyleGAN than existing
+mappers. Our latent code mapper adopts an attention mechanism that adaptively
+manipulates individual latent codes on different StyleGAN layers under text
+guidance. In addition, we introduce feature-space masking at inference time to
+avoid unwanted changes caused by text inputs. Our quantitative and qualitative
+evaluations reveal that our method can control generated images more faithfully
+to given texts than existing methods.
+
+
+
+
+
+
+ + ♻ ☆ VisAlign: Dataset for Measuring the Degree of Alignment between AI and + Humans in Visual Perception + + +
+ AI alignment refers to models acting towards human-intended goals,
+preferences, or ethical principles. Given that most large-scale deep learning
+models act as black boxes and cannot be manually controlled, analyzing the
+similarity between models and humans can be a proxy measure for ensuring AI
+safety. In this paper, we focus on the models' visual perception alignment with
+humans, further referred to as AI-human visual alignment. Specifically, we
+propose a new dataset for measuring AI-human visual alignment in terms of image
+classification, a fundamental task in machine perception. In order to evaluate
+AI-human visual alignment, a dataset should encompass samples with various
+scenarios that may arise in the real world and have gold human perception
+labels. Our dataset consists of three groups of samples, namely Must-Act (i.e.,
+Must-Classify), Must-Abstain, and Uncertain, based on the quantity and clarity
+of visual information in an image and further divided into eight categories.
+All samples have a gold human perception label; even Uncertain (severely
+blurry) sample labels were obtained via crowd-sourcing. The validity of our
+dataset is verified by sampling theory, statistical theories related to survey
+design, and experts in the related fields. Using our dataset, we analyze the
+visual alignment and reliability of five popular visual perception models and
+seven abstention methods. Our code and data are available at
+\url{https://github.com/jiyounglee-0523/VisAlign}.
+
+
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Lane Detection via Cross-Similarity + + +
+ Detecting road lanes is challenging due to intricate markings vulnerable to
+unfavorable conditions. Lane markings have strong shape priors, but their
+visibility is easily compromised. Factors like lighting, weather, vehicles,
+pedestrians, and aging colors challenge the detection. A large amount of data
+is required to train a lane detection approach that can withstand natural
+variations caused by low visibility, because numerous lane shapes and natural
+variations exist. Our solution, Contrastive Learning for Lane Detection via
+cross-similarity (CLLD), is a self-supervised learning method that tackles this
+challenge by enhancing lane detection models' resilience to real-world
+conditions that cause low lane visibility. CLLD is a novel multitask
+contrastive learning framework that trains lane detection approaches to detect
+lane markings even in low-visibility situations by integrating local feature
+contrastive learning (CL) with our newly proposed cross-similarity operation.
+Local feature CL focuses on extracting features for small image parts, which is
+necessary to localize lane segments, while cross-similarity captures global
+features to detect obscured lane segments using their surroundings. We enhance
+cross-similarity by randomly masking parts of input images for augmentation.
+Evaluated on benchmark datasets, CLLD outperforms state-of-the-art contrastive
+learning methods, especially in visibility-impairing conditions like shadows.
+Compared to supervised learning, CLLD excels in scenarios like shadows and
+crowded scenes.
+
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ FLatten Transformer: Vision Transformer using Focused Linear Attention ICCV 2023 + + +
+ The quadratic computation complexity of self-attention has been a persistent +challenge when applying Transformer models to vision tasks. Linear attention, +on the other hand, offers a much more efficient alternative with its linear +complexity by approximating the Softmax operation through carefully designed +mapping functions. However, current linear attention approaches either suffer +from significant performance degradation or introduce additional computation +overhead from the mapping functions. In this paper, we propose a novel Focused +Linear Attention module to achieve both high efficiency and expressiveness. +Specifically, we first analyze the factors contributing to the performance +degradation of linear attention from two perspectives: the focus ability and +feature diversity. To overcome these limitations, we introduce a simple yet +effective mapping function and an efficient rank restoration module to enhance +the expressiveness of self-attention while maintaining low computation +complexity. Extensive experiments show that our linear attention module is +applicable to a variety of advanced vision Transformers, and achieves +consistently improved performances on multiple benchmarks. Code is available at +https://github.com/LeapLabTHU/FLatten-Transformer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
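+
+ For context on the linear-attention formulation that the FLatten abstract
+builds on, the sketch below implements generic kernel-based linear attention in
+PyTorch with a simple elu(x)+1 feature map. This is the standard linear
+attention baseline, not the paper's focused mapping function or rank
+restoration module.
+
+# Generic linear attention: softmax(QK^T)V is approximated by
+# phi(Q) (phi(K)^T V) / (phi(Q) phi(K)^T 1), giving linear cost in sequence length.
+import torch
+import torch.nn.functional as F
+
+def linear_attention(q, k, v, eps=1e-6):
+    """q, k, v: (B, N, d). Uses phi(x) = elu(x) + 1 as the feature map."""
+    q, k = F.elu(q) + 1, F.elu(k) + 1
+    kv = torch.einsum("bnd,bne->bde", k, v)             # (B, d, d_v)
+    z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps)
+    return torch.einsum("bnd,bde,bn->bne", q, kv, z)
+
+q = k = v = torch.randn(2, 1024, 64)
+print(linear_attention(q, k, v).shape)                  # torch.Size([2, 1024, 64])
+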
+ + ♻ ☆ Industrial Anomaly Detection with Domain Shift: A Real-world Dataset and + Masked Multi-scale Reconstruction + + +
+ Industrial anomaly detection (IAD) is crucial for automating industrial +quality inspection. The diversity of the datasets is the foundation for +developing comprehensive IAD algorithms. Existing IAD datasets focus on the +diversity of data categories, overlooking the diversity of domains within the +same data category. In this paper, to bridge this gap, we propose the +Aero-engine Blade Anomaly Detection (AeBAD) dataset, consisting of two +sub-datasets: the single-blade dataset and the video anomaly detection dataset +of blades. Compared to existing datasets, AeBAD has the following two +characteristics: 1.) The target samples are not aligned and at different +scales. 2.) There is a domain shift between the distribution of normal samples +in the test set and the training set, where the domain shifts are mainly caused +by the changes in illumination and view. Based on this dataset, we observe that +current state-of-the-art (SOTA) IAD methods exhibit limitations when the domain +of normal samples in the test set undergoes a shift. To address this issue, we +propose a novel method called masked multi-scale reconstruction (MMR), which +enhances the model's capacity to deduce causality among patches in normal +samples by a masked reconstruction task. MMR achieves superior performance +compared to SOTA methods on the AeBAD dataset. Furthermore, MMR achieves +competitive performance with SOTA methods to detect the anomalies of different +types on the MVTec AD dataset. Code and dataset are available at +https://github.com/zhangzilongc/MMR. + +
+
+ comment: Accept by Computers in Industry +
+
+
+
+
+ + ♻ ☆ Improving Differentiable Architecture Search via Self-Distillation + + +
+ Differentiable Architecture Search (DARTS) is a simple yet efficient Neural +Architecture Search (NAS) method. During the search stage, DARTS trains a +supernet by jointly optimizing architecture parameters and network parameters. +During the evaluation stage, DARTS discretizes the supernet to derive the +optimal architecture based on architecture parameters. However, recent research +has shown that during the training process, the supernet tends to converge +towards sharp minima rather than flat minima. This is evidenced by the higher +sharpness of the loss landscape of the supernet, which ultimately leads to a +performance gap between the supernet and the optimal architecture. In this +paper, we propose Self-Distillation Differentiable Neural Architecture Search +(SD-DARTS) to alleviate the discretization gap. We utilize self-distillation to +distill knowledge from previous steps of the supernet to guide its training in +the current step, effectively reducing the sharpness of the supernet's loss and +bridging the performance gap between the supernet and the optimal architecture. +Furthermore, we introduce the concept of voting teachers, where multiple +previous supernets are selected as teachers, and their output probabilities are +aggregated through voting to obtain the final teacher prediction. Experimental +results on real datasets demonstrate the advantages of our novel +self-distillation-based NAS method compared to state-of-the-art alternatives. + +
+
+ comment: Accepted by Neural Networks +
+
+
+
+
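+
+ The "voting teachers" idea in the abstract above can be sketched as follows:
+softmax outputs of several earlier supernet snapshots are averaged into a
+single teacher distribution, which is then distilled into the current supernet
+with a temperature-scaled KL term. Averaging as the aggregation rule and the
+temperature value are assumptions for illustration.
+
+# Sketch: distill the aggregated prediction of previous supernet snapshots.
+import torch
+import torch.nn.functional as F
+
+def voting_distillation_loss(student_logits, teacher_logits_list, T=2.0):
+    teacher_probs = torch.stack(
+        [F.softmax(t / T, dim=-1) for t in teacher_logits_list]
+    ).mean(dim=0)                                        # aggregated ("voted") teacher
+    log_student = F.log_softmax(student_logits / T, dim=-1)
+    return F.kl_div(log_student, teacher_probs, reduction="batchmean") * T * T
+
+student = torch.randn(8, 10)
+teachers = [torch.randn(8, 10) for _ in range(3)]        # earlier supernet outputs
+print(voting_distillation_loss(student, teachers))
+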
+ + ♻ ☆ Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models ICCV23 + + +
+ Diffusion models have become a popular approach for image generation and +reconstruction due to their numerous advantages. However, most diffusion-based +inverse problem-solving methods only deal with 2D images, and even recently +published 3D methods do not fully exploit the 3D distribution prior. To address +this, we propose a novel approach using two perpendicular pre-trained 2D +diffusion models to solve the 3D inverse problem. By modeling the 3D data +distribution as a product of 2D distributions sliced in different directions, +our method effectively addresses the curse of dimensionality. Our experimental +results demonstrate that our method is highly effective for 3D medical image +reconstruction tasks, including MRI Z-axis super-resolution, compressed sensing +MRI, and sparse-view CT. Our method can generate high-quality voxel volumes +suitable for medical applications. + +
+
+ comment: ICCV23 poster. 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ FAM: fast adaptive federated meta-learning + + +
+ In this work, we propose a fast adaptive federated meta-learning (FAM)
+framework for collaboratively learning a single global model, which can then be
+personalized locally on individual clients. Federated learning enables multiple
+clients to collaborate to train a model without sharing data. Clients with
+insufficient data or data diversity participate in federated learning to learn
+a model with superior performance. Nonetheless, learning suffers when data
+distributions diverge. A global model that can be adapted using
+client-specific information to create personalized models on each client is
+therefore required. MRI data suffers from this problem: first, due to data
+acquisition challenges, the local data at a site is insufficient for training
+an accurate model; second, data sharing is restricted due to privacy concerns;
+and third, the learnt shared global model must be personalized to account for
+domain shift across client sites. The global model is sparse and captures the
+common features in the MRI. This skeleton network is grown on each client to
+train a personalized model by learning additional client-specific parameters
+from local data. Experimental results show that the personalization process at
+each client quickly converges using a limited number of epochs. The
+personalized client models outperformed the locally trained models,
+demonstrating the efficacy of the FAM mechanism. Additionally, the sparse
+parameter set to be communicated during federated learning drastically reduces
+communication overhead, which makes the scheme viable for networks with limited
+resources.
+
+
+
+ comment: 13 Pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models + + +
+ Zero-shot referring image segmentation is a challenging task because it aims +to find an instance segmentation mask based on the given referring +descriptions, without training on this type of paired data. Current zero-shot +methods mainly focus on using pre-trained discriminative models (e.g., CLIP). +However, we have observed that generative models (e.g., Stable Diffusion) have +potentially understood the relationships between various visual elements and +text descriptions, which are rarely investigated in this task. In this work, we +introduce a novel Referring Diffusional segmentor (Ref-Diff) for this task, +which leverages the fine-grained multi-modal information from generative +models. We demonstrate that without a proposal generator, a generative model +alone can achieve comparable performance to existing SOTA weakly-supervised +models. When we combine both generative and discriminative models, our Ref-Diff +outperforms these competing methods by a significant margin. This indicates +that generative models are also beneficial for this task and can complement +discriminative models for better referring segmentation. Our code is publicly +available at https://github.com/kodenii/Ref-Diff. + +
+
+
+
+
+ + ♻ ☆ MeDM: Mediating Image Diffusion Models for Video-to-Video Translation + with Temporal Correspondence Guidance + + +
+ This study introduces an efficient and effective method, MeDM, that utilizes +pre-trained image Diffusion Models for video-to-video translation with +consistent temporal flow. The proposed framework can render videos from scene +position information, such as a normal G-buffer, or perform text-guided editing +on videos captured in real-world scenarios. We employ explicit optical flows to +construct a practical coding that enforces physical constraints on generated +frames and mediates independent frame-wise scores. By leveraging this coding, +maintaining temporal consistency in the generated videos can be framed as an +optimization problem with a closed-form solution. To ensure compatibility with +Stable Diffusion, we also suggest a workaround for modifying observed-space +scores in latent-space Diffusion Models. Notably, MeDM does not require +fine-tuning or test-time optimization of the Diffusion Models. Through +extensive qualitative, quantitative, and subjective experiments on various +benchmarks, the study demonstrates the effectiveness and superiority of the +proposed approach. Project page can be found at https://medm2023.github.io + +
+
+ comment: Project page: https://medm2023.github.io +
+
+
+
+
+ + ♻ ☆ ComCLIP: Training-Free Compositional Image and Text Matching + + +
+ Contrastive Language-Image Pretraining (CLIP) has demonstrated great
+zero-shot performance for matching images and text. However, it is still
+challenging to adapt vision-language pretrained models like CLIP to
+compositional image and text matching -- a more challenging image and text
+matching task requiring the model to understand compositional word concepts
+and visual components. Towards better compositional generalization in zero-shot
+image and text matching, in this paper, we study the problem from a causal
+perspective: the erroneous semantics of individual entities are essentially
+confounders that cause the matching failure. Therefore, we propose a novel
+\textbf{\textit{training-free}} compositional CLIP model (ComCLIP). ComCLIP
+disentangles input images into subjects, objects, and action sub-images and
+composes CLIP's vision encoder and text encoder to perform evolving matching
+over compositional text embedding and sub-image embeddings. In this way,
+ComCLIP can mitigate spurious correlations introduced by the pretrained CLIP
+models and dynamically evaluate the importance of each component. Experiments
+on four compositional image-text matching datasets (SVO, ComVG, Winoground, and
+VL-checklist) and two general image-text retrieval datasets (Flickr30K and
+MSCOCO) demonstrate the effectiveness of our plug-and-play method, which boosts
+the \textbf{\textit{zero-shot}} inference ability of CLIP, SLIP, and BLIP2 even
+without further training or fine-tuning.
+
+
+
+
+
+
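+
+ A hedged sketch of the compositional-matching idea from the ComCLIP abstract
+above: the full image and its subject/object/action sub-images are encoded with
+CLIP and their similarities to the caption are combined. The sub-image crops
+are assumed to come from an upstream parser/segmenter, and the simple weighted
+score fusion is illustrative rather than the paper's exact procedure.
+
+# Sketch: combine CLIP similarities of a full image and its entity sub-images.
+import torch
+import clip                      # OpenAI CLIP package
+from PIL import Image
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+def composite_score(full_img, sub_imgs, caption, weight=0.5):
+    images = torch.stack([preprocess(im) for im in [full_img, *sub_imgs]]).to(device)
+    text = clip.tokenize([caption]).to(device)
+    with torch.no_grad():
+        img_feat = model.encode_image(images).float()
+        txt_feat = model.encode_text(text).float()
+    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+    sims = (img_feat @ txt_feat.T).squeeze(-1)           # (1 + n_sub_images,)
+    return sims[0] + weight * sims[1:].mean()            # full-image + entity evidence
+
+# Usage (paths are placeholders):
+# score = composite_score(Image.open("img.jpg"), [Image.open("subj.jpg")],
+#                         "a dog chasing a ball")
+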
+ + ♻ ☆ Real-time Strawberry Detection Based on Improved YOLOv5s Architecture + for Robotic Harvesting in open-field environment + + +
+ This study proposed a YOLOv5-based custom object detection model to detect
+strawberries in an outdoor environment. The original architecture of the
+YOLOv5s was modified by replacing the C3 module with the C2f module in the
+backbone network, which provided a better feature gradient flow. Secondly, the
+Spatial Pyramid Pooling Fast in the final layer of the backbone network of
+YOLOv5s was combined with Cross Stage Partial Net to improve the generalization
+ability over the strawberry dataset in this study. The proposed architecture
+was named YOLOv5s-Straw. An RGB image dataset of the strawberry canopy with
+three maturity classes (immature, nearly mature, and mature) was collected in
+an open-field environment and augmented through a series of operations
+including brightness reduction, brightness increase, and noise adding. To
+verify the superiority of the proposed method for strawberry detection in the
+open-field environment, four competitive detection models (YOLOv3-tiny,
+YOLOv5s, YOLOv5s-C2f, and YOLOv8s) were trained and tested under the same
+computational environment and compared with YOLOv5s-Straw. The results showed
+that the proposed architecture achieved the highest mean average precision of
+80.3%, whereas YOLOv3-tiny, YOLOv5s, YOLOv5s-C2f, and YOLOv8s achieved 73.4%,
+77.8%, 79.8%, and 79.3%, respectively. Specifically, the average precision of
+YOLOv5s-Straw was 82.1% in the immature class, 73.5% in the nearly mature
+class, and 86.6% in the mature class, which were 2.3% and 3.7% higher,
+respectively, than those of the latest YOLOv8s. The model has 8.6*10^6 network
+parameters and an inference speed of 18 ms per image, while YOLOv8s has a
+slower inference speed of 21.0 ms and heavier parameters of 11.1*10^6, which
+indicates that the proposed model is fast enough for real-time strawberry
+detection and localization for robotic picking.
+
+
+
+ comment: 20 pages; 15 figures +
+
+
+
+
+ + ♻ ☆ Mutual-Guided Dynamic Network for Image Fusion + + +
+ Image fusion aims to generate a high-quality image from multiple images +captured under varying conditions. The key problem of this task is to preserve +complementary information while filtering out irrelevant information for the +fused result. However, existing methods address this problem by leveraging +static convolutional neural networks (CNNs), suffering two inherent limitations +during feature extraction, i.e., being unable to handle spatial-variant +contents and lacking guidance from multiple inputs. In this paper, we propose a +novel mutual-guided dynamic network (MGDN) for image fusion, which allows for +effective information utilization across different locations and inputs. +Specifically, we design a mutual-guided dynamic filter (MGDF) for adaptive +feature extraction, composed of a mutual-guided cross-attention (MGCA) module +and a dynamic filter predictor, where the former incorporates additional +guidance from different inputs and the latter generates spatial-variant kernels +for different locations. In addition, we introduce a parallel feature fusion +(PFF) module to effectively fuse local and global information of the extracted +features. To further reduce the redundancy among the extracted features while +simultaneously preserving their shared structural information, we devise a +novel loss function that combines the minimization of normalized mutual +information (NMI) with an estimated gradient mask. Experimental results on five +benchmark datasets demonstrate that our proposed method outperforms existing +methods on four image fusion tasks. The code and model are publicly available +at: https://github.com/Guanys-dar/MGDN. + +
+
+ comment: ACMMM 2023 accepted +
+
+
+
+
+ + ♻ ☆ Variational Denoising Network: Toward Blind Noise Modeling and Removal + + +
+ Blind image denoising is an important yet very challenging problem in
+computer vision due to the complicated acquisition process of real images. In
+this work, we propose a new variational inference method, which integrates both
+noise estimation and image denoising into a unique Bayesian framework, for
+blind image denoising. Specifically, an approximate posterior, parameterized by
+deep neural networks, is presented by taking the intrinsic clean image and
+noise variances as latent variables conditioned on the input noisy image. This
+posterior provides explicit parametric forms for all its involved
+hyper-parameters, and thus can be easily implemented for blind image denoising
+with automatic noise estimation for the test noisy image. On the one hand, like
+other data-driven deep learning methods, our method, namely the variational
+denoising network (VDN), can perform denoising efficiently due to the explicit
+form of its posterior expression. On the other hand, VDN inherits the
+advantages of traditional model-driven approaches, especially the good
+generalization capability of generative models. VDN has good interpretability
+and can be flexibly utilized to estimate and remove complicated non-i.i.d.
+noise collected in real scenarios. Comprehensive experiments are performed to
+substantiate the superiority of our method in blind image denoising.
+
+
+
+ comment: Correct a minor typo +
+
+
+
+
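+
+ To make the variational treatment of noise concrete, the toy sketch below has
+a small network predict both a clean-image estimate and a per-pixel noise
+log-variance for a noisy input, trained with a heteroscedastic Gaussian
+negative log-likelihood. This is a generic sketch of the idea, not VDN's full
+evidence lower bound or network design.
+
+# Toy blind-denoising objective: predict clean mean and noise log-variance.
+import torch
+import torch.nn as nn
+
+class TinyBlindDenoiser(nn.Module):
+    def __init__(self, ch=3):
+        super().__init__()
+        self.body = nn.Sequential(
+            nn.Conv2d(ch, 32, 3, padding=1), nn.ReLU(),
+            nn.Conv2d(32, 2 * ch, 3, padding=1),   # -> [clean mean, noise log-var]
+        )
+
+    def forward(self, noisy):
+        mean, log_var = self.body(noisy).chunk(2, dim=1)
+        return mean, log_var
+
+def gaussian_nll(noisy, mean, log_var):
+    return (0.5 * (log_var + (noisy - mean) ** 2 / log_var.exp())).mean()
+
+net = TinyBlindDenoiser()
+noisy = torch.rand(4, 3, 32, 32)
+mean, log_var = net(noisy)
+gaussian_nll(noisy, mean, log_var).backward()
+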
+ + ♻ ☆ NPC: Neural Point Characters from Video + + +
+ High-fidelity human 3D models can now be learned directly from videos, +typically by combining a template-based surface model with neural +representations. However, obtaining a template surface requires expensive +multi-view capture systems, laser scans, or strictly controlled conditions. +Previous methods avoid using a template but rely on a costly or ill-posed +mapping from observation to canonical space. We propose a hybrid point-based +representation for reconstructing animatable characters that does not require +an explicit surface model, while being generalizable to novel poses. For a +given video, our method automatically produces an explicit set of 3D points +representing approximate canonical geometry, and learns an articulated +deformation model that produces pose-dependent point transformations. The +points serve both as a scaffold for high-frequency neural features and an +anchor for efficiently mapping between observation and canonical space. We +demonstrate on established benchmarks that our representation overcomes +limitations of prior work operating in either canonical or in observation +space. Moreover, our automatic point extraction approach enables learning +models of human and animal characters alike, matching the performance of the +methods using rigged surface templates despite being more general. Project +website: https://lemonatsu.github.io/npc/ + +
+
+ comment: Project website: https://lemonatsu.github.io/npc/ +
+
+
+
+
+ + ♻ ☆ High-Resolution Document Shadow Removal via A Large-Scale Real-World + Dataset and A Frequency-Aware Shadow Erasing Net ICCV + 2023 + + +
+ Shadows often occur when we capture documents with casual equipment, which
+influences the visual quality and readability of the digital copies. Different
+from algorithms for natural shadow removal, algorithms for document shadow
+removal need to preserve the details of fonts and figures in high-resolution
+input. Previous works ignore this problem and remove the shadows via
+approximate attention and small datasets, which might not work in real-world
+situations. We handle high-resolution document shadow removal directly via a
+large-scale real-world dataset and a carefully designed frequency-aware
+network. As for the dataset, we acquire over 7k pairs of high-resolution (2462
+x 3699) real-world document images with various samples under different
+lighting conditions, which is 10 times larger than existing datasets. As for
+the design of the network, we decouple the high-resolution images in the
+frequency domain, where the low-frequency details and high-frequency boundaries
+can be effectively learned via the carefully designed network structure.
+Powered by our network and dataset, the proposed method clearly shows better
+performance than previous methods in terms of visual quality and numerical
+results. The code, models, and dataset are available at:
+https://github.com/CXH-Research/DocShadow-SD7K
+
+
+
+ comment: Accepted by International Conference on Computer Vision 2023 (ICCV + 2023) +
+
+
+
+
+ + ♻ ☆ SpectralDiff: A Generative Framework for Hyperspectral Image + Classification with Diffusion Models + + +
+ Hyperspectral Image (HSI) classification is an important issue in remote +sensing field with extensive applications in earth science. In recent years, a +large number of deep learning-based HSI classification methods have been +proposed. However, existing methods have limited ability to handle +high-dimensional, highly redundant, and complex data, making it challenging to +capture the spectral-spatial distributions of data and relationships between +samples. To address this issue, we propose a generative framework for HSI +classification with diffusion models (SpectralDiff) that effectively mines the +distribution information of high-dimensional and highly redundant data by +iteratively denoising and explicitly constructing the data generation process, +thus better reflecting the relationships between samples. The framework +consists of a spectral-spatial diffusion module, and an attention-based +classification module. The spectral-spatial diffusion module adopts forward and +reverse spectral-spatial diffusion processes to achieve adaptive construction +of sample relationships without requiring prior knowledge of graphical +structure or neighborhood information. It captures spectral-spatial +distribution and contextual information of objects in HSI and mines +unsupervised spectral-spatial diffusion features within the reverse diffusion +process. Finally, these features are fed into the attention-based +classification module for per-pixel classification. The diffusion features can +facilitate cross-sample perception via reconstruction distribution, leading to +improved classification performance. Experiments on three public HSI datasets +demonstrate that the proposed method can achieve better performance than +state-of-the-art methods. For the sake of reproducibility, the source code of +SpectralDiff will be publicly available at +https://github.com/chenning0115/SpectralDiff. + +
+
+
+
+
+ + ♻ ☆ Robust Principles: Architectural Design Principles for Adversarially + Robust CNNs BMVC'23 + + +
+ Our research aims to unify existing works' diverging opinions on how +architectural components affect the adversarial robustness of CNNs. To +accomplish our goal, we synthesize a suite of three generalizable robust +architectural design principles: (a) optimal range for depth and width +configurations, (b) preferring convolutional over patchify stem stage, and (c) +robust residual block design through adopting squeeze and excitation blocks and +non-parametric smooth activation functions. Through extensive experiments +across a wide spectrum of dataset scales, adversarial training methods, model +parameters, and network design spaces, our principles consistently and markedly +improve AutoAttack accuracy: 1-3 percentage points (pp) on CIFAR-10 and +CIFAR-100, and 4-9 pp on ImageNet. The code is publicly available at +https://github.com/poloclub/robust-principles. + +
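To make principle (c) concrete, here is a minimal PyTorch sketch of a residual block with a squeeze-and-excitation module and a smooth, non-parametric activation (SiLU); the channel count, reduction ratio, and normalization below are assumptions, and the paper's full robust block involves further design choices.

```python
import torch
import torch.nn as nn

class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.SiLU(),
            nn.Linear(channels // reduction, channels), nn.Sigmoid())

    def forward(self, x):
        s = x.mean(dim=(2, 3))                      # squeeze: global average pool
        return x * self.fc(s)[:, :, None, None]     # excite: channel re-weighting

class RobustResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels), nn.SiLU(),
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels), SEBlock(channels))
        self.act = nn.SiLU()                         # smooth, non-parametric activation

    def forward(self, x):
        return self.act(x + self.body(x))

print(RobustResidualBlock(64)(torch.randn(2, 64, 32, 32)).shape)
```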
+
+ comment: Published at BMVC'23 +
+
+
+
+
+ + ♻ ☆ BPKD: Boundary Privileged Knowledge Distillation For Semantic + Segmentation + + +
+ Current knowledge distillation approaches in semantic segmentation tend to +adopt a holistic approach that treats all spatial locations equally. However, +for dense prediction, students' predictions on edge regions are highly +uncertain due to contextual information leakage, requiring higher spatial +sensitivity knowledge than the body regions. To address this challenge, this +paper proposes a novel approach called boundary-privileged knowledge +distillation (BPKD). BPKD distills the knowledge of the teacher model's body +and edges separately to the compact student model. Specifically, we employ two +distinct loss functions: (i) edge loss, which aims to distinguish between +ambiguous classes at the pixel level in edge regions; (ii) body loss, which +utilizes shape constraints and selectively attends to the inner-semantic +regions. Our experiments demonstrate that the proposed BPKD method provides +extensive refinements and aggregation for edge and body regions. Additionally, +the method achieves state-of-the-art distillation performance for semantic +segmentation on three popular benchmark datasets, highlighting its +effectiveness and generalization ability. BPKD shows consistent improvements +across a diverse array of lightweight segmentation structures, including both +CNNs and transformers, underscoring its architecture-agnostic adaptability. The +code is available at \url{https://github.com/AkideLiu/BPKD}. + +
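A simplified sketch of the boundary-privileged idea, assuming edge pixels are found with a morphological gradient on the label map and that both terms are plain temperature-scaled KL divergences; the actual BPKD losses (shape constraints, attention to inner-semantic regions) and the edge weight are not taken from the paper.

```python
import torch
import torch.nn.functional as F

def edge_body_masks(labels: torch.Tensor, k: int = 3):
    """labels: (B, H, W) integer class map -> boolean edge and body masks."""
    one_hot = F.one_hot(labels).permute(0, 3, 1, 2).float()
    dilated = F.max_pool2d(one_hot, k, stride=1, padding=k // 2)
    eroded = -F.max_pool2d(-one_hot, k, stride=1, padding=k // 2)
    edge = (dilated - eroded).sum(1) > 0            # pixels near a class boundary
    return edge, ~edge

def boundary_privileged_loss(student_logits, teacher_logits, labels, tau=4.0, w_edge=2.0):
    log_p_s = F.log_softmax(student_logits / tau, dim=1)
    p_t = F.softmax(teacher_logits / tau, dim=1)
    kl = F.kl_div(log_p_s, p_t, reduction="none").sum(1)   # (B, H, W) per-pixel KL
    edge, body = edge_body_masks(labels)
    return w_edge * kl[edge].mean() + kl[body].mean()       # privilege the edge pixels

student = torch.randn(2, 19, 64, 64)    # (B, classes, H, W) student logits
teacher = torch.randn(2, 19, 64, 64)
labels = torch.zeros(2, 64, 64, dtype=torch.long)
labels[:, :, 32:] = 1                   # one clean vertical class boundary
print(boundary_privileged_loss(student, teacher, labels).item())
```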
+
+ comment: 17 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction ICCV2023 + + +
+ Vectorized high-definition map online construction has garnered considerable +attention in the field of autonomous driving research. Most existing approaches +model changeable map elements using a fixed number of points, or predict local +maps in a two-stage autoregressive manner, which may miss essential details and +lead to error accumulation. Towards precise map element learning, we propose a +simple yet effective architecture named PivotNet, which adopts unified +pivot-based map representations and is formulated as a direct set prediction +paradigm. Concretely, we first propose a novel point-to-line mask module to +encode both the subordinate and geometrical point-line priors in the network. +Then, a well-designed pivot dynamic matching module is proposed to model the +topology in dynamic point sequences by introducing the concept of sequence +matching. Furthermore, to supervise the position and topology of the vectorized +point predictions, we propose a dynamic vectorized sequence loss. Extensive +experiments and ablations show that PivotNet is remarkably superior to other +SOTAs by 5.9 mAP at least. The code will be available soon. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation + with Meta-Learning + + +
+ While the Segment Anything Model (SAM) excels in semantic segmentation for +general-purpose images, its performance significantly deteriorates when applied +to medical images, primarily attributable to insufficient representation of +medical images in its training dataset. Nonetheless, gathering comprehensive +datasets and training models that are universally applicable is particularly +challenging due to the long-tail problem common in medical images. To address +this gap, here we present a Self-Sampling Meta SAM (SSM-SAM) framework for +few-shot medical image segmentation. Our innovation lies in the design of three +key modules: 1) An online fast gradient descent optimizer, further optimized by +a meta-learner, which ensures swift and robust adaptation to new tasks. 2) A +Self-Sampling module designed to provide well-aligned visual prompts for +improved attention allocation; and 3) A robust attention-based decoder +specifically designed for medical few-shot learning to capture relationships +between different slices. Extensive experiments on a popular abdominal CT +dataset and an MRI dataset demonstrate that the proposed method achieves +significant improvements over state-of-the-art methods in few-shot +segmentation, with average improvements of 10.21% and 1.80% in terms of DSC, +respectively. In conclusion, we present a novel approach for rapid online +adaptation in interactive image segmentation, adapting to a new organ in just +0.83 minutes. Code is publicly available on GitHub upon acceptance. + +
+
+
+
+
+ + ♻ ☆ EasyNet: An Easy Network for 3D Industrial Anomaly Detection + + +
+ 3D anomaly detection is an emerging and vital computer vision task in +industrial manufacturing (IM). Recently, many advanced algorithms have been +published, but most of them cannot meet the needs of IM. These methods have several +disadvantages: i) they are difficult to deploy on production lines since their +algorithms rely heavily on large pre-trained models; ii) they hugely increase +storage overhead due to overuse of memory banks; iii) their inference speed +cannot reach real time. To overcome these issues, we propose an easy +and deployment-friendly network (called EasyNet) without using pre-trained +models and memory banks: firstly, we design a multi-scale multi-modality +feature encoder-decoder to accurately reconstruct the segmentation maps of +anomalous regions and encourage the interaction between RGB images and depth +images; secondly, we adopt a multi-modality anomaly segmentation network to +achieve a precise anomaly map; thirdly, we propose an attention-based +information entropy fusion module for feature fusion during inference, making +it suitable for real-time deployment. Extensive experiments show that EasyNet +achieves an anomaly detection AUROC of 92.6% without using pre-trained models +and memory banks. In addition, EasyNet is faster than existing methods, with a +high frame rate of 94.55 FPS on a Tesla V100 GPU. + +
+
+
+
+
+ + ♻ ☆ DetOFA: Efficient Training of Once-for-All Networks for Object Detection + by Using Pre-trained Supernet and Path Filter ICCV + + +
+ We address the challenge of training a large supernet for the object +detection task, using a relatively small amount of training data. Specifically, +we propose an efficient supernet-based neural architecture search (NAS) method +that uses transfer learning and search space pruning. First, the supernet is +pre-trained on a classification task, for which large datasets are available. +Second, the search space defined by the supernet is pruned by removing +candidate models that are predicted to perform poorly. To effectively remove +the candidates over a wide range of resource constraints, we particularly +design a performance predictor, called path filter, which can accurately +predict the relative performance of the models that satisfy similar resource +constraints. Hence, supernet training is more focused on the best-performing +candidates. Our path filter handles prediction for paths with different +resource budgets. Compared to once-for-all, our proposed method reduces the +computational cost of the optimal network architecture by 30% and 63%, while +yielding better accuracy-floating point operations Pareto front (0.85 and 0.45 +points of improvement on average precision for Pascal VOC and COCO, +respectively). + +
+
+ comment: Accepted to ICCV workshop 2023 +
+
+
+
+
+ + ♻ ☆ Temporal-Distributed Backdoor Attack Against Video Based Action + Recognition + + +
+ Deep neural networks (DNNs) have achieved tremendous success in various +applications including video action recognition, yet remain vulnerable to +backdoor attacks (Trojans). The backdoor-compromised model will mis-classify to +the target class chosen by the attacker when a test instance (from a non-target +class) is embedded with a specific trigger, while maintaining high accuracy on +attack-free instances. Although there are extensive studies on backdoor attacks +against image data, the susceptibility of video-based systems under backdoor +attacks remains largely unexplored. Current studies are direct extensions of +approaches proposed for image data, e.g., the triggers are independently +embedded within the frames, which tend to be detectable by existing defenses. +In this paper, we introduce a simple yet effective backdoor attack against +video data. Our proposed attack, adding perturbations in a transformed domain, +plants an imperceptible, temporally distributed trigger across the video +frames, and is shown to be resilient to existing defensive strategies. The +effectiveness of the proposed attack is demonstrated by extensive experiments +with various well-known models on two video recognition benchmarks, UCF101 and +HMDB51, and a sign language recognition benchmark, Greek Sign Language (GSL) +dataset. We delve into the impact of several influential factors on our +proposed attack and identify an intriguing effect termed "collateral damage" +through extensive studies. + +
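Purely to illustrate what a "trigger in a transformed domain, distributed over time" can look like, the toy below perturbs a single mid-frequency DCT coefficient of each frame with a temporally varying sign; the coefficient, amplitude, and temporal pattern are arbitrary assumptions and do not reproduce the paper's attack.

```python
import numpy as np
from scipy.fft import dctn, idctn

def add_transform_domain_trigger(video: np.ndarray, eps: float = 0.5) -> np.ndarray:
    """video: (T, H, W) grayscale frames in [0, 255]; returns a subtly perturbed copy."""
    out = np.empty(video.shape, dtype=np.float64)
    num_frames = video.shape[0]
    for t, frame in enumerate(video):
        coeffs = dctn(frame.astype(np.float64), norm="ortho")
        # temporally varying perturbation of one mid-frequency coefficient
        coeffs[8, 8] += eps * np.cos(2 * np.pi * t / num_frames)
        out[t] = idctn(coeffs, norm="ortho")
    return np.clip(out, 0, 255)

video = np.random.randint(0, 256, size=(16, 112, 112)).astype(np.float64)
poisoned = add_transform_domain_trigger(video)
print(np.abs(poisoned - video).max())   # per-pixel change stays far below one grey level
```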
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ NeMig -- A Bilingual News Collection and Knowledge Graph about Migration RecSys 2023 + + +
+ News recommendation plays a critical role in shaping the public's worldviews +through the way in which it filters and disseminates information about +different topics. Given the crucial impact that media plays in opinion +formation, especially for sensitive topics, understanding the effects of +personalized recommendation beyond accuracy has become essential in today's +digital society. In this work, we present NeMig, a bilingual news collection on +the topic of migration, and corresponding rich user data. In comparison to +existing news recommendation datasets, which comprise a large variety of +monolingual news, NeMig covers articles on a single controversial topic, +published in both Germany and the US. We annotate the sentiment polarization of +the articles and the political leanings of the media outlets, in addition to +extracting subtopics and named entities disambiguated through Wikidata. These +features can be used to analyze the effects of algorithmic news curation beyond +accuracy-based performance, such as recommender biases and the creation of +filter bubbles. We construct domain-specific knowledge graphs from the news +text and metadata, thus encoding knowledge-level connections between articles. +Importantly, while existing datasets include only click behavior, we collect +user socio-demographic and political information in addition to explicit click +feedback. We demonstrate the utility of NeMig through experiments on the tasks +of news recommenders benchmarking, analysis of biases in recommenders, and news +trends analysis. NeMig aims to provide a useful resource for the news +recommendation community and to foster interdisciplinary research into the +multidimensional effects of algorithmic news curation. + +
+
+ comment: Accepted at the 11th International Workshop on News Recommendation + and Analytics (INRA 2023) in conjunction with ACM RecSys 2023 +
+
+
+
+
+ + ☆ General and Practical Tuning Method for Off-the-Shelf Graph-Based Index: + SISAP Indexing Challenge Report by Team UTokyo + + +
+ Despite the efficacy of graph-based algorithms for Approximate Nearest +Neighbor (ANN) searches, the optimal tuning of such systems remains unclear. +This study introduces a method to tune the performance of off-the-shelf +graph-based indexes, focusing on the dimension of vectors, database size, and +entry points of graph traversal. We utilize a black-box optimization algorithm +to perform integrated tuning to meet the required levels of recall and Queries +Per Second (QPS). We applied our approach to Task A of the SISAP 2023 Indexing +Challenge and got second place in the 10M and 30M tracks. It improves +performance substantially compared to brute force methods. This research offers +a universally applicable tuning method for graph-based indexes, extending +beyond the specific conditions of the competition to broader uses. + +
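A minimal version of this kind of query-time tuning, assuming hnswlib as the off-the-shelf graph index and a plain sweep over the search parameter `ef` instead of the report's black-box optimizer; dataset sizes and build parameters are toy values.

```python
import time
import numpy as np
import hnswlib

rng = np.random.default_rng(0)
base = rng.random((20_000, 64), dtype=np.float32)
queries = rng.random((500, 64), dtype=np.float32)
k = 10

# exact ground truth by brute force (squared L2 via the expansion trick)
d2 = (queries**2).sum(1, keepdims=True) + (base**2).sum(1) - 2.0 * queries @ base.T
gt = np.argpartition(d2, k, axis=1)[:, :k]

index = hnswlib.Index(space="l2", dim=base.shape[1])
index.init_index(max_elements=len(base), ef_construction=200, M=16)
index.add_items(base)

for ef in (10, 20, 40, 80, 160, 320):
    index.set_ef(ef)                        # query-time breadth of graph traversal
    t0 = time.perf_counter()
    labels, _ = index.knn_query(queries, k=k)
    qps = len(queries) / (time.perf_counter() - t0)
    recall = np.mean([len(set(map(int, l)) & set(map(int, g))) / k
                      for l, g in zip(labels, gt)])
    print(f"ef={ef:4d}  recall@{k}={recall:.3f}  QPS={qps:,.0f}")
# choose the smallest ef whose recall meets the requirement -> highest QPS
```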
+
+ comment: Accepted paper on 2nd place solution of SISAP 2023 Indexing Challenge + Task A +
+
+
+
+
+ + ☆ Explainable Active Learning for Preference Elicitation + + +
+ Gaining insights into the preferences of new users and subsequently +personalizing recommendations necessitate managing user interactions +intelligently, namely, posing pertinent questions to elicit valuable +information effectively. In this study, our focus is on a specific scenario of +the cold-start problem, where the recommendation system lacks adequate user +presence or access to other users' data is restricted, obstructing employing +user profiling methods utilizing existing data in the system. We employ Active +Learning (AL) to solve the addressed problem with the objective of maximizing +information acquisition with minimal user effort. AL operates for selecting +informative data from a large unlabeled set to inquire an oracle to label them +and eventually updating a machine learning (ML) model. We operate AL in an +integrated process of unsupervised, semi-supervised, and supervised ML within +an explanatory preference elicitation process. It harvests user feedback (given +for the system's explanations on the presented items) over informative samples +to update an underlying ML model estimating user preferences. The designed user +interaction facilitates personalizing the system by incorporating user feedback +into the ML model and also enhances user trust by refining the system's +explanations on recommendations. We implement the proposed preference +elicitation methodology for food recommendation. We conducted human experiments +to assess its efficacy in the short term and also experimented with several AL +strategies over synthetic user profiles that we created for two food datasets, +aiming for long-term performance analysis. The experimental results demonstrate +the efficiency of the proposed preference elicitation with limited user-labeled +data while also enhancing user trust through accurate explanations. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Towards Contrastive Learning in Music Video Domain + + +
+ Contrastive learning is a powerful way of learning multimodal representations +across various domains such as image-caption retrieval and audio-visual +representation learning. In this work, we investigate if these findings +generalize to the domain of music videos. Specifically, we create a dual +encoder for the audio and video modalities and train it using a bidirectional +contrastive loss. For the experiments, we use an industry dataset containing +550 000 music videos as well as the public Million Song Dataset, and evaluate +the quality of learned representations on the downstream tasks of music tagging +and genre classification. Our results indicate that pre-trained networks +without contrastive fine-tuning outperform our contrastive learning approach +when evaluated on both tasks. To gain a better understanding of the reasons +contrastive learning was not successful for music videos, we perform a +qualitative analysis of the learned representations, revealing why contrastive +learning might have difficulties uniting embeddings from two modalities. Based +on these findings, we outline possible directions for future work. To +facilitate the reproducibility of our results, we share our code and the +pre-trained model. + +
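The bidirectional contrastive loss mentioned above is, in its simplest form, a symmetric InfoNCE objective over a batch of paired audio and video embeddings; the sketch below shows only that loss (the encoders, temperature, and embedding size are assumptions, not the paper's settings).

```python
import torch
import torch.nn.functional as F

def bidirectional_contrastive_loss(audio_emb, video_emb, temperature=0.07):
    a = F.normalize(audio_emb, dim=-1)
    v = F.normalize(video_emb, dim=-1)
    logits = a @ v.t() / temperature                  # (B, B) pairwise similarities
    targets = torch.arange(a.size(0), device=a.device)
    loss_a2v = F.cross_entropy(logits, targets)       # match each audio to its video
    loss_v2a = F.cross_entropy(logits.t(), targets)   # and each video to its audio
    return 0.5 * (loss_a2v + loss_v2a)

audio = torch.randn(32, 256)   # batch of audio-tower outputs
video = torch.randn(32, 256)   # batch of video-tower outputs (same clips, same order)
print(bidirectional_contrastive_loss(audio, video).item())
```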
+
+ comment: 6 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Recommender AI Agent: Integrating Large Language Models for Interactive + Recommendations + + +
+ Recommender models excel at providing domain-specific item recommendations by +leveraging extensive user behavior data. Despite their ability to act as +lightweight domain experts, they struggle to perform versatile tasks such as +providing explanations and engaging in conversations. On the other hand, large +language models (LLMs) represent a significant step towards artificial general +intelligence, showcasing remarkable capabilities in instruction comprehension, +commonsense reasoning, and human interaction. However, LLMs lack the knowledge +of domain-specific item catalogs and behavioral patterns, particularly in areas +that diverge from general world knowledge, such as online e-commerce. +Finetuning LLMs for each domain is neither economic nor efficient. + In this paper, we bridge the gap between recommender models and LLMs, +combining their respective strengths to create a versatile and interactive +recommender system. We introduce an efficient framework called InteRecAgent, +which employs LLMs as the brain and recommender models as tools. We first +outline a minimal set of essential tools required to transform LLMs into +InteRecAgent. We then propose an efficient workflow within InteRecAgent for +task execution, incorporating key components such as a memory bus, dynamic +demonstration-augmented task planning, and reflection. InteRecAgent enables +traditional recommender systems, such as those ID-based matrix factorization +models, to become interactive systems with a natural language interface through +the integration of LLMs. Experimental results on several public datasets show +that InteRecAgent achieves satisfying performance as a conversational +recommender system, outperforming general-purpose LLMs. + +
+
+ comment: 16 pages, 15 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ GNNUERS: Fairness Explanation in GNNs for Recommendation via + Counterfactual Reasoning + + +
+ Nowadays, research into personalization has been focusing on explainability +and fairness. Several approaches proposed in recent works are able to explain +individual recommendations in a post-hoc manner or by explanation paths. +However, explainability techniques applied to unfairness in recommendation have +been limited to finding user/item features mostly related to biased +recommendations. In this paper, we devised a novel algorithm that leverages +counterfactuality methods to discover user unfairness explanations in the form +of user-item interactions. In our counterfactual framework, interactions are +represented as edges in a bipartite graph, with users and items as nodes. Our +Bipartite Graph Explainer perturbs the topological structure to find an altered +version (counterfactual explanation) that minimizes the disparity in utility +between the protected and unprotected demographic groups. Experiments on four +real-world graphs coming from various domains showed that our method can +systematically explain user unfairness on three state-of-the-art GNN-based +recommendation models. Moreover, an empirical evaluation of the perturbed +network uncovered relevant patterns that justify the nature of the unfairness +discovered by the generated explanations. The source code and the preprocessed +data sets are available at https://github.com/jackmedda/RS-BGExplainer. + +
+
+
+
+
+ + ♻ ☆ Test Time Embedding Normalization for Popularity Bias Mitigation CIKM 2023 + + +
+ Popularity bias is a widespread problem in the field of recommender systems, +where popular items tend to dominate recommendation results. In this work, we +propose 'Test Time Embedding Normalization' as a simple yet effective strategy +for mitigating popularity bias, which surpasses the performance of the previous +mitigation approaches by a significant margin. Our approach utilizes the +normalized item embedding during the inference stage to control the influence +of embedding magnitude, which is highly correlated with item popularity. +Through extensive experiments, we show that our method combined with the +sampled softmax loss effectively reduces popularity bias compared with previous +approaches for bias mitigation. We further investigate the relationship between +user and item embeddings and find that the angular similarity between +embeddings distinguishes preferable and non-preferable items regardless of +their popularity. The analysis explains the mechanism behind the success of our +approach in eliminating the impact of popularity bias. Our code is available at +https://github.com/ml-postech/TTEN. + +
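The core trick is small enough to sketch: score with L2-normalized item embeddings at inference only, so that popularity-correlated embedding magnitude stops dominating the ranking. The dot-product scorer and the synthetic magnitudes below are assumptions for illustration, not the paper's models.

```python
import torch
import torch.nn.functional as F

def score_items(user_emb, item_emb, normalize_items=True):
    items = F.normalize(item_emb, dim=-1) if normalize_items else item_emb
    return user_emb @ items.t()                    # (num_users, num_items)

user_emb = torch.randn(4, 64)
# give later items artificially larger norms, mimicking popular items
item_emb = torch.randn(1000, 64) * torch.linspace(0.5, 3.0, 1000).unsqueeze(1)

top_raw = score_items(user_emb, item_emb, normalize_items=False).topk(10).indices
top_norm = score_items(user_emb, item_emb, normalize_items=True).topk(10).indices
print(top_raw[0])    # skews toward large-norm ("popular") items
print(top_norm[0])   # ranking driven by angular similarity instead
```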
+
+ comment: 5 pages, CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Sparseness-constrained Nonnegative Tensor Factorization for Detecting + Topics at Different Time Scales + + +
+ Temporal data (such as news articles or Twitter feeds) often consists of a +mixture of long-lasting trends and popular but short-lasting topics of +interest. A truly successful topic modeling strategy should be able to detect +both types of topics and clearly locate them in time. In this paper, we first +show that nonnegative CANDECOMP/PARAFAC decomposition (NCPD) is able to +discover topics of variable persistence automatically. Then, we propose +sparseness-constrained NCPD (S-NCPD) and its online variant in order to +actively control the length of the learned topics effectively and efficiently. +Further, we propose quantitative ways to measure the topic length and +demonstrate the ability of S-NCPD (as well as its online variant) to discover +short and long-lasting temporal topics in a controlled manner in semi-synthetic +and real-world data including news headlines. We also demonstrate that the +online variant of S-NCPD reduces the reconstruction error more rapidly than +S-NCPD. + +
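As background, plain nonnegative CP decomposition on a (time x term x source) count tensor can be run with TensorLy as below; the paper's contribution, the sparseness constraint that controls topic persistence and its online variant, is not included in this sketch, and the tensor here is synthetic.

```python
import numpy as np
import tensorly as tl
from tensorly.decomposition import non_negative_parafac

rng = np.random.default_rng(0)
X = rng.poisson(1.0, size=(52, 300, 5)).astype(float)   # weeks x vocabulary x outlets

weights, factors = non_negative_parafac(tl.tensor(X), rank=8, n_iter_max=200)
time_factor, term_factor, source_factor = factors
# time_factor[:, r] traces topic r over time: a long-lasting trend has broad
# support, while a short-lived topic concentrates on a few consecutive weeks.
print(time_factor.shape, term_factor.shape, source_factor.shape)
```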
+
+
+
+
+
+
+
+ + Machine Learning 92 + +
+
+
+ + ☆ Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D + Understanding, Generation, and Instruction Following + + +
+ We introduce Point-Bind, a 3D multi-modality model aligning point clouds with +2D image, language, audio, and video. Guided by ImageBind, we construct a joint +embedding space between 3D and multi-modalities, enabling many promising +applications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D +open-world understanding. On top of this, we further present Point-LLM, the +first 3D large language model (LLM) following 3D multi-modal instructions. By +parameter-efficient fine-tuning techniques, Point-LLM injects the semantics of +Point-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction +data, but exhibits superior 3D and multi-modal question-answering capacity. We +hope our work may cast a light on the community for extending 3D point clouds +to multi-modality applications. Code is available at +https://github.com/ZiyuGuo99/Point-Bind_Point-LLM. + +
+
+ comment: Work in progress. Code is available at + https://github.com/ZiyuGuo99/Point-Bind_Point-LLM +
+
+
+
+
+ + ☆ Baseline Defenses for Adversarial Attacks Against Aligned Language + Models + + +
+ As Large Language Models quickly become ubiquitous, their security +vulnerabilities are critical to understand. Recent work shows that text +optimizers can produce jailbreaking prompts that bypass moderation and +alignment. Drawing from the rich body of work on adversarial machine learning, +we approach these attacks with three questions: What threat models are +practically useful in this domain? How do baseline defense techniques perform +in this new domain? How does LLM security differ from computer vision? + We evaluate several baseline defense strategies against leading adversarial +attacks on LLMs, discussing the various settings in which each is feasible and +effective. Particularly, we look at three types of defenses: detection +(perplexity based), input preprocessing (paraphrase and retokenization), and +adversarial training. We discuss white-box and gray-box settings and discuss +the robustness-performance trade-off for each of the defenses considered. +Surprisingly, we find much more success with filtering and preprocessing than +we would expect from other domains, such as vision, providing a first +indication that the relative strengths of these defenses may be weighed +differently in these domains. + +
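Of the defenses listed, the perplexity-based detector is the easiest to sketch: score a prompt with a small language model and flag it when its perplexity is implausibly high, as optimizer-crafted suffixes tend to be. The sketch assumes GPT-2 from Hugging Face as the scorer and an arbitrary threshold; the paper's evaluation (including windowed variants) is more careful.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
lm = GPT2LMHeadModel.from_pretrained("gpt2").eval()

@torch.no_grad()
def perplexity(text: str) -> float:
    ids = tok(text, return_tensors="pt").input_ids
    loss = lm(ids, labels=ids).loss          # mean per-token cross-entropy
    return float(torch.exp(loss))

def looks_adversarial(prompt: str, threshold: float = 1000.0) -> bool:
    """Flag prompts whose perplexity exceeds a threshold calibrated on benign prompts."""
    return perplexity(prompt) > threshold

print(looks_adversarial("Please summarize the plot of Hamlet in two sentences."))
```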
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Iterative Multi-granular Image Editing using Diffusion Models + + +
+ Recent advances in text-guided image synthesis have dramatically changed how +creative professionals generate artistic and aesthetically pleasing visual +assets. To fully support such creative endeavors, the process should possess +the ability to: 1) iteratively edit the generations and 2) control the spatial +reach of desired changes (global, local or anything in between). We formalize +this pragmatic problem setting as Iterative Multi-granular Editing. While there +has been substantial progress with diffusion-based models for image synthesis +and editing, they are all one shot (i.e., no iterative editing capabilities) +and do not naturally yield multi-granular control (i.e., covering the full +spectrum of local-to-global edits). To overcome these drawbacks, we propose +EMILIE: Iterative Multi-granular Image Editor. EMILIE introduces a novel latent +iteration strategy, which re-purposes a pre-trained diffusion model to +facilitate iterative editing. This is complemented by a gradient control +operation for multi-granular control. We introduce a new benchmark dataset to +evaluate our newly proposed setting. We conduct an exhaustive quantitative and +qualitative evaluation against recent state-of-the-art approaches adapted to +our task, to bring out the mettle of EMILIE. We hope our work would attract +attention to this newly identified, pragmatic problem setting. + +
+
+ comment: Pre-print +
+
+
+
+
+ + ☆ Bayesian deep learning for cosmic volumes with modified gravity + + +
+ The new generation of galaxy surveys will provide unprecedented data allowing +us to test gravity at cosmological scales. A robust cosmological analysis of +the large-scale structure demands exploiting the nonlinear information encoded +in the cosmic web. Machine learning techniques provide such tools; however, they do +not provide an a priori assessment of uncertainties. This study aims at extracting +cosmological parameters from modified gravity (MG) simulations through deep +neural networks endowed with uncertainty estimations. We implement Bayesian +neural networks (BNNs) with an enriched approximate posterior distribution +considering two cases: one with a single Bayesian last layer (BLL), and another +one with Bayesian layers at all levels (FullB). We train both BNNs with +real-space density fields and power-spectra from a suite of 2000 dark matter +only particle mesh $N$-body simulations including modified gravity models +relying on MG-PICOLA covering 256 $h^{-1}$ Mpc side cubical volumes with +128$^3$ particles. BNNs excel in accurately predicting parameters for +$\Omega_m$ and $\sigma_8$ and their respective correlation with the MG +parameter. We find that BNNs yield well-calibrated uncertainty estimates +overcoming the over- and under-estimation issues in traditional neural +networks. We observe that the presence of the MG parameter leads to a significant +degeneracy with $\sigma_8$, which is one of the possible explanations for the poor +MG predictions. Ignoring MG, we obtain a deviation of the relative errors in +$\Omega_m$ and $\sigma_8$ by at least $30\%$. Moreover, we report consistent +results from the density field and power spectra analysis, and comparable +results between BLL and FullB experiments, which permits us to save computing +time by a factor of two. This work contributes to setting the path to extract +cosmological parameters from complete small cosmic volumes towards the highly +nonlinear regime. + +
+
+ comment: 13 pages, 7 figures and 7 tables +
+
+
+
+
+ + ☆ Copiloting the Copilots: Fusing Large Language Models with Completion + Engines for Automated Program Repair + + +
+ During Automated Program Repair (APR), it can be challenging to synthesize +correct patches for real-world systems in general-purpose programming +languages. Recent Large Language Models (LLMs) have been shown to be helpful +"copilots" in assisting developers with various coding tasks, and have also +been directly applied for patch synthesis. However, most LLMs treat programs as +sequences of tokens, meaning that they are ignorant of the underlying semantics +constraints of the target programming language. This results in plenty of +statically invalid generated patches, impeding the practicality of the +technique. Therefore, we propose Repilot, a framework to further copilot the AI +"copilots" (i.e., LLMs) by synthesizing more valid patches during the repair +process. Our key insight is that many LLMs produce outputs autoregressively +(i.e., token by token), resembling human writing programs, which can be +significantly boosted and guided through a Completion Engine. Repilot +synergistically synthesizes a candidate patch through the interaction between +an LLM and a Completion Engine, which 1) prunes away infeasible tokens +suggested by the LLM and 2) proactively completes the token based on the +suggestions provided by the Completion Engine. Our evaluation on a subset of +the widely-used Defects4j 1.2 and 2.0 datasets shows that Repilot fixes 66 and +50 bugs, respectively, surpassing the best-performing baseline by 14 and 16 +bugs fixed. More importantly, Repilot is capable of producing more valid and +correct patches than the base LLM when given the same generation budget. + +
+
+
+
+
+ + ☆ Fast and Regret Optimal Best Arm Identification: Fundamental Limits and + Low-Complexity Algorithms + + +
+ This paper considers a stochastic multi-armed bandit (MAB) problem with dual +objectives: (i) quick identification and commitment to the optimal arm, and +(ii) reward maximization throughout a sequence of $T$ consecutive rounds. +Though each objective has been individually well-studied, i.e., best arm +identification for (i) and regret minimization for (ii), the simultaneous +realization of both objectives remains an open problem, despite its practical +importance. This paper introduces \emph{Regret Optimal Best Arm Identification} +(ROBAI) which aims to achieve these dual objectives. To solve ROBAI with both +pre-determined stopping time and adaptive stopping time requirements, we +present the $\mathsf{EOCP}$ algorithm and its variants respectively, which not +only achieve asymptotic optimal regret in both Gaussian and general bandits, +but also commit to the optimal arm in $\mathcal{O}(\log T)$ rounds with +pre-determined stopping time and $\mathcal{O}(\log^2 T)$ rounds with adaptive +stopping time. We further characterize lower bounds on the commitment time +(equivalent to sample complexity) of ROBAI, showing that $\mathsf{EOCP}$ and +its variants are sample optimal with pre-determined stopping time, and almost +sample optimal with adaptive stopping time. Numerical results confirm our +theoretical analysis and reveal an interesting ``over-exploration'' phenomenon +carried by classic $\mathsf{UCB}$ algorithms, such that $\mathsf{EOCP}$ has +smaller regret even though it stops exploration much earlier than +$\mathsf{UCB}$ ($\mathcal{O}(\log T)$ versus $\mathcal{O}(T)$), which suggests +over-exploration is unnecessary and potentially harmful to system performance. + +
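A toy rendition of the "explore, then commit" behavior discussed above: run UCB for roughly C log T rounds, commit to the empirical best arm, and play it for the rest of the horizon. The constant C, the Gaussian rewards, and the stopping rule are illustrative simplifications of the paper's EOCP algorithm.

```python
import numpy as np

def explore_then_commit(means, T=10_000, C=50, seed=0):
    rng = np.random.default_rng(seed)
    K = len(means)
    counts, sums = np.zeros(K), np.zeros(K)
    explore_rounds = int(C * np.log(T))
    committed, total = None, 0.0
    for t in range(1, T + 1):
        if committed is not None:
            arm = committed
        elif t <= K:
            arm = t - 1                                    # pull each arm once
        else:
            ucb = sums / counts + np.sqrt(2 * np.log(t) / counts)
            arm = int(np.argmax(ucb))
        reward = rng.normal(means[arm], 1.0)
        counts[arm] += 1; sums[arm] += reward; total += reward
        if committed is None and t >= explore_rounds:
            committed = int(np.argmax(sums / counts))      # commit to the empirical best arm
    regret = T * max(means) - total
    return committed, regret

print(explore_then_commit([0.3, 0.5, 0.7]))
```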
+
+
+
+
+ + ☆ PolyGET: Accelerating Polymer Simulations by Accurate and Generalizable + Forcefield with Equivariant Transformer + + +
+ Polymer simulation with both accuracy and efficiency is a challenging task. +Machine learning (ML) forcefields have been developed to achieve both the +accuracy of ab initio methods and the efficiency of empirical force fields. +However, existing ML force fields are usually limited to single-molecule +settings, and their simulations are not robust enough. In this paper, we +present PolyGET, a new framework for Polymer Forcefields with Generalizable +Equivariant Transformers. PolyGET is designed to capture complex quantum +interactions between atoms and generalize across various polymer families, +using a deep learning model called Equivariant Transformers. We propose a new +training paradigm that focuses exclusively on optimizing forces, which is +different from existing methods that jointly optimize forces and energy. This +simple force-centric objective function avoids competing objectives between +energy and forces, thereby allowing for learning a unified forcefield ML model +over different polymer families. We evaluated PolyGET on a large-scale dataset +of 24 distinct polymer types and demonstrated state-of-the-art performance in +force accuracy and robust MD simulations. Furthermore, PolyGET can simulate +large polymers with high fidelity to the reference ab initio DFT method while +being able to generalize to unseen polymers. + +
+
+
+
+
+ + ☆ Laminar: A New Serverless Stream-based Framework with Semantic Code + Search and Code Completion + + +
+ This paper introduces Laminar, a novel serverless framework based on +dispel4py, a parallel stream-based dataflow library. Laminar efficiently +manages streaming workflows and components through a dedicated registry, +offering a seamless serverless experience. Leveraging large language models, +Laminar enhances the framework with semantic code search, code summarization, +and code completion. This contribution enhances serverless computing by +simplifying the execution of streaming computations, managing data streams more +efficiently, and offering a valuable tool for both researchers and +practitioners. + +
+
+ comment: 13 pages, 10 Figures, 6 Tables +
+
+
+
+
+ + ☆ Geometry-Informed Neural Operator for Large-Scale 3D PDEs + + +
+ We propose the geometry-informed neural operator (GINO), a highly efficient +approach to learning the solution operator of large-scale partial differential +equations with varying geometries. GINO uses a signed distance function and +point-cloud representations of the input shape and neural operators based on +graph and Fourier architectures to learn the solution operator. The graph +neural operator handles irregular grids and transforms them into and from +regular latent grids on which Fourier neural operator can be efficiently +applied. GINO is discretization-convergent, meaning the trained model can be +applied to arbitrary discretization of the continuous domain and it converges +to the continuum operator as the discretization is refined. To empirically +validate the performance of our method on large-scale simulation, we generate +the industry-standard aerodynamics dataset of 3D vehicle geometries with +Reynolds numbers as high as five million. For this large-scale 3D fluid +simulation, numerical methods are expensive to compute surface pressure. We +successfully trained GINO to predict the pressure on car surfaces using only +five hundred data points. The cost-accuracy experiments show a $26,000 \times$ +speed-up compared to optimized GPU-based computational fluid dynamics (CFD) +simulators on computing the drag coefficient. When tested on new combinations +of geometries and boundary conditions (inlet velocities), GINO obtains a +one-fourth reduction in error rate compared to deep neural network approaches. + +
+
+
+
+
+ + ☆ Consistency of Lloyd's Algorithm Under Perturbations + + +
+ In the context of unsupervised learning, Lloyd's algorithm is one of the most +widely used clustering algorithms. It has inspired a plethora of work +investigating the correctness of the algorithm under various settings with +ground truth clusters. In particular, in 2016, Lu and Zhou have shown that the +mis-clustering rate of Lloyd's algorithm on $n$ independent samples from a +sub-Gaussian mixture is exponentially bounded after $O(\log(n))$ iterations, +assuming proper initialization of the algorithm. However, in many applications, +the true samples are unobserved and need to be learned from the data via +pre-processing pipelines such as spectral methods on appropriate data matrices. +We show that the mis-clustering rate of Lloyd's algorithm on perturbed samples +from a sub-Gaussian mixture is also exponentially bounded after $O(\log(n))$ +iterations under the assumptions of proper initialization and that the +perturbation is small relative to the sub-Gaussian noise. In canonical settings +with ground truth clusters, we derive bounds for algorithms such as +$k$-means$++$ to find good initializations and thus leading to the correctness +of clustering via the main result. We show the implications of the results for +pipelines measuring the statistical significance of derived clusters from data +such as SigClust. We use these general results to derive implications in +providing theoretical guarantees on the misclustering rate for Lloyd's +algorithm in a host of applications, including high-dimensional time series, +multi-dimensional scaling, and community detection for sparse networks via +spectral clustering. + +
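For reference, the algorithm whose perturbation robustness is analyzed is just the classic alternation below, here with a k-means++-style initialization; the theoretical conditions (sub-Gaussian mixtures, perturbation size, initialization quality) are of course not checked by this sketch.

```python
import numpy as np

def kmeanspp_init(X, k, rng):
    centers = [X[rng.integers(len(X))]]
    for _ in range(k - 1):
        d2 = np.min(((X[:, None, :] - np.array(centers)[None]) ** 2).sum(-1), axis=1)
        centers.append(X[rng.choice(len(X), p=d2 / d2.sum())])   # sample far points
    return np.array(centers)

def lloyd(X, k, iters=50, seed=0):
    rng = np.random.default_rng(seed)
    centers = kmeanspp_init(X, k, rng)
    for _ in range(iters):
        labels = ((X[:, None, :] - centers[None]) ** 2).sum(-1).argmin(1)
        centers = np.array([X[labels == j].mean(0) if np.any(labels == j) else centers[j]
                            for j in range(k)])                  # keep empty clusters in place
    return labels, centers

rng = np.random.default_rng(1)
X = np.concatenate([rng.normal(m, 0.3, size=(200, 2)) for m in (-2, 0, 2)])
labels, centers = lloyd(X, k=3)
print(np.round(centers, 2))
```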
+
+ comment: Preprint version 1 +
+
+
+
+
+ + ☆ Mechanism of feature learning in convolutional neural networks + + +
+ Understanding the mechanism of how convolutional neural networks learn +features from image data is a fundamental problem in machine learning and +computer vision. In this work, we identify such a mechanism. We posit the +Convolutional Neural Feature Ansatz, which states that covariances of filters +in any convolutional layer are proportional to the average gradient outer +product (AGOP) taken with respect to patches of the input to that layer. We +present extensive empirical evidence for our ansatz, including identifying high +correlation between covariances of filters and patch-based AGOPs for +convolutional layers in standard neural architectures, such as AlexNet, VGG, +and ResNets pre-trained on ImageNet. We also provide supporting theoretical +evidence. We then demonstrate the generality of our result by using the +patch-based AGOP to enable deep feature learning in convolutional kernel +machines. We refer to the resulting algorithm as (Deep) ConvRFM and show that +our algorithm recovers similar features to deep convolutional networks +including the notable emergence of edge detectors. Moreover, we find that Deep +ConvRFM overcomes previously identified limitations of convolutional kernels, +such as their inability to adapt to local signals in images and, as a result, +leads to sizable performance improvement over fixed convolutional kernels. + +
+
+
+
+
+ + ☆ Amyloid-Beta Axial Plane PET Synthesis from Structural MRI: An Image + Translation Approach for Screening Alzheimer's Disease + + +
+ In this work, an image translation model is implemented to produce synthetic +amyloid-beta PET images from structural MRI that are quantitatively accurate. +Image pairs of amyloid-beta PET and structural MRI were used to train the +model. We found that the synthetic PET images could be produced with a high +degree of similarity to the ground truth in terms of shape and contrast, with overall high SSIM +and PSNR. This work demonstrates that performing structural-to-quantitative +image translation is feasible, enabling access to amyloid-beta information +from MRI alone. + +
+
+ comment: Abstract submitted and presented to the International Society of + Magnetic Resonance in Medicine (ISMRM 2023), Toronto, Canada +
+
+
+
+
+ + ☆ Interpretation of High-Dimensional Linear Regression: Effects of + Nullspace and Regularization Demonstrated on Battery Data + + +
+ High-dimensional linear regression is important in many scientific fields. +This article considers discrete measured data of underlying smooth latent +processes, as is often obtained from chemical or biological systems. +Interpretation in high dimensions is challenging because the nullspace and its +interplay with regularization shapes regression coefficients. The data's +nullspace contains all coefficients that satisfy $\mathbf{Xw}=\mathbf{0}$, thus +allowing very different coefficients to yield identical predictions. We +developed an optimization formulation to compare regression coefficients and +coefficients obtained by physical engineering knowledge to understand which +part of the coefficient differences are close to the nullspace. This nullspace +method is tested on a synthetic example and lithium-ion battery data. The case +studies show that regularization and z-scoring are design choices that, if +chosen corresponding to prior physical knowledge, lead to interpretable +regression results. Otherwise, the combination of the nullspace and +regularization hinders interpretability and can make it impossible to obtain +regression coefficients close to the true coefficients when there is a true +underlying linear model. Furthermore, we demonstrate that regression methods +that do not produce coefficients orthogonal to the nullspace, such as fused +lasso, can improve interpretability. In conclusion, the insights gained from +the nullspace perspective help to make informed design choices for building +regression models on high-dimensional data and reasoning about potential +underlying linear models, which are important for system optimization and +improving scientific understanding. + +
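A stripped-down version of the nullspace comparison described above: project the difference between a fitted coefficient vector and a physically motivated one onto the nullspace of X, since any component living there changes nothing about the predictions. The ridge estimate and the synthetic data are placeholders; the paper formulates this comparison as an optimization problem rather than a plain projection.

```python
import numpy as np
from scipy.linalg import null_space

rng = np.random.default_rng(0)
n, p = 50, 200                              # high-dimensional: p >> n
X = rng.normal(size=(n, p))
w_true = np.zeros(p); w_true[:5] = 1.0      # stand-in for physically motivated coefficients
y = X @ w_true

# one of infinitely many coefficient vectors consistent with the data (ridge-style)
w_hat = np.linalg.solve(X.T @ X + 1e-2 * np.eye(p), X.T @ y)

N = null_space(X)                           # (p, p - rank(X)) orthonormal nullspace basis
diff = w_hat - w_true
diff_null = N @ (N.T @ diff)                # part of the difference invisible to predictions
print(np.linalg.norm(diff_null) / np.linalg.norm(diff))  # share of the gap (by norm) in the nullspace
print(np.linalg.norm(X @ diff))             # ~0 only if the whole gap lies in the nullspace
```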
+
+ comment: Manuscript: 14 pages, 7 figures; Supplementary Information: 4 pages, + 2 figures; Code available: https://github.com/JoachimSchaeffer/HDRegAnalytics +
+
+
+
+
+ + ☆ Interactive and Concentrated Differential Privacy for Bandits + + +
+ Bandits play a crucial role in interactive learning schemes and modern +recommender systems. However, these systems often rely on sensitive user data, +making privacy a critical concern. This paper investigates privacy in bandits +with a trusted centralized decision-maker through the lens of interactive +Differential Privacy (DP). While bandits under pure $\epsilon$-global DP have +been well-studied, we contribute to the understanding of bandits under zero +Concentrated DP (zCDP). We provide minimax and problem-dependent lower bounds +on regret for finite-armed and linear bandits, which quantify the cost of +$\rho$-global zCDP in these settings. These lower bounds reveal two hardness +regimes based on the privacy budget $\rho$ and suggest that $\rho$-global zCDP +incurs less regret than pure $\epsilon$-global DP. We propose two $\rho$-global +zCDP bandit algorithms, AdaC-UCB and AdaC-GOPE, for finite-armed and linear +bandits respectively. Both algorithms use a common recipe of Gaussian mechanism +and adaptive episodes. We analyze the regret of these algorithms to show that +AdaC-UCB achieves the problem-dependent regret lower bound up to multiplicative +constants, while AdaC-GOPE achieves the minimax regret lower bound up to +poly-logarithmic factors. Finally, we provide experimental validation of our +theoretical results under different settings. + +
+
+
+
+
+ + ☆ Curating Naturally Adversarial Datasets for Trustworthy AI in Healthcare + + +
+ Deep learning models have shown promising predictive accuracy for time-series +healthcare applications. However, ensuring the robustness of these models is +vital for building trustworthy AI systems. Existing research predominantly +focuses on robustness to synthetic adversarial examples, crafted by adding +imperceptible perturbations to clean input data. However, these synthetic +adversarial examples do not accurately reflect the most challenging real-world +scenarios, especially in the context of healthcare data. Consequently, +robustness to synthetic adversarial examples may not necessarily translate to +robustness against naturally occurring adversarial examples, which is highly +desirable for trustworthy AI. We propose a method to curate datasets comprised +of natural adversarial examples to evaluate model robustness. The method relies +on probabilistic labels obtained from automated weakly-supervised labeling that +combines noisy and cheap-to-obtain labeling heuristics. Based on these labels, +our method adversarially orders the input data and uses this ordering to +construct a sequence of increasingly adversarial datasets. Our evaluation on +six medical case studies and three non-medical case studies demonstrates the +efficacy and statistical validity of our approach to generating naturally +adversarial datasets + +
+
+
+
+
+ + ☆ Adaptive function approximation based on the Discrete Cosine Transform + (DCT) SC + + +
+ This paper studies the cosine as basis function for the approximation of +univariate and continuous functions without memory. This work studies a +supervised learning to obtain the approximation coefficients, instead of using +the Discrete Cosine Transform (DCT). Due to the finite dynamics and +orthogonality of the cosine basis functions, simple gradient algorithms, such +as the Normalized Least Mean Squares (NLMS), can benefit from it and present a +controlled and predictable convergence time and error misadjustment. Due to its +simplicity, the proposed technique ranks as the best in terms of learning +quality versus complexity, and it is presented as an attractive technique to be +used in more complex supervised learning systems. Simulations illustrate the +performance of the approach. This paper celebrates the 50th anniversary of the +publication of the DCT by Nasir Ahmed in 1973. + +
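The proposal is simple enough to sketch end to end: approximate a memoryless univariate function on [0, 1] with a truncated cosine basis and learn the coefficients with an NLMS update instead of computing a DCT. The basis size, step size, and target function below are arbitrary choices for illustration.

```python
import numpy as np

K = 16                  # number of cosine basis functions
mu, eps = 0.5, 1e-8     # NLMS step size and regularizer

def features(x):
    return np.cos(np.pi * np.arange(K) * x)     # cosine basis evaluated at x in [0, 1]

def target(x):
    return np.exp(-3 * x) * np.sin(6 * x)       # the unknown memoryless function

w = np.zeros(K)
rng = np.random.default_rng(0)
for _ in range(20_000):
    x = rng.random()                             # random training input on [0, 1]
    phi = features(x)
    e = target(x) - w @ phi                      # a priori error
    w += mu * e * phi / (phi @ phi + eps)        # normalized LMS update

xs = np.linspace(0, 1, 200)
approx = np.array([w @ features(x) for x in xs])
print(np.max(np.abs(approx - target(xs))))       # worst-case approximation error
```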
+
+ comment: Accepted paper in 26th International Conference on Circuits, Systems, + Communications and Computers (CSCC) +
+
+
+
+
+ + ☆ Online Distributed Learning over Random Networks + + +
+ The recent deployment of multi-agent systems in a wide range of scenarios has +enabled the solution of learning problems in a distributed fashion. In this +context, agents are tasked with collecting local data and then cooperatively +train a model, without directly sharing the data. While distributed learning +offers the advantage of preserving agents' privacy, it also poses several +challenges in terms of designing and analyzing suitable algorithms. This work +focuses specifically on the following challenges motivated by practical +implementation: (i) online learning, where the local data change over time; +(ii) asynchronous agent computations; (iii) unreliable and limited +communications; and (iv) inexact local computations. To tackle these +challenges, we introduce the Distributed Operator Theoretical (DOT) version of +the Alternating Direction Method of Multipliers (ADMM), which we call the +DOT-ADMM Algorithm. We prove that it converges with a linear rate for a large +class of convex learning problems (e.g., linear and logistic regression +problems) toward a bounded neighborhood of the optimal time-varying solution, +and characterize how the neighborhood depends on~$\text{(i)--(iv)}$. We +corroborate the theoretical analysis with numerical simulations comparing the +DOT-ADMM Algorithm with other state-of-the-art algorithms, showing that only +the proposed algorithm exhibits robustness to (i)--(iv). + +
+
+
+
+
+ + ☆ Structure and Gradient Dynamics Near Global Minima of Two-layer Neural + Networks + + +
+ Under mild assumptions, we investigate the structure of loss landscape of +two-layer neural networks near global minima, determine the set of parameters +which give perfect generalization, and fully characterize the gradient flows +around it. With novel techniques, our work uncovers some simple aspects of the +complicated loss landscape and reveals how model, target function, samples and +initialization affect the training dynamics differently. Based on these +results, we also explain why (overparametrized) neural networks could +generalize well. + +
+
+
+
+
+ + ☆ Application of Deep Learning Methods in Monitoring and Optimization of + Electric Power Systems + + +
+ This PhD thesis thoroughly examines the utilization of deep learning +techniques as a means to advance the algorithms employed in the monitoring and +optimization of electric power systems. The first major contribution of this +thesis involves the application of graph neural networks to enhance power +system state estimation. The second key aspect of this thesis focuses on +utilizing reinforcement learning for dynamic distribution network +reconfiguration. The effectiveness of the proposed methods is affirmed through +extensive experimentation and simulations. + +
+
+ comment: PhD thesis +
+
+
+
+
+ + ☆ Multi-stage Deep Learning Artifact Reduction for Computed Tomography + + +
+ In Computed Tomography (CT), an image of the interior structure of an object +is computed from a set of acquired projection images. The quality of these +reconstructed images is essential for accurate analysis, but this quality can +be degraded by a variety of imaging artifacts. To improve reconstruction +quality, the acquired projection images are often processed by a pipeline +consisting of multiple artifact-removal steps applied in various image domains +(e.g., outlier removal on projection images and denoising of reconstruction +images). These artifact-removal methods exploit the fact that certain artifacts +are easier to remove in a certain domain compared with other domains. + Recently, deep learning methods have shown promising results for artifact +removal for CT images. However, most existing deep learning methods for CT are +applied as a post-processing method after reconstruction. Therefore, artifacts +that are relatively difficult to remove in the reconstruction domain may not be +effectively removed by these methods. As an alternative, we propose a +multi-stage deep learning method for artifact removal, in which neural networks +are applied to several domains, similar to a classical CT processing pipeline. +We show that the neural networks can be effectively trained in succession, +resulting in easy-to-use and computationally efficient training. Experiments on +both simulated and real-world experimental datasets show that our method is +effective in reducing artifacts and superior to deep learning-based +post-processing. + +
+
+
+
+
+ + ☆ How Does Forecasting Affect the Convergence of DRL Techniques in O-RAN + Slicing? + + +
+ The success of immersive applications such as virtual reality (VR) gaming and +metaverse services depends on low latency and reliable connectivity. To provide +seamless user experiences, the open radio access network (O-RAN) architecture +and 6G networks are expected to play a crucial role. RAN slicing, a critical +component of the O-RAN paradigm, enables network resources to be allocated +based on the needs of immersive services, creating multiple virtual networks on +a single physical infrastructure. In the O-RAN literature, deep reinforcement +learning (DRL) algorithms are commonly used to optimize resource allocation. +However, the practical adoption of DRL in live deployments has been sluggish. +This is primarily due to the slow convergence and performance instabilities +suffered by the DRL agents both upon initial deployment and when there are +significant changes in network conditions. In this paper, we investigate the +impact of time series forecasting of traffic demands on the convergence of the +DRL-based slicing agents. For that, we conduct an exhaustive experiment that +supports multiple services including real VR gaming traffic. We then propose a +novel forecasting-aided DRL approach and its respective O-RAN practical +deployment workflow to enhance DRL convergence. Our approach shows up to 22.8%, +86.3%, and 300% improvements in the average initial reward value, convergence +rate, and number of converged scenarios respectively, enhancing the +generalizability of the DRL agents compared with the implemented baselines. The +results also indicate that our approach is robust against forecasting errors +and that forecasting models do not have to be ideal. + +
+
+ comment: This article has been accepted for presentation in IEEE GLOBECOM 2023 +
+
+
+
+
+ + ☆ Geometry-aware Line Graph Transformer Pre-training for Molecular + Property Prediction + + +
+ Molecular property prediction with deep learning has gained much attention +over the past years. Owing to the scarcity of labeled molecules, there has been +growing interest in self-supervised learning methods that learn generalizable +molecular representations from unlabeled data. Molecules are typically treated +as 2D topological graphs in modeling, but it has been discovered that their 3D +geometry is of great importance in determining molecular functionalities. In +this paper, we propose the Geometry-aware line graph transformer (Galformer) +pre-training, a novel self-supervised learning framework that aims to enhance +molecular representation learning with 2D and 3D modalities. Specifically, we +first design a dual-modality line graph transformer backbone to encode the +topological and geometric information of a molecule. The designed backbone +incorporates effective structural encodings to capture graph structures from +both modalities. Then we devise two complementary pre-training tasks at the +inter and intra-modality levels. These tasks provide properly supervised +information and extract discriminative 2D and 3D knowledge from unlabeled +molecules. Finally, we evaluate Galformer against six state-of-the-art +baselines on twelve property prediction benchmarks via downstream fine-tuning. +Experimental results show that Galformer consistently outperforms all baselines +on both classification and regression tasks, demonstrating its effectiveness. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ New metrics for analyzing continual learners + + +
+ Deep neural networks have shown remarkable performance when trained on +independent and identically distributed data from a fixed set of classes. +However, in real-world scenarios, it can be desirable to train models on a +continuous stream of data where multiple classification tasks are presented +sequentially. This scenario, known as Continual Learning (CL) poses challenges +to standard learning algorithms which struggle to maintain knowledge of old +tasks while learning new ones. This stability-plasticity dilemma remains +central to CL and multiple metrics have been proposed to adequately measure +stability and plasticity separately. However, none considers the increasing +difficulty of the classification task, which inherently results in performance +loss for any model. In that sense, we analyze some limitations of current +metrics and identify the presence of setup-induced forgetting. Therefore, we +propose new metrics that account for the task's increasing difficulty. Through +experiments on benchmark datasets, we demonstrate that our proposed metrics can +provide new insights into the stability-plasticity trade-off achieved by models +in the continual learning environment. + +
+
+ comment: 6 pages, presented at MIRU 2023 +
+
+
+
+
+ + ☆ A Locality-based Neural Solver for Optical Motion Capture + + +
+ We present a novel locality-based learning method for cleaning and solving +optical motion capture data. Given noisy marker data, we propose a new +heterogeneous graph neural network which treats markers and joints as different +types of nodes, and uses graph convolution operations to extract the local +features of markers and joints and transform them to clean motions. To deal +with anomaly markers (e.g. occluded or with big tracking errors), the key +insight is that a marker's motion shows strong correlations with the motions of +its immediate neighboring markers but less so with other markers, a.k.a. +locality, which enables us to efficiently fill missing markers (e.g. due to +occlusion). Additionally, we also identify marker outliers due to tracking +errors by investigating their acceleration profiles. Finally, we propose a +training regime based on representation learning and data augmentation, by +training the model on data with masking. The masking schemes aim to mimic the +occluded and noisy markers often observed in the real data. Finally, we show +that our method achieves high accuracy on multiple metrics across various +datasets. Extensive comparison shows our method outperforms state-of-the-art +methods in terms of prediction accuracy of occluded marker position error by +approximately 20%, which leads to a further error reduction on the +reconstructed joint rotations and positions by 30%. The code and data for this +paper are available at https://github.com/non-void/LocalMoCap. + +
+
+ comment: Siggraph Asia 2023 Conference Paper +
+
+
+
+
+ + ☆ Declarative Reasoning on Explanations Using Constraint Logic Programming + + +
+ Explaining opaque Machine Learning (ML) models is an increasingly relevant
+problem. Current explainable AI (XAI) methods suffer from several shortcomings,
+among them an insufficient incorporation of background knowledge and a lack of
+abstraction and interactivity with the user. We propose REASONX, an explanation
+method based on Constraint Logic Programming (CLP). REASONX can provide
+declarative, interactive explanations for decision trees, which can be the ML
+models under analysis or global/local surrogate models of any black-box model.
+Users can express background or common sense knowledge using linear constraints
+and MILP optimization over features of factual and contrastive instances, and
+interact with the answer constraints at different levels of abstraction through
+constraint projection. We present here the architecture of REASONX, which
+consists of a Python layer, closer to the user, and a CLP layer. REASONX's core
+execution engine is a Prolog meta-program with declarative semantics in terms
+of logic theories.
+
+ comment: European Conference on Logics in Artificial Intelligence (JELIA 2023) +
+
+
+
+
+ + ☆ Area-norm COBRA on Conditional Survival Prediction + + +
+ This paper explores a variation of the combined regression strategy for
+estimating the conditional survival function. We use regression-based weak
+learners to build the proposed ensemble technique. The proposed combined
+regression strategy uses the area between two survival curves as its proximity
+measure. The proposed model is constructed so that it performs better than the
+Random Survival Forest. The paper also discusses a novel technique for
+selecting the most important variable in the combined regression setup. We
+perform a simulation study to show that our proposal for assessing variable
+relevance works well, and we use three real-life datasets to illustrate the
+model.
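The proximity referred to, the area between two survival curves, can be sketched as follows for right-continuous step functions evaluated on a shared time grid. This is one plausible reading with toy numbers, not the authors' implementation.

```python
import numpy as np

def area_between_survival_curves(times, s1, s2):
    """Integrate |S1(t) - S2(t)| over [times[0], times[-1]] for step functions
    that are constant on each interval between consecutive time points."""
    times, s1, s2 = map(np.asarray, (times, s1, s2))
    widths = np.diff(times)        # length of each interval
    gaps = np.abs(s1 - s2)[:-1]    # |S1 - S2| held constant on each interval
    return float(np.sum(widths * gaps))

times = [0.0, 1.0, 2.5, 4.0, 6.0]
S_model = [1.00, 0.90, 0.70, 0.55, 0.40]
S_other = [1.00, 0.85, 0.80, 0.50, 0.35]
print(area_between_survival_curves(times, S_model, S_other))  # 0.325
```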
+
+
+
+
+ + ☆ Advancing Personalized Federated Learning: Group Privacy, Fairness, and + Beyond + + +
+ Federated learning (FL) is a framework for training machine learning models +in a distributed and collaborative manner. During training, a set of +participating clients process their data stored locally, sharing only the model +updates obtained by minimizing a cost function over their local inputs. FL was +proposed as a stepping-stone towards privacy-preserving machine learning, but +it has been shown vulnerable to issues such as leakage of private information, +lack of personalization of the model, and the possibility of having a trained +model that is fairer to some groups than to others. In this paper, we address +the triadic interaction among personalization, privacy guarantees, and fairness +attained by models trained within the FL framework. Differential privacy and +its variants have been studied and applied as cutting-edge standards for +providing formal privacy guarantees. However, clients in FL often hold very +diverse datasets representing heterogeneous communities, making it important to +protect their sensitive information while still ensuring that the trained model +upholds the aspect of fairness for the users. To attain this objective, a +method is put forth that introduces group privacy assurances through the +utilization of $d$-privacy (aka metric privacy). $d$-privacy represents a +localized form of differential privacy that relies on a metric-oriented +obfuscation approach to maintain the original data's topological distribution. +This method, besides enabling personalized model training in a federated +approach and providing formal privacy guarantees, possesses significantly +better group fairness measured under a variety of standard metrics than a +global model trained within a classical FL template. Theoretical justifications +for the applicability are provided, as well as experimental validation on +real-world datasets to illustrate the working of the proposed method. + +
+
+
+
+
+ + ☆ Selective Scene Text Removal + + +
+ Scene text removal (STR) is the image transformation task to remove text +regions in scene images. The conventional STR methods remove all scene text. +This means that the existing methods cannot select text to be removed. In this +paper, we propose a novel task setting named selective scene text removal +(SSTR) that removes only target words specified by the user. Although SSTR is a +more complex task than STR, the proposed multi-module structure enables +efficient training for SSTR. Experimental results show that the proposed method +can remove target words as expected. + +
+
+ comment: 12 pages, 8 figures, Accepted at the 34th British Machine Vision + Conference +
+
+
+
+
+ + ☆ Learning multi-modal generative models with permutation-invariant + encoders and tighter variational bounds + + +
+ Devising deep latent variable models for multi-modal data has been a +long-standing theme in machine learning research. Multi-modal Variational +Autoencoders (VAEs) have been a popular generative model class that learns +latent representations which jointly explain multiple modalities. Various +objective functions for such models have been suggested, often motivated as +lower bounds on the multi-modal data log-likelihood or from +information-theoretic considerations. In order to encode latent variables from +different modality subsets, Product-of-Experts (PoE) or Mixture-of-Experts +(MoE) aggregation schemes have been routinely used and shown to yield different +trade-offs, for instance, regarding their generative quality or consistency +across multiple modalities. In this work, we consider a variational bound that +can tightly lower bound the data log-likelihood. We develop more flexible +aggregation schemes that generalise PoE or MoE approaches by combining encoded +features from different modalities based on permutation-invariant neural +networks. Our numerical experiments illustrate trade-offs for multi-modal +variational bounds and various aggregation schemes. We show that tighter +variational bounds and more flexible aggregation models can become beneficial +when one wants to approximate the true joint distribution over observed +modalities and latent variables in identifiable models. + +
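A Deep Sets-style pooling is one concrete form a permutation-invariant aggregation over encoded modality features can take. The sketch below is an illustrative stand-in for such an aggregator; the layer sizes, the sum pooling, and the Gaussian-posterior head are assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class SetAggregator(nn.Module):
    """Permutation-invariant aggregation of per-modality features (Deep Sets style)."""
    def __init__(self, feat_dim, hidden_dim, latent_dim):
        super().__init__()
        self.phi = nn.Sequential(nn.Linear(feat_dim, hidden_dim), nn.ReLU(),
                                 nn.Linear(hidden_dim, hidden_dim))
        self.rho = nn.Linear(hidden_dim, 2 * latent_dim)  # -> (mu, log_var)

    def forward(self, features):
        # features: list of tensors, one per observed modality, each (batch, feat_dim).
        pooled = torch.stack([self.phi(f) for f in features], dim=0).sum(dim=0)
        mu, log_var = self.rho(pooled).chunk(2, dim=-1)
        return mu, log_var

agg = SetAggregator(feat_dim=16, hidden_dim=32, latent_dim=8)
x_img, x_txt = torch.randn(4, 16), torch.randn(4, 16)
mu1, _ = agg([x_img, x_txt])
mu2, _ = agg([x_txt, x_img])     # order of modalities does not matter
print(torch.allclose(mu1, mu2))  # True
```

Because the pooling is a sum over a set, the same module handles any subset of modalities, which is what makes it usable for encoding arbitrary modality subsets.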
+
+
+
+
+ + ☆ Anomaly detection with semi-supervised classification based on risk + estimators + + +
+ A significant limitation of one-class classification anomaly detection +methods is their reliance on the assumption that unlabeled training data only +contains normal instances. To overcome this impractical assumption, we propose +two novel classification-based anomaly detection methods. Firstly, we introduce +a semi-supervised shallow anomaly detection method based on an unbiased risk +estimator. Secondly, we present a semi-supervised deep anomaly detection method +utilizing a nonnegative (biased) risk estimator. We establish estimation error +bounds and excess risk bounds for both risk minimizers. Additionally, we +propose techniques to select appropriate regularization parameters that ensure +the nonnegativity of the empirical risk in the shallow model under specific +loss functions. Our extensive experiments provide strong evidence of the +effectiveness of the risk-based anomaly detection methods. + +
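The flavor of a nonnegative (biased) risk estimator can be conveyed with a PU-learning-style correction, in which the risk attributed to the normal class is clipped at zero before being minimized. This is a generic sketch under that analogy, with an assumed anomaly prior and toy scores, not the estimator derived in the paper.

```python
import numpy as np

def sigmoid_loss(scores, y):
    """Smooth surrogate loss; y in {+1, -1} (here +1 = anomaly)."""
    return 1.0 / (1.0 + np.exp(y * scores))

def non_negative_risk(scores_lab, scores_unl, pi):
    """Nonnegative risk estimate from labeled anomalies and unlabeled data.
    pi: assumed anomaly prior in the unlabeled pool. The max(0, .) clip keeps the
    estimated normal-class risk from going negative (the source of the bias)."""
    r_pos = np.mean(sigmoid_loss(scores_lab, +1))
    r_neg_unl = np.mean(sigmoid_loss(scores_unl, -1))
    r_neg_pos = np.mean(sigmoid_loss(scores_lab, -1))
    r_neg = max(0.0, r_neg_unl - pi * r_neg_pos)
    return pi * r_pos + r_neg

rng = np.random.default_rng(0)
scores_lab = rng.normal(1.5, 1.0, size=50)    # scores for labeled anomalies
scores_unl = rng.normal(-0.5, 1.5, size=500)  # scores for unlabeled data
print(non_negative_risk(scores_lab, scores_unl, pi=0.1))
```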
+
+
+
+
+ + ☆ Where Did the Gap Go? Reassessing the Long-Range Graph Benchmark + + +
+ The recent Long-Range Graph Benchmark (LRGB, Dwivedi et al. 2022) introduced +a set of graph learning tasks strongly dependent on long-range interaction +between vertices. Empirical evidence suggests that on these tasks Graph +Transformers significantly outperform Message Passing GNNs (MPGNNs). In this +paper, we carefully reevaluate multiple MPGNN baselines as well as the Graph +Transformer GPS (Ramp\'a\v{s}ek et al. 2022) on LRGB. Through a rigorous +empirical analysis, we demonstrate that the reported performance gap is +overestimated due to suboptimal hyperparameter choices. It is noteworthy that +across multiple datasets the performance gap completely vanishes after basic +hyperparameter optimization. In addition, we discuss the impact of lacking +feature normalization for LRGB's vision datasets and highlight a spurious +implementation of LRGB's link prediction metric. The principal aim of our paper +is to establish a higher standard of empirical rigor within the graph machine +learning community. + +
+
+
+
+
+ + ☆ FederatedScope-LLM: A Comprehensive Package for Fine-tuning Large + Language Models in Federated Learning + + +
+ LLMs have demonstrated great capabilities in various NLP tasks. Different
+entities can further improve the performance of those LLMs on their specific
+downstream tasks by fine-tuning LLMs. When several entities have similar tasks
+of interest, but their data cannot be shared because of privacy concerns or
+regulations, federated learning (FL) is a mainstream solution for leveraging
+the data of different entities. However, fine-tuning LLMs in federated learning
+settings still lacks adequate support from existing FL frameworks because it
+has to deal with optimizing the consumption of significant communication and
+computational resources, data preparation for different tasks, and distinct
+information protection demands. This paper first discusses these challenges of
+federated fine-tuning of LLMs and introduces our package FS-LLM as a main
+contribution, which consists of the following components: (1) we build an
+end-to-end benchmarking pipeline, automating the processes of dataset
+preprocessing, federated fine-tuning execution, and performance evaluation on
+federated LLM fine-tuning; (2) we provide comprehensive federated
+parameter-efficient fine-tuning algorithm implementations and versatile
+programming interfaces for future extension in FL scenarios with low
+communication and computation costs, even without accessing the full model; (3)
+we adopt several accelerating and resource-efficient operators for fine-tuning
+LLMs with limited resources and flexible pluggable sub-routines for
+interdisciplinary study. We conduct extensive experiments to validate the
+effectiveness of FS-LLM and benchmark advanced LLMs with state-of-the-art
+parameter-efficient fine-tuning algorithms in FL settings, which also yields
+valuable insights into federated fine-tuning of LLMs for the research
+community. To facilitate further research and adoption, we release FS-LLM at
+https://github.com/alibaba/FederatedScope/tree/llm.
+
+ comment: Source code: https://github.com/alibaba/FederatedScope/tree/llm +
+
+
+
+
+ + ☆ Explainable Active Learning for Preference Elicitation + + +
+ Gaining insights into the preferences of new users and subsequently
+personalizing recommendations necessitates managing user interactions
+intelligently, namely, posing pertinent questions to elicit valuable
+information effectively. In this study, our focus is on a specific scenario of
+the cold-start problem, where the recommendation system lacks adequate user
+presence or access to other users' data is restricted, which prevents the use
+of user profiling methods that rely on existing data in the system. We employ
+Active Learning (AL) to address this problem, with the objective of maximizing
+information acquisition with minimal user effort. AL selects informative data
+from a large unlabeled set, queries an oracle to label them, and eventually
+updates a machine learning (ML) model. We operate AL in an integrated process
+of unsupervised, semi-supervised, and supervised ML within an explanatory
+preference elicitation process. It harvests user feedback (given on the
+system's explanations of the presented items) over informative samples to
+update an underlying ML model estimating user preferences. The designed user
+interaction facilitates personalizing the system by incorporating user feedback
+into the ML model and also enhances user trust by refining the system's
+explanations of recommendations. We implement the proposed preference
+elicitation methodology for food recommendation. We conducted human experiments
+to assess its efficacy in the short term and also experimented with several AL
+strategies over synthetic user profiles that we created for two food datasets,
+aiming for long-term performance analysis. The experimental results demonstrate
+the efficiency of the proposed preference elicitation with limited user-labeled
+data, while also enhancing user trust through accurate explanations.
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Bespoke Nanoparticle Synthesis and Chemical Knowledge Discovery Via + Autonomous Experimentations + + +
+ The optimization of nanomaterial synthesis over numerous synthetic variables
+is an extremely laborious task because conventional combinatorial exploration
+is prohibitively expensive. In this work, we report an autonomous
+experimentation platform developed for the bespoke design of nanoparticles
+(NPs) with targeted optical properties. This platform operates in a closed-loop
+manner between a batch synthesis module for NPs and a UV-Vis spectroscopy
+module, based on feedback from the AI optimization model. With silver (Ag) NPs
+as a representative example, we demonstrate that the Bayesian optimizer,
+implemented with an early stopping criterion, can efficiently produce Ag NPs
+that precisely possess the desired absorption spectra within only 200
+iterations (when optimizing among five synthetic reagents). Beyond this
+outstanding material development efficiency, the analysis of synthetic
+variables further reveals novel chemistry involving the effects of citrate in
+Ag NP synthesis. The amount of citrate is key to controlling the competition
+between spherical and plate-shaped NPs and, as a result, affects the shapes of
+the absorption spectra as well. Our study highlights the platform's ability
+both to enhance search efficiency and to provide novel chemical knowledge by
+analyzing datasets accumulated from the autonomous experiments.
+
+
+
+
+ + ☆ Multitask Deep Learning for Accurate Risk Stratification and Prediction + of Next Steps for Coronary CT Angiography Patients + + +
+ Diagnostic investigation has an important role in risk stratification and
+clinical decision making for patients with suspected and documented Coronary
+Artery Disease (CAD). However, the majority of existing tools are primarily
+focused on the selection of gatekeeper tests, whereas only a handful of systems
+contain information regarding downstream testing or treatment. We propose a
+multi-task deep learning model to support risk stratification and downstream
+test selection for patients undergoing Coronary Computed Tomography Angiography
+(CCTA). The analysis included 14,021 patients who underwent CCTA between 2006
+and 2017. Our novel multitask deep learning framework extends the
+state-of-the-art Perceiver model to deal with real-world CCTA report data. Our
+model achieved an Area Under the receiver operating characteristic Curve (AUC)
+of 0.76 in CAD risk stratification and an AUC of 0.72 in predicting downstream
+tests. Our proposed deep learning model can accurately estimate the likelihood
+of CAD and provide recommended downstream tests based on prior CCTA data. In
+clinical practice, the utilization of such an approach could bring a paradigm
+shift in risk stratification and downstream management. Despite significant
+progress using deep learning models for tabular data, they do not outperform
+gradient boosting decision trees, and further research is required in this
+area. However, neural networks appear to benefit more readily from multi-task
+learning than tree-based models. This could offset the shortcomings of using a
+single-task learning approach when working with tabular data.
+
+
+
+
+ + ☆ Mi-Go: Test Framework which uses YouTube as Data Source for Evaluating + Speech Recognition Models like OpenAI's Whisper + + +
+ This article introduces Mi-Go, a novel testing framework aimed at evaluating +the performance and adaptability of general-purpose speech recognition machine +learning models across diverse real-world scenarios. The framework leverages +YouTube as a rich and continuously updated data source, accounting for multiple +languages, accents, dialects, speaking styles, and audio quality levels. To +demonstrate the effectiveness of the framework, the Whisper model, developed by +OpenAI, was employed as a test object. The tests involve using a total of 124 +YouTube videos to test all Whisper model versions. The results underscore the +utility of YouTube as a valuable testing platform for speech recognition +models, ensuring their robustness, accuracy, and adaptability to diverse +languages and acoustic conditions. Additionally, by contrasting the +machine-generated transcriptions against human-made subtitles, the Mi-Go +framework can help pinpoint potential misuse of YouTube subtitles, like Search +Engine Optimization. + +
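Contrasting machine-generated transcriptions with human-made subtitles typically reduces to a word error rate computation. Below is a small self-contained WER implementation of the kind such a comparison could use; the example strings are made up, and this is not the Mi-Go code itself.

```python
def word_error_rate(reference, hypothesis):
    """Levenshtein distance over words (substitutions + insertions + deletions),
    normalized by the reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
    return d[len(ref)][len(hyp)] / max(len(ref), 1)

subtitles = "the quick brown fox jumps over the lazy dog"
transcript = "the quick brown fox jumped over a lazy dog"
print(round(word_error_rate(subtitles, transcript), 3))  # 0.222
```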
+
+ comment: 25 pages, 9 tables, 3 figures +
+
+
+
+
+ + ☆ Multi-fidelity reduced-order surrogate modeling + + +
+ High-fidelity numerical simulations of partial differential equations (PDEs) +given a restricted computational budget can significantly limit the number of +parameter configurations considered and/or time window evaluated for modeling a +given system. Multi-fidelity surrogate modeling aims to leverage less accurate, +lower-fidelity models that are computationally inexpensive in order to enhance +predictive accuracy when high-fidelity data are limited or scarce. However, +low-fidelity models, while often displaying important qualitative +spatio-temporal features, fail to accurately capture the onset of instability +and critical transients observed in the high-fidelity models, making them +impractical as surrogate models. To address this shortcoming, we present a new +data-driven strategy that combines dimensionality reduction with multi-fidelity +neural network surrogates. The key idea is to generate a spatial basis by +applying the classical proper orthogonal decomposition (POD) to high-fidelity +solution snapshots, and approximate the dynamics of the reduced states - +time-parameter-dependent expansion coefficients of the POD basis - using a +multi-fidelity long-short term memory (LSTM) network. By mapping low-fidelity +reduced states to their high-fidelity counterpart, the proposed reduced-order +surrogate model enables the efficient recovery of full solution fields over +time and parameter variations in a non-intrusive manner. The generality and +robustness of this method is demonstrated by a collection of parametrized, +time-dependent PDE problems where the low-fidelity model can be defined by +coarser meshes and/or time stepping, as well as by misspecified physical +features. Importantly, the onset of instabilities and transients are well +captured by this surrogate modeling technique. + +
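The "reduced states" are the expansion coefficients of solution snapshots in a POD basis. A compact sketch of that first step (basis from an SVD of a snapshot matrix, then projection) is given below with random stand-in data; the multi-fidelity LSTM that the paper trains on these coefficients is omitted.

```python
import numpy as np

def pod_basis(snapshots, r):
    """snapshots: (n_dof, n_snap) matrix of high-fidelity solution snapshots.
    Returns the first r POD modes and the singular values."""
    U, s, _ = np.linalg.svd(snapshots, full_matrices=False)
    return U[:, :r], s

def reduce_states(snapshots, basis):
    """Project full states onto the POD basis -> (r, n_snap) reduced coefficients."""
    return basis.T @ snapshots

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 40))   # toy snapshot matrix (real data would come from a PDE solver)
Phi, s = pod_basis(X, r=5)
A = reduce_states(X, Phi)         # reduced states a multi-fidelity LSTM could learn to map
X_rec = Phi @ A                   # rank-5 reconstruction of the snapshots
print(A.shape, np.linalg.norm(X - X_rec) / np.linalg.norm(X))
```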
+
+
+
+
+ + ☆ Efficient Surrogate Models for Materials Science Simulations: Machine + Learning-based Prediction of Microstructure Properties + + +
+ Determining, understanding, and predicting the so-called structure-property
+relation is an important task in many scientific disciplines, such as
+chemistry, biology, meteorology, physics, engineering, and materials science.
+Structure refers to the spatial distribution of, e.g., substances, material, or
+matter in general, while property is a resulting characteristic that usually
+depends in a non-trivial way on spatial details of the structure.
+Traditionally, forward simulation models have been used for such tasks.
+Recently, several machine learning algorithms have been applied in these
+scientific fields to enhance and accelerate simulation models or as surrogate
+models. In this work, we develop and investigate the applications of six
+machine learning techniques based on two different datasets from the domain of
+materials science: data from a two-dimensional Ising model for predicting the
+formation of magnetic domains and data representing the evolution of dual-phase
+microstructures from the Cahn-Hilliard model. We analyze the accuracy and
+robustness of all models and elucidate the reasons for the differences in their
+performance. The impact of including domain knowledge through tailored features
+is studied, and general recommendations based on the availability and quality
+of training data are derived from this.
+
+
+
+
+ + ☆ End-to-end Lidar-Driven Reinforcement Learning for Autonomous Racing + + +
+ Reinforcement Learning (RL) has emerged as a transformative approach in the
+domains of automation and robotics, offering powerful solutions to complex
+problems that conventional methods struggle to address. In scenarios where
+problem definitions are elusive and challenging to quantify, learning-based
+solutions such as RL become particularly valuable. One instance of such
+complexity can be found in the realm of car racing, a dynamic and unpredictable
+environment that demands sophisticated decision-making algorithms. This study
+focuses on developing and training an RL agent to navigate a racing environment
+solely using feedforward raw lidar and velocity data in a simulated context.
+The agent's performance, trained in the simulation environment, is then
+experimentally evaluated in a real-world racing scenario. This exploration
+underlines the feasibility and potential benefits of RL algorithms for
+enhancing autonomous racing performance, especially in environments where prior
+map information is not available.
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ RLAIF: Scaling Reinforcement Learning from Human Feedback with AI + Feedback + + +
+ Reinforcement learning from human feedback (RLHF) is effective at aligning +large language models (LLMs) to human preferences, but gathering high quality +human preference labels is a key bottleneck. We conduct a head-to-head +comparison of RLHF vs. RL from AI Feedback (RLAIF) - a technique where +preferences are labeled by an off-the-shelf LLM in lieu of humans, and we find +that they result in similar improvements. On the task of summarization, human +evaluators prefer generations from both RLAIF and RLHF over a baseline +supervised fine-tuned model in ~70% of cases. Furthermore, when asked to rate +RLAIF vs. RLHF summaries, humans prefer both at equal rates. These results +suggest that RLAIF can yield human-level performance, offering a potential +solution to the scalability limitations of RLHF. + +
+
+
+
+
+ + ☆ Leveraging Learning Metrics for Improved Federated Learning + + +
+ Currently, in the federated setting, no learning schemes leverage the
+emerging research on explainable artificial intelligence (XAI), in particular
+the novel learning metrics that help determine how well a model is learning.
+One of these novel learning metrics is termed `Effective Rank' (ER), which
+measures the Shannon entropy of the singular values of a matrix and thus
+provides a metric of how well a layer maps its inputs. By joining federated
+learning with the effective rank learning metric, this work will \textbf{(1)}
+give the first federated learning metric aggregation method, \textbf{(2)} show
+that effective rank is well-suited to federated problems by outperforming
+baseline Federated Averaging \cite{konevcny2016federated}, and \textbf{(3)}
+develop a novel weight-aggregation scheme relying on effective rank.
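Effective rank as described, an entropy of the normalized singular values, is only a few lines of NumPy. The sketch below follows the common exponentiated-entropy convention (Roy & Vetterli), which may differ in detail from the formulation used in the thesis.

```python
import numpy as np

def effective_rank(W, eps=1e-12):
    """Effective rank of a weight matrix: exp of the Shannon entropy of the
    distribution obtained by normalizing the singular values."""
    s = np.linalg.svd(W, compute_uv=False)
    p = s / max(s.sum(), eps)
    entropy = -np.sum(p * np.log(p + eps))
    return float(np.exp(entropy))

rng = np.random.default_rng(0)
print(effective_rank(rng.normal(size=(64, 64))))                            # high for a dense noise matrix
print(effective_rank(np.outer(rng.normal(size=64), rng.normal(size=64))))   # ~1 for a rank-one matrix
```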
+
+ comment: Bachelor's thesis +
+
+
+
+
+ + ☆ SortedNet, a Place for Every Network and Every Network in its Place: + Towards a Generalized Solution for Training Many-in-One Neural Networks + + +
+ As the size of deep learning models continues to grow, finding optimal models +under memory and computation constraints becomes increasingly more important. +Although usually the architecture and constituent building blocks of neural +networks allow them to be used in a modular way, their training process is not +aware of this modularity. Consequently, conventional neural network training +lacks the flexibility to adapt the computational load of the model during +inference. This paper proposes SortedNet, a generalized and scalable solution +to harness the inherent modularity of deep neural networks across various +dimensions for efficient dynamic inference. Our training considers a nested +architecture for the sub-models with shared parameters and trains them together +with the main model in a sorted and probabilistic manner. This sorted training +of sub-networks enables us to scale the number of sub-networks to hundreds +using a single round of training. We utilize a novel updating scheme during +training that combines random sampling of sub-networks with gradient +accumulation to improve training efficiency. Furthermore, the sorted nature of +our training leads to a search-free sub-network selection at inference time; +and the nested architecture of the resulting sub-networks leads to minimal +storage requirement and efficient switching between sub-networks at inference. +Our general dynamic training approach is demonstrated across various +architectures and tasks, including large language models and pre-trained vision +models. Experimental results show the efficacy of the proposed approach in +achieving efficient sub-networks while outperforming state-of-the-art dynamic +training approaches. Our findings demonstrate the feasibility of training up to +160 different sub-models simultaneously, showcasing the extensive scalability +of our proposed method while maintaining 96% of the model performance. + +
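The nested sub-model idea can be pictured with a toy MLP whose hidden layer is truncated to its leading units, so every smaller model is a prefix of the larger one and all widths share parameters. The sketch below only illustrates sampling such sub-networks during training; the model, widths, and update scheme are illustrative assumptions, not the released method.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class PrefixMLP(nn.Module):
    """Two-layer MLP whose hidden layer can be truncated to its first `width` units,
    giving a family of nested sub-models that share parameters."""
    def __init__(self, d_in, d_hidden, d_out):
        super().__init__()
        self.w1 = nn.Linear(d_in, d_hidden)
        self.w2 = nn.Linear(d_hidden, d_out)

    def forward(self, x, width=None):
        width = width or self.w1.out_features
        h = F.relu(F.linear(x, self.w1.weight[:width], self.w1.bias[:width]))
        return F.linear(h, self.w2.weight[:, :width], self.w2.bias)

model = PrefixMLP(16, 128, 10)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
x, y = torch.randn(32, 16), torch.randint(0, 10, (32,))
for step in range(10):
    width = int(torch.randint(8, 129, (1,)))   # sample a sub-network width each step
    loss = F.cross_entropy(model(x, width), y)
    opt.zero_grad(); loss.backward(); opt.step()
print(model(x, width=16).shape)                # any prefix width is usable at inference
```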
+
+
+
+
+ + ☆ Why do universal adversarial attacks work on large language models?: + Geometry might be the answer + + +
+ Transformer based large language models with emergent capabilities are +becoming increasingly ubiquitous in society. However, the task of understanding +and interpreting their internal workings, in the context of adversarial +attacks, remains largely unsolved. Gradient-based universal adversarial attacks +have been shown to be highly effective on large language models and potentially +dangerous due to their input-agnostic nature. This work presents a novel +geometric perspective explaining universal adversarial attacks on large +language models. By attacking the 117M parameter GPT-2 model, we find evidence +indicating that universal adversarial triggers could be embedding vectors which +merely approximate the semantic information in their adversarial training +region. This hypothesis is supported by white-box model analysis comprising +dimensionality reduction and similarity measurement of hidden representations. +We believe this new geometric perspective on the underlying mechanism driving +universal attacks could help us gain deeper insight into the internal workings +and failure modes of LLMs, thus enabling their mitigation. + +
+
+ comment: 2nd AdvML Frontiers Workshop at 40th International Conference on + Machine Learning, Honolulu, Hawaii, USA, 2023 +
+
+
+
+
+ + ☆ Interpretable Medical Imagery Diagnosis with Self-Attentive + Transformers: A Review of Explainable AI for Health Care + + +
+ Recent advancements in artificial intelligence (AI) have facilitated its +widespread adoption in primary medical services, addressing the demand-supply +imbalance in healthcare. Vision Transformers (ViT) have emerged as +state-of-the-art computer vision models, benefiting from self-attention +modules. However, compared to traditional machine-learning approaches, +deep-learning models are complex and are often treated as a "black box" that +can cause uncertainty regarding how they operate. Explainable Artificial +Intelligence (XAI) refers to methods that explain and interpret machine +learning models' inner workings and how they come to decisions, which is +especially important in the medical domain to guide the healthcare +decision-making process. This review summarises recent ViT advancements and +interpretative approaches to understanding the decision-making process of ViT, +enabling transparency in medical diagnosis applications. + +
+
+
+
+
+ + ☆ NeuroSurgeon: A Toolkit for Subnetwork Analysis + + +
+ Despite recent advances in the field of explainability, much remains unknown +about the algorithms that neural networks learn to represent. Recent work has +attempted to understand trained models by decomposing them into functional +circuits (Csord\'as et al., 2020; Lepori et al., 2023). To advance this +research, we developed NeuroSurgeon, a python library that can be used to +discover and manipulate subnetworks within models in the Huggingface +Transformers library (Wolf et al., 2019). NeuroSurgeon is freely available at +https://github.com/mlepori1/NeuroSurgeon. + +
+
+
+
+
+ + ☆ Image Hijacking: Adversarial Images can Control Generative Models at + Runtime + + +
+ Are foundation models secure from malicious actors? In this work, we focus on +the image input to a vision-language model (VLM). We discover image hijacks, +adversarial images that control generative models at runtime. We introduce +Behavior Matching, a general method for creating image hijacks, and we use it +to explore three types of attacks. Specific string attacks generate arbitrary +output of the adversary's choosing. Leak context attacks leak information from +the context window into the output. Jailbreak attacks circumvent a model's +safety training. We study these attacks against LLaVA-2, a state-of-the-art VLM +based on CLIP and LLaMA-2, and find that all our attack types have above a 90\% +success rate. Moreover, our attacks are automated and require only small image +perturbations. These findings raise serious concerns about the security of +foundation models. If image hijacks are as difficult to defend against as +adversarial examples in CIFAR-10, then it might be many years before a solution +is found -- if it even exists. + +
+
+ comment: Code is available at https://github.com/euanong/image-hijacks +
+
+
+
+
+ + ☆ Data-Driven Projection for Reducing Dimensionality of Linear Programs: + Generalization Bound and Learning Methods + + +
+ This paper studies a simple data-driven approach to high-dimensional linear +programs (LPs). Given data of past $n$-dimensional LPs, we learn an $n\times k$ +\textit{projection matrix} ($n > k$), which reduces the dimensionality from $n$ +to $k$. Then, we address future LP instances by solving $k$-dimensional LPs and +recovering $n$-dimensional solutions by multiplying the projection matrix. This +idea is compatible with any user-preferred LP solvers, hence a versatile +approach to faster LP solving. One natural question is: how much data is +sufficient to ensure the recovered solutions' quality? We address this question +based on the idea of \textit{data-driven algorithm design}, which relates the +amount of data sufficient for generalization guarantees to the +\textit{pseudo-dimension} of performance metrics. We present an +$\tilde{\mathrm{O}}(nk^2)$ upper bound on the pseudo-dimension +($\tilde{\mathrm{O}}$ compresses logarithmic factors) and complement it by an +$\Omega(nk)$ lower bound, hence tight up to an $\tilde{\mathrm{O}}(k)$ factor. +On the practical side, we study two natural methods for learning projection +matrices: PCA- and gradient-based methods. While the former is simple and +efficient, the latter sometimes leads to better solution quality. Experiments +confirm that learned projection matrices are beneficial for reducing the time +for solving LPs while maintaining high solution quality. + +
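The solve-reduce-recover pipeline is easy to demonstrate with an off-the-shelf solver. The sketch below uses a random matrix as a stand-in for the learned projection and SciPy's `linprog` for the reduced problem; the instance is a toy one constructed so the reduced LP is feasible and bounded, whereas the paper learns the projection from past instances.

```python
import numpy as np
from scipy.optimize import linprog

rng = np.random.default_rng(0)
n, k, m = 200, 10, 100
A = rng.normal(size=(m, n))
b = np.abs(rng.normal(size=m)) + 1.0      # b > 0, so x = 0 (hence y = 0) is feasible
c = rng.normal(size=n)
P = rng.normal(size=(n, k)) / np.sqrt(n)  # stand-in for a learned projection matrix

# Reduced LP over y: minimize (P^T c)^T y  s.t.  (A P) y <= b, with a box to keep it bounded.
res = linprog(P.T @ c, A_ub=A @ P, b_ub=b, bounds=[(-10, 10)] * k, method="highs")
x_hat = P @ res.x                         # recovered n-dimensional (sub)optimal point

print(res.status, c @ x_hat, np.max(A @ x_hat - b) <= 1e-6)
```

Any solution of the reduced problem lifts to a feasible point of the original LP, which is why only the quality of the recovered objective value depends on how well the projection is learned.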
+
+
+
+
+ + ☆ Subjectivity in Unsupervised Machine Learning Model Selection + + +
+ Model selection is a necessary step in unsupervised machine learning. Despite +numerous criteria and metrics, model selection remains subjective. A high +degree of subjectivity may lead to questions about repeatability and +reproducibility of various machine learning studies and doubts about the +robustness of models deployed in the real world. Yet, the impact of modelers' +preferences on model selection outcomes remains largely unexplored. This study +uses the Hidden Markov Model as an example to investigate the subjectivity +involved in model selection. We asked 33 participants and three Large Language +Models (LLMs) to make model selections in three scenarios. Results revealed +variability and inconsistencies in both the participants' and the LLMs' +choices, especially when different criteria and metrics disagree. Sources of +subjectivity include varying opinions on the importance of different criteria +and metrics, differing views on how parsimonious a model should be, and how the +size of a dataset should influence model selection. The results underscore the +importance of developing a more standardized way to document subjective choices +made in model selection processes. + +
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ☆ Diffusion Model with Clustering-based Conditioning for Food Image + Generation + + +
+ Image-based dietary assessment serves as an efficient and accurate solution +for recording and analyzing nutrition intake using eating occasion images as +input. Deep learning-based techniques are commonly used to perform image +analysis such as food classification, segmentation, and portion size +estimation, which rely on large amounts of food images with annotations for +training. However, such data dependency poses significant barriers to +real-world applications, because acquiring a substantial, diverse, and balanced +set of food images can be challenging. One potential solution is to use +synthetic food images for data augmentation. Although existing work has +explored the use of generative adversarial networks (GAN) based structures for +generation, the quality of synthetic food images still remains subpar. In +addition, while diffusion-based generative models have shown promising results +for general image generation tasks, the generation of food images can be +challenging due to the substantial intra-class variance. In this paper, we +investigate the generation of synthetic food images based on the conditional +diffusion model and propose an effective clustering-based training framework, +named ClusDiff, for generating high-quality and representative food images. The +proposed method is evaluated on the Food-101 dataset and shows improved +performance when compared with existing image generation works. We also +demonstrate that the synthetic food images generated by ClusDiff can help +address the severe class imbalance issue in long-tailed food classification +using the VFN-LT dataset. + +
+
+ comment: Accepted for 31st ACM International Conference on Multimedia: 8th + International Workshop on Multimedia Assisted Dietary Management (MADiMa + 2023) +
+
+
+
+
+ + ☆ Deep-learning-based Early Fixing for Gas-lifted Oil Production + Optimization: Supervised and Weakly-supervised Approaches + + +
+ Maximizing oil production from gas-lifted oil wells entails solving
+Mixed-Integer Linear Programs (MILPs). As the parameters of the wells, such as
+the basic-sediment-to-water ratio and the gas-oil ratio, are updated, the
+problems must be repeatedly solved. Instead of relying on costly exact methods
+or the accuracy of general approximate methods, in this paper, we propose a
+tailor-made heuristic solution based on deep learning models trained to provide
+values for all integer variables given varying well parameters, early-fixing
+the integer variables and, thus, reducing the original problem to a linear
+program (LP). We propose two approaches for developing the learning-based
+heuristic: a supervised learning approach, which requires the optimal integer
+values for several instances of the original problem in the training set, and a
+weakly-supervised learning approach, which requires only solutions of the
+early-fixed linear problems with random assignments for the integer variables.
+Our results show a runtime reduction of 71.11%. Furthermore, the
+weakly-supervised learning model provided effective values for early fixing,
+despite never seeing the optimal values during training.
+
+ comment: Paper accepted at SBAI 2023 +
+
+
+
+
+ + ♻ ☆ LEVER: Learning to Verify Language-to-Code Generation with Execution ICML'23 + + +
+ The advent of large language models trained on code (code LLMs) has led to
+significant progress in language-to-code generation. State-of-the-art
+approaches in this area combine LLM decoding with sample pruning and reranking
+using test cases or heuristics based on the execution results. However, it is
+challenging to obtain test cases for many real-world language-to-code
+applications, and heuristics cannot capture well the semantic features of the
+execution results, such as data type and value range, which often indicate the
+correctness of the program. In this work, we propose LEVER, a simple approach
+to improve language-to-code generation by learning to verify the generated
+programs with their execution results. Specifically, we train verifiers to
+determine whether a program sampled from the LLMs is correct or not based on
+the natural language input, the program itself, and its execution results. The
+sampled programs are reranked by combining the verification score with the LLM
+generation probability, and marginalizing over programs with the same execution
+results. On four datasets across the domains of table QA, math QA and basic
+Python programming, LEVER consistently improves over the base code LLMs (4.6%
+to 10.9% with code-davinci-002) and achieves new state-of-the-art results on
+all of them.
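The reranking rule described, combining the LM probability with the verifier score and marginalizing over programs that share an execution result, can be written in a few lines. Below is a schematic version with placeholder scores and toy programs, not the released LEVER code.

```python
from collections import defaultdict

def rerank_by_execution(samples):
    """samples: list of dicts with keys 'program', 'exec_result', 'p_lm', 'p_verifier'.
    Programs sharing an execution result pool their (p_lm * p_verifier) mass; the
    result with the largest pooled score wins, and its highest-scoring program is kept."""
    pooled = defaultdict(float)
    best_prog = {}
    for s in samples:
        score = s["p_lm"] * s["p_verifier"]
        key = s["exec_result"]
        pooled[key] += score
        if key not in best_prog or score > best_prog[key][0]:
            best_prog[key] = (score, s["program"])
    answer = max(pooled, key=pooled.get)
    return answer, best_prog[answer][1]

samples = [
    {"program": "sum(xs)/len(xs)",        "exec_result": 4.0, "p_lm": 0.30, "p_verifier": 0.9},
    {"program": "mean = sum(xs)/len(xs)", "exec_result": 4.0, "p_lm": 0.20, "p_verifier": 0.8},
    {"program": "max(xs)",                "exec_result": 9.0, "p_lm": 0.45, "p_verifier": 0.2},
]
print(rerank_by_execution(samples))  # (4.0, 'sum(xs)/len(xs)')
```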
+
+ comment: ICML'23; code available at https://github.com/niansong1996/lever +
+
+
+
+
+ + ♻ ☆ Neural-network quantum state study of the long-range antiferromagnetic + Ising chain + + +
+ We investigate quantum phase transitions in the transverse field Ising chain +with algebraically decaying long-range antiferromagnetic interactions by using +the variational Monte Carlo method with the restricted Boltzmann machine being +employed as a trial wave function ansatz. In the finite-size scaling analysis +with the order parameter and the second R\'enyi entropy, we find that the +central charge deviates from 1/2 at a small decay exponent $\alpha_\mathrm{LR}$ +in contrast to the critical exponents staying very close to the short-range +(SR) Ising values regardless of $\alpha_\mathrm{LR}$ examined, supporting the +previously proposed scenario of conformal invariance breakdown. To identify the +threshold of the Ising universality and the conformal symmetry, we perform two +additional tests for the universal Binder ratio and the conformal field theory +(CFT) description of the correlation function. It turns out that both indicate +a noticeable deviation from the SR Ising class at $\alpha_\mathrm{LR} < 2$. +However, a closer look at the scaled correlation function for +$\alpha_\mathrm{LR} \ge 2$ shows a gradual change from the asymptotic line of +the CFT verified at $\alpha_\mathrm{LR} = 3$, providing a rough estimate of the +threshold being in the range of $2 \lesssim \alpha_\mathrm{LR} < 3$. + +
+
+
+
+
+ + ♻ ☆ Activation Addition: Steering Language Models Without Optimization + + +
+ Reliably controlling the behavior of large language models is a pressing open +problem. Existing methods include supervised finetuning, reinforcement learning +from human feedback, prompt engineering, and guided decoding. We instead +investigate activation engineering: modifying activations at inference time to +predictably alter model behavior. In particular, we bias the forward pass with +an added 'steering vector' implicitly specified through natural language. + Unlike past work which learned these steering vectors, our Activation +Addition (ActAdd) method computes them by taking the activation differences +that result from pairs of prompts. We demonstrate ActAdd on GPT-2 on +OpenWebText and ConceptNet. Our inference-time approach yields control over +high-level properties of output and preserves off-target model performance. It +involves far less compute and implementation effort than finetuning, allows +users to provide natural language specifications, and its overhead scales +naturally with model size. + +
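The mechanism, caching activations for a contrastive pair of prompts, taking their difference, and adding a scaled copy back in during later forward passes, can be demonstrated on a toy network. The sketch below uses a small randomly initialized MLP instead of GPT-2, so it only illustrates the plumbing (the layer choice and coefficient are arbitrary), not the paper's setup or results.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Sequential(nn.Linear(8, 32), nn.ReLU(),
                      nn.Linear(32, 32), nn.ReLU(),
                      nn.Linear(32, 4))
layer = model[2]  # the layer whose activations we steer

def capture(module, inp, out, store):
    store.append(out.detach())

# 1) Record activations for a contrastive pair of "prompts" (toy feature vectors here).
acts = []
h = layer.register_forward_hook(lambda m, i, o: capture(m, i, o, acts))
x_plus, x_minus = torch.randn(1, 8), torch.randn(1, 8)
model(x_plus); model(x_minus)
h.remove()
steering = acts[0] - acts[1]  # the "activation addition" vector

# 2) Add the steering vector during subsequent forward passes.
coeff = 3.0
h = layer.register_forward_hook(lambda m, i, o: o + coeff * steering)
x = torch.randn(1, 8)
steered = model(x)
h.remove()
print(steered - model(x))  # nonzero: the added vector changed the output
```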
+
+
+
+
+ + ♻ ☆ Mapping the landscape of histomorphological cancer phenotypes using + self-supervised learning on unlabeled, unannotated pathology slides + + +
+ Definitive cancer diagnosis and management depend upon the extraction of
+information from microscopy images by pathologists. These images contain
+complex information requiring time-consuming expert human interpretation that
+is prone to human bias. Supervised deep learning approaches have proven
+powerful for classification tasks, but they are inherently limited by the cost
+and quality of annotations used for training these models. To address this
+limitation of supervised methods, we developed Histomorphological Phenotype
+Learning (HPL), a fully self-supervised methodology that requires no expert
+labels or annotations and operates via the automatic discovery of
+discriminatory image features in small image tiles. Tiles are grouped into
+morphologically similar clusters which constitute a library of
+histomorphological phenotypes, revealing trajectories from benign to malignant
+tissue via inflammatory and reactive phenotypes. These clusters have distinct
+features which can be identified using orthogonal methods, linking histologic,
+molecular and clinical phenotypes. Applied to lung cancer tissues, we show that
+they align closely with patient survival, with histopathologically recognised
+tumor types and growth patterns, and with transcriptomic measures of
+immunophenotype. We then demonstrate that these properties are maintained in a
+multi-cancer study. These results show the clusters represent recurrent host
+responses and modes of tumor growth emerging under natural selection. Code,
+pre-trained models, learned embeddings, and documentation are available to the
+community at
+https://github.com/AdalbertoCq/Histomorphological-Phenotype-Learning
+
+
+
+
+ + ♻ ☆ Causal Policy Gradient for Whole-Body Mobile Manipulation + + +
+ Developing the next generation of household robot helpers requires combining +locomotion and interaction capabilities, which is generally referred to as +mobile manipulation (MoMa). MoMa tasks are difficult due to the large action +space of the robot and the common multi-objective nature of the task, e.g., +efficiently reaching a goal while avoiding obstacles. Current approaches often +segregate tasks into navigation without manipulation and stationary +manipulation without locomotion by manually matching parts of the action space +to MoMa sub-objectives (e.g. base actions for locomotion objectives and arm +actions for manipulation). This solution prevents simultaneous combinations of +locomotion and interaction degrees of freedom and requires human domain +knowledge for both partitioning the action space and matching the action parts +to the sub-objectives. In this paper, we introduce Causal MoMa, a new framework +to train policies for typical MoMa tasks that makes use of the most favorable +subspace of the robot's action space to address each sub-objective. Causal MoMa +automatically discovers the causal dependencies between actions and terms of +the reward function and exploits these dependencies in a causal policy learning +procedure that reduces gradient variance compared to previous state-of-the-art +policy gradient algorithms, improving convergence and results. We evaluate the +performance of Causal MoMa on three types of simulated robots across different +MoMa tasks and demonstrate success in transferring the policies trained in +simulation directly to a real robot, where our agent is able to follow moving +goals and react to dynamic obstacles while simultaneously and synergistically +controlling the whole-body: base, arm, and head. More information at +https://sites.google.com/view/causal-moma. + +
+
+
+
+
+ + ♻ ☆ Graph Structural Residuals: A Learning Approach to Diagnosis + + +
+ Traditional model-based diagnosis relies on constructing explicit system
+models, a process that can be laborious and expertise-demanding. In this paper,
+we propose a novel framework that combines concepts of model-based diagnosis
+with deep graph structure learning. This data-driven approach leverages data to
+learn the system's underlying structure and provide dynamic observations,
+represented by two distinct graph adjacency matrices. Our work facilitates a
+seamless integration of graph structure learning with model-based diagnosis by
+making three main contributions: (i) redefining the constructs of system
+representation, observations, and faults, (ii) introducing two distinct
+versions of a self-supervised graph structure learning model architecture, and
+(iii) demonstrating the potential of our data-driven diagnostic method through
+experiments on a system of coupled oscillators.
+
+
+
+
+ + ♻ ☆ Euler Characteristic Tools For Topological Data Analysis + + +
+ In this article, we study Euler characteristic techniques in topological data
+analysis. Computing the Euler characteristic pointwise over a family of
+simplicial complexes built from data gives rise to the so-called Euler
+characteristic profile. We show that this simple descriptor achieves
+state-of-the-art performance in supervised tasks at a very low computational
+cost. Inspired by signal analysis, we compute hybrid transforms of Euler
+characteristic profiles. These integral transforms mix Euler characteristic
+techniques with Lebesgue integration to provide highly efficient compressors of
+topological signals. As a consequence, they show remarkable performance in
+unsupervised settings. On the qualitative side, we provide numerous heuristics
+on the topological and geometric information captured by Euler profiles and
+their hybrid transforms. Finally, we prove stability results for these
+descriptors as well as asymptotic guarantees in random settings.
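An Euler characteristic profile is the alternating sum of simplex counts as a function of the filtration value. Below is a self-contained toy computation on a filtered triangle (the complex and grid are made up for illustration):

```python
import numpy as np

def euler_characteristic_profile(filtered_simplices, grid):
    """filtered_simplices: list of (simplex, filtration_value) pairs, where a simplex
    is a tuple of vertex ids. Returns chi(t) evaluated at each value in `grid`."""
    contrib = [((-1) ** (len(s) - 1), t) for s, t in filtered_simplices]
    return np.array([sum(sign for sign, t in contrib if t <= x) for x in grid])

# A filtered triangle: three vertices appear first, then the edges, then the 2-cell.
complex_ = [((0,), 0.0), ((1,), 0.0), ((2,), 0.1),
            ((0, 1), 0.2), ((1, 2), 0.3), ((0, 2), 0.4), ((0, 1, 2), 0.5)]
grid = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
print(euler_characteristic_profile(complex_, grid))  # [2 3 2 1 0 1 1]
```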
+
+ comment: 39 pages +
+
+
+
+
+ + ♻ ☆ STEm-Seg: Spatio-temporal Embeddings for Instance Segmentation in Videos ECCV 2020 + + +
+ Existing methods for instance segmentation in videos typically involve +multi-stage pipelines that follow the tracking-by-detection paradigm and model +a video clip as a sequence of images. Multiple networks are used to detect +objects in individual frames, and then associate these detections over time. +Hence, these methods are often non-end-to-end trainable and highly tailored to +specific tasks. In this paper, we propose a different approach that is +well-suited to a variety of tasks involving instance segmentation in videos. In +particular, we model a video clip as a single 3D spatio-temporal volume, and +propose a novel approach that segments and tracks instances across space and +time in a single stage. Our problem formulation is centered around the idea of +spatio-temporal embeddings which are trained to cluster pixels belonging to a +specific object instance over an entire video clip. To this end, we introduce +(i) novel mixing functions that enhance the feature representation of +spatio-temporal embeddings, and (ii) a single-stage, proposal-free network that +can reason about temporal context. Our network is trained end-to-end to learn +spatio-temporal embeddings as well as parameters required to cluster these +embeddings, thus simplifying inference. Our method achieves state-of-the-art +results across multiple datasets and tasks. Code and models are available at +https://github.com/sabarim/STEm-Seg. + +
+
+ comment: ECCV 2020 28 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multi-granulariy Time-based Transformer for Knowledge Tracing + + +
+ In this paper, we present a transformer architecture for predicting student
+performance on standardized tests. Specifically, we leverage students'
+historical data, including their past test scores, study habits, and other
+relevant information, to create a personalized model for each student. We then
+use these models to predict their future performance on a given test. Applying
+this model to the RIIID dataset, we demonstrate that using multiple
+granularities for temporal features as the decoder input significantly improves
+model performance. Our results also show the effectiveness of our approach,
+with substantial improvements over the LightGBM method. Our work contributes to
+the growing field of AI in education, providing a scalable and accurate tool
+for predicting student outcomes.
+
+
+
+
+ + ♻ ☆ CLIPAG: Towards Generator-Free Text-to-Image Generation + + +
+ Perceptually Aligned Gradients (PAG) refer to an intriguing property observed +in robust image classification models, wherein their input gradients align with +human perception and pose semantic meanings. While this phenomenon has gained +significant research attention, it was solely studied in the context of +unimodal vision-only architectures. In this work, we extend the study of PAG to +Vision-Language architectures, which form the foundations for diverse +image-text tasks and applications. Through an adversarial robustification +finetuning of CLIP, we demonstrate that robust Vision-Language models exhibit +PAG in contrast to their vanilla counterparts. This work reveals the merits of +CLIP with PAG (CLIPAG) in several vision-language generative tasks. Notably, we +show that seamlessly integrating CLIPAG in a "plug-n-play" manner leads to +substantial improvements in vision-language generative applications. +Furthermore, leveraging its PAG property, CLIPAG enables text-to-image +generation without any generative model, which typically requires huge +generators. + +
+
+
+
+
+ + ♻ ☆ Materials Informatics Transformer: A Language Model for Interpretable + Materials Properties Prediction + + +
+ Recently, the remarkable capabilities of large language models (LLMs) have +been illustrated across a variety of research domains such as natural language +processing, computer vision, and molecular modeling. We extend this paradigm by +utilizing LLMs for material property prediction by introducing our model +Materials Informatics Transformer (MatInFormer). Specifically, we introduce a +novel approach that involves learning the grammar of crystallography through +the tokenization of pertinent space group information. We further illustrate +the adaptability of MatInFormer by incorporating task-specific data pertaining +to Metal-Organic Frameworks (MOFs). Through attention visualization, we uncover +the key features that the model prioritizes during property prediction. The +effectiveness of our proposed model is empirically validated across 14 distinct +datasets, hereby underscoring its potential for high throughput screening +through accurate material property prediction. + +
+
+
+
+
+ + ♻ ☆ Diversified Ensemble of Independent Sub-Networks for Robust + Self-Supervised Representation Learning + + +
+ Ensembling a neural network is a widely recognized approach to enhance model +performance, estimate uncertainty, and improve robustness in deep supervised +learning. However, deep ensembles often come with high computational costs and +memory demands. In addition, the efficiency of a deep ensemble is related to +diversity among the ensemble members which is challenging for large, +over-parameterized deep neural networks. Moreover, ensemble learning has not +yet seen such widespread adoption, and it remains a challenging endeavor for +self-supervised or unsupervised representation learning. Motivated by these +challenges, we present a novel self-supervised training regime that leverages +an ensemble of independent sub-networks, complemented by a new loss function +designed to encourage diversity. Our method efficiently builds a sub-model +ensemble with high diversity, leading to well-calibrated estimates of model +uncertainty, all achieved with minimal computational overhead compared to +traditional deep self-supervised ensembles. To evaluate the effectiveness of +our approach, we conducted extensive experiments across various tasks, +including in-distribution generalization, out-of-distribution detection, +dataset corruption, and semi-supervised settings. The results demonstrate that +our method significantly improves prediction reliability. Our approach not only +achieves excellent accuracy but also enhances calibration, surpassing baseline +performance across a wide range of self-supervised architectures in computer +vision, natural language processing, and genomics data. + +
+
+
+
+
+ + ♻ ☆ Communication-Efficient Distributed Deep Learning: A Comprehensive + Survey + + +
+ Distributed deep learning (DL) has become prevalent in recent years to reduce +training time by leveraging multiple computing devices (e.g., GPUs/TPUs) due to +larger models and datasets. However, system scalability is limited by +communication becoming the performance bottleneck. Addressing this +communication issue has become a prominent research topic. In this paper, we +provide a comprehensive survey of the communication-efficient distributed +training algorithms, focusing on both system-level and algorithmic-level +optimizations. We first propose a taxonomy of data-parallel distributed +training algorithms that incorporates four primary dimensions: communication +synchronization, system architectures, compression techniques, and parallelism +of communication and computing tasks. We then investigate state-of-the-art +studies that address problems in these four dimensions. We also compare the +convergence rates of different algorithms to understand their convergence +speed. Additionally, we conduct extensive experiments to empirically compare +the convergence performance of various mainstream distributed training +algorithms. Based on our system-level communication cost analysis, theoretical +and experimental convergence speed comparison, we provide readers with an +understanding of which algorithms are more efficient under specific distributed +environments. Our research also extrapolates potential directions for further +optimizations. + +
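Among the compression techniques the survey covers, top-k gradient sparsification is a representative example. Here is a minimal sender-side sketch (keep the k largest-magnitude entries and track the residual that error-feedback variants would add back to the next gradient); the helper names and toy gradient are illustrative, not drawn from any particular framework.

```python
import numpy as np

def topk_sparsify(grad, k):
    """Keep the k largest-magnitude gradient entries; return (indices, values) to send
    plus the residual that error-feedback schemes would reuse locally."""
    flat = grad.ravel()
    idx = np.argpartition(np.abs(flat), -k)[-k:]
    values = flat[idx]
    residual = flat.copy()
    residual[idx] = 0.0
    return idx, values, residual.reshape(grad.shape)

def densify(idx, values, shape):
    """Receiver side: rebuild a dense gradient from the sparse message."""
    out = np.zeros(np.prod(shape))
    out[idx] = values
    return out.reshape(shape)

g = np.random.default_rng(0).normal(size=(4, 5))
idx, vals, resid = topk_sparsify(g, k=3)
print(densify(idx, vals, g.shape))
```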
+
+
+
+
+ + ♻ ☆ Topology-aware Tensor Decomposition for Meta-graph Learning + + +
+ Heterogeneous graphs generally refer to graphs with different types of nodes
+and edges. A common approach for extracting useful information from
+heterogeneous graphs is to use meta-graphs, which can be seen as a special kind
+of directed acyclic graph (DAG) with the same node and edge types as the
+heterogeneous graph. However, how to design proper meta-graphs is challenging.
+Recently, there have been many works on learning suitable meta-graphs from a
+heterogeneous graph. Existing methods generally introduce continuous weights
+for edges that are independent of each other, which ignores the topological
+structure of meta-graphs and can be ineffective. To address this issue, we
+propose a new tensor-based viewpoint on learning meta-graphs. Such a viewpoint
+not only helps interpret the limitations of existing works through
+CANDECOMP/PARAFAC (CP) decomposition, but also inspires us to propose a
+topology-aware tensor decomposition, called TENSUS, that reflects the structure
+of DAGs. The proposed topology-aware tensor decomposition is easy to use and
+simple to implement, and it can be taken as a plug-in component to upgrade many
+existing works, including node classification and recommendation on
+heterogeneous graphs. Experimental results on different tasks demonstrate that
+the proposed method can significantly improve the state of the art for all
+these tasks.
+
+
+
+
+ + ♻ ☆ A Comprehensive Empirical Evaluation on Online Continual Learning ICCV + + +
+ Online continual learning aims to get closer to a live learning experience by +learning directly on a stream of data with temporally shifting distribution and +by storing a minimum amount of data from that stream. In this empirical +evaluation, we evaluate various methods from the literature that tackle online +continual learning. More specifically, we focus on the class-incremental +setting in the context of image classification, where the learner must learn +new classes incrementally from a stream of data. We compare these methods on +the Split-CIFAR100 and Split-TinyImagenet benchmarks, and measure their average +accuracy, forgetting, stability, and quality of the representations, to +evaluate various aspects of the algorithm at the end but also during the whole +training period. We find that most methods suffer from stability and +underfitting issues. However, the learned representations are comparable to +i.i.d. training under the same computational budget. No clear winner emerges +from the results and basic experience replay, when properly tuned and +implemented, is a very strong baseline. We release our modular and extensible +codebase at https://github.com/AlbinSou/ocl_survey based on the avalanche +framework to reproduce our results and encourage future research. + +
+
+ comment: ICCV Visual Continual Learning Workshop 2023 accepted paper +
+
+
+
+
+ + ♻ ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ♻ ☆ LaserMix for Semi-Supervised LiDAR Semantic Segmentation CVPR 2023 + + +
+ Densely annotating LiDAR point clouds is costly, which restrains the +scalability of fully-supervised learning methods. In this work, we study the +underexplored semi-supervised learning (SSL) in LiDAR segmentation. Our core +idea is to leverage the strong spatial cues of LiDAR point clouds to better +exploit unlabeled data. We propose LaserMix to mix laser beams from different +LiDAR scans, and then encourage the model to make consistent and confident +predictions before and after mixing. Our framework has three appealing +properties: 1) Generic: LaserMix is agnostic to LiDAR representations (e.g., +range view and voxel), and hence our SSL framework can be universally applied. +2) Statistically grounded: We provide a detailed analysis to theoretically +explain the applicability of the proposed framework. 3) Effective: +Comprehensive experimental analysis on popular LiDAR segmentation datasets +(nuScenes, SemanticKITTI, and ScribbleKITTI) demonstrates our effectiveness and +superiority. Notably, we achieve competitive results over fully-supervised +counterparts with 2x to 5x fewer labels and improve the supervised-only +baseline significantly by 10.8% on average. We hope this concise yet +high-performing framework could facilitate future research in semi-supervised +LiDAR segmentation. Code is publicly available. + +
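+ As a rough illustration of the mixing idea described above (a simplification,
+not the authors' implementation; the number of strips and the alternating
+assignment are assumptions), the sketch below swaps inclination-angle strips
+between two labelled scans:
+
+    import numpy as np
+
+    def lasermix_points(points_a, labels_a, points_b, labels_b, n_strips=6):
+        """Mix two LiDAR scans by alternating inclination-angle strips.
+
+        points_*: (N, 3) arrays of x, y, z; labels_*: (N,) label arrays.
+        Even strips are kept from scan A and odd strips from scan B.
+        """
+        def strip_index(points):
+            inclination = np.arctan2(points[:, 2],
+                                     np.linalg.norm(points[:, :2], axis=1))
+            lo, hi = inclination.min(), inclination.max()
+            bins = (inclination - lo) / (hi - lo + 1e-9) * n_strips
+            return np.clip(bins.astype(int), 0, n_strips - 1)
+
+        keep_a = strip_index(points_a) % 2 == 0
+        keep_b = strip_index(points_b) % 2 == 1
+        mixed_points = np.concatenate([points_a[keep_a], points_b[keep_b]])
+        mixed_labels = np.concatenate([labels_a[keep_a], labels_b[keep_b]])
+        return mixed_points, mixed_labels
+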
+
+ comment: CVPR 2023 (Highlight); 27 pages, 11 figures, 12 tables; Code at + https://github.com/ldkong1205/LaserMix +
+
+
+
+
+ + ♻ ☆ A Penalty-Based Method for Communication-Efficient Decentralized Bilevel + Programming + + +
+ Bilevel programming has recently received attention in the literature, due to +its wide range of applications, including reinforcement learning and +hyper-parameter optimization. However, it is widely assumed that the underlying +bilevel optimization problem is solved either by a single machine or in the +case of multiple machines connected in a star-shaped network, i.e., federated +learning setting. The latter approach suffers from a high communication cost on +the central node (e.g., parameter server) and exhibits privacy vulnerabilities. +Hence, it is of interest to develop methods that solve bilevel optimization +problems in a communication-efficient decentralized manner. To that end, this +paper introduces a penalty function based decentralized algorithm with +theoretical guarantees for this class of optimization problems. Specifically, a +distributed alternating gradient-type algorithm for solving consensus bilevel +programming over a decentralized network is developed. A key feature of the +proposed algorithm is to estimate the hyper-gradient of the penalty function +via decentralized computation of matrix-vector products and few vector +communications, which is then integrated within an alternating algorithm to +obtain finite-time convergence analysis under different convexity assumptions. +Our theoretical result highlights improvements in the iteration complexity of +decentralized bilevel optimization, all while making efficient use of vector +communication. Empirical results on both synthetic and real datasets +demonstrate that the proposed method performs well in real-world settings. + +
+
+
+
+
+ + ♻ ☆ Neural Augmented Kalman Filtering with Bollinger Bands for Pairs Trading + + +
+ Pairs trading is a family of trading techniques that determine their policies
+based on monitoring the relationships between pairs of assets. A common pairs
+trading approach relies on describing the pair-wise relationship as a linear
+state space (SS) model with Gaussian noise. This representation facilitates
+extracting financial indicators with low complexity and latency using a Kalman
+Filter (KF), which are then processed using classic policies such as Bollinger
+Bands (BB). However, such SS models are inherently approximated and mismatched,
+often degrading the revenue. In this work, we propose KalmanNet-aided Bollinger
+Bands Pairs Trading (KBPT), a deep learning-aided policy that augments the
+operation of KF-aided BB trading. KBPT is designed by formulating an extended
+SS model for pairs trading that approximates their relationship as holding
+partial co-integration. This SS model is utilized by a trading policy that
+augments KF-BB trading with a dedicated neural network based on the KalmanNet
+architecture. The resulting KBPT is trained in a two-stage manner that first
+tunes the tracking algorithm in an unsupervised manner independently of the
+trading task, followed by its adaptation to track the financial indicators to
+maximize revenue while approximating BB with a differentiable mapping. KBPT
+thus leverages data to overcome the approximated nature of the SS model,
+converting the KF-BB policy into a trainable model. We empirically demonstrate
+that our proposed KBPT systematically yields improved revenue compared with
+model-based and data-driven benchmarks across various assets.
+
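+ For context, here is a minimal sketch of the model-based KF plus Bollinger
+Bands baseline that KBPT augments; this is the classical policy, not KBPT
+itself, and the noise variances, window length, and band width are assumptions.
+
+    import numpy as np
+
+    def kf_bb_signals(spread, q=1e-4, r=1e-2, window=20, k=2.0):
+        """Kalman-filter-aided Bollinger-Band signals on a pairs spread.
+
+        A 1D random-walk Kalman filter tracks the spread's local mean; a
+        position is suggested whenever the residual leaves the +/- k-sigma band.
+        Returns +1 (long spread), -1 (short spread), or 0 per time step.
+        """
+        mean_est, p = float(spread[0]), 1.0
+        residuals, signals = [], []
+        for z in spread:
+            p = p + q                       # predict: random-walk state
+            gain = p / (p + r)              # update with the observed spread
+            mean_est = mean_est + gain * (z - mean_est)
+            p = (1.0 - gain) * p
+            residuals.append(z - mean_est)
+            sigma = np.std(residuals[-window:]) + 1e-9
+            if residuals[-1] > k * sigma:
+                signals.append(-1)          # spread too high: short it
+            elif residuals[-1] < -k * sigma:
+                signals.append(+1)          # spread too low: long it
+            else:
+                signals.append(0)
+        return np.array(signals)
+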
+
+ comment: Submitted to Transactions on Signal Processing +
+
+
+
+
+ + ♻ ☆ Almost-Orthogonal Layers for Efficient General-Purpose Lipschitz + Networks + + +
+ It is a highly desirable property for deep networks to be robust against
+small input changes. One popular way to achieve this property is by designing
+networks with a small Lipschitz constant. In this work, we propose a new
+technique for constructing such Lipschitz networks that has a number of
+desirable properties: it can be applied to any linear network layer
+(fully-connected or convolutional), it provides formal guarantees on the
+Lipschitz constant, it is easy to implement and efficient to run, and it can be
+combined with any training objective and optimization method. In fact, our
+technique is the first one in the literature that achieves all of these
+properties simultaneously. Our main contribution is a rescaling-based weight
+matrix parametrization that guarantees each network layer has a Lipschitz
+constant of at most 1 and results in learned weight matrices that are close to
+orthogonal. Hence we call such layers almost-orthogonal Lipschitz (AOL).
+Experiments and ablation studies in the context of image classification with
+certified robust accuracy confirm that AOL layers achieve results that are on
+par with most existing methods. Yet, they are simpler to implement and more
+broadly applicable, because they do not require computationally expensive
+matrix orthogonalization or inversion steps as part of the network
+architecture. We provide code at https://github.com/berndprach/AOL.
+
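+ A small sketch of the rescaling idea as described in the abstract; treat it
+as an interpretation rather than the authors' code, with layer shapes,
+initialization, and the on-the-fly application being assumptions. Each raw
+weight matrix is rescaled column-wise so the resulting linear map has a
+Lipschitz constant of at most 1.
+
+    import torch
+
+    def aol_rescale(weight: torch.Tensor) -> torch.Tensor:
+        """Rescale columns of `weight` (shape in_features x out_features)."""
+        wtw_abs = (weight.t() @ weight).abs()
+        d = wtw_abs.sum(dim=0).clamp(min=1e-12).rsqrt()
+        return weight * d  # broadcasts the per-column factors
+
+    class AOLLinear(torch.nn.Module):
+        def __init__(self, in_features: int, out_features: int):
+            super().__init__()
+            self.raw = torch.nn.Parameter(0.1 * torch.randn(in_features, out_features))
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            # Rescaling is applied on the fly so the bound holds at every step.
+            return x @ aol_rescale(self.raw)
+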
+
+ comment: - Corrected the results from competitor ECO. - Corrected a typo in + the loss function equation +
+
+
+
+
+ + ♻ ☆ Domain-Agnostic Molecular Generation with Self-feedback + + +
+ The generation of molecules with desired properties has gained tremendous +popularity, revolutionizing the way scientists design molecular structures and +providing valuable support for chemical and drug design. However, despite the +potential of language models in molecule generation, they face numerous +challenges such as the generation of syntactically or chemically flawed +molecules, narrow domain focus, and limitations in creating diverse and +directionally feasible molecules due to a dearth of annotated data or external +molecular databases. To this end, we introduce MolGen, a pre-trained molecular +language model tailored specifically for molecule generation. MolGen acquires +intrinsic structural and grammatical insights by reconstructing over 100 +million molecular SELFIES, while facilitating knowledge transfer between +different domains through domain-agnostic molecular prefix tuning. Moreover, we +present a self-feedback paradigm that inspires the pre-trained model to align +with the ultimate goal of producing molecules with desirable properties. +Extensive experiments on well-known benchmarks confirm MolGen's optimization +capabilities, encompassing penalized logP, QED, and molecular docking +properties. Further analysis shows that MolGen can accurately capture molecule +distributions, implicitly learn their structural characteristics, and +efficiently explore chemical space. The pre-trained model, codes, and datasets +are publicly available for future research at https://github.com/zjunlp/MolGen. + +
+
+ comment: Work in progress. Add results of binding affinity +
+
+
+
+
+ + ♻ ☆ Interpretable Outlier Summarization + + +
+ Outlier detection is critical in real applications to prevent financial
+fraud, defend against network intrusions, or detect imminent device failures.
+To reduce the human effort in evaluating outlier detection results and
+effectively turn the outliers into actionable insights, users often expect a
+system to automatically produce interpretable summarizations of subgroups of
+outlier detection results. Unfortunately, to date no such systems exist. To
+fill this gap, we propose STAIR, which learns a compact set of
+human-understandable rules to summarize and explain the anomaly detection
+results. Rather than using classical decision tree algorithms to produce these
+rules, STAIR proposes a new optimization objective to produce a small number of
+rules with the least complexity, hence strong interpretability, to accurately
+summarize the detection results. The learning algorithm of STAIR produces a
+rule set by iteratively splitting the large rules and is optimal in maximizing
+this objective in each iteration. Moreover, to effectively handle
+high-dimensional, highly complex data sets which are hard to summarize with
+simple rules, we propose a localized STAIR approach, called L-STAIR. Taking
+data locality into consideration, it simultaneously partitions data and learns
+a set of localized rules for each partition. Our experimental study on many
+outlier benchmark datasets shows that STAIR significantly reduces the
+complexity of the rules required to summarize the outlier detection results,
+making them more amenable for humans to understand and evaluate than those
+produced by decision tree methods.
+
+
+
+
+
+ + ♻ ☆ Simulation comparisons between Bayesian and de-biased estimators in + low-rank matrix completion + + +
+ In this paper, we study the low-rank matrix completion problem, a class of
+machine learning problems that aims at predicting missing entries in a
+partially observed matrix. Such problems appear in several challenging
+applications such as collaborative filtering, image processing, and genotype
+imputation. We compare Bayesian approaches with a recently introduced de-biased
+estimator, which provides a useful way to build confidence intervals of
+interest. From a theoretical viewpoint, the de-biased estimator comes with a
+sharp minimax-optimal rate of estimation error whereas the Bayesian approach
+reaches this rate with an additional logarithmic factor. Our simulation studies
+show the interesting result that the de-biased estimator is just as good as the
+Bayesian estimators. Moreover, Bayesian approaches are much more stable and can
+outperform the de-biased estimator in the case of small samples. In addition,
+we also find that the empirical coverage rate of the confidence intervals
+obtained by the de-biased estimator for an entry is clearly lower than that of
+the considered credible interval. These results suggest further theoretical
+studies on the estimation error and the concentration of Bayesian methods, as
+such studies are quite limited at present.
+
+
+
+
+
+ + ♻ ☆ Robust Networked Federated Learning for Localization + + +
+ This paper addresses the problem of localization, which is inherently +non-convex and non-smooth in a federated setting where the data is distributed +across a multitude of devices. Due to the decentralized nature of federated +environments, distributed learning becomes essential for scalability and +adaptability. Moreover, these environments are often plagued by outlier data, +which presents substantial challenges to conventional methods, particularly in +maintaining estimation accuracy and ensuring algorithm convergence. To mitigate +these challenges, we propose a method that adopts an $L_1$-norm robust +formulation within a distributed sub-gradient framework, explicitly designed to +handle these obstacles. Our approach addresses the problem in its original +form, without resorting to iterative simplifications or approximations, +resulting in enhanced computational efficiency and improved estimation +accuracy. We demonstrate that our method converges to a stationary point, +highlighting its effectiveness and reliability. Through numerical simulations, +we confirm the superior performance of our approach, notably in outlier-rich +environments, which surpasses existing state-of-the-art localization methods. + +
+
+
+
+
+ + ♻ ☆ Irregular Traffic Time Series Forecasting Based on Asynchronous + Spatio-Temporal Graph Convolutional Network + + +
+ Accurate traffic forecasting at intersections governed by intelligent traffic +signals is critical for the advancement of an effective intelligent traffic +signal control system. However, due to the irregular traffic time series +produced by intelligent intersections, the traffic forecasting task becomes +much more intractable and imposes three major new challenges: 1) asynchronous +spatial dependency, 2) irregular temporal dependency among traffic data, and 3) +variable-length sequence to be predicted, which severely impede the performance +of current traffic forecasting methods. To this end, we propose an Asynchronous +Spatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the traffic +states of the lanes entering intelligent intersections in a future time window. +Specifically, by linking lanes via a traffic diffusion graph, we first propose +an Asynchronous Graph Diffusion Network to model the asynchronous spatial +dependency between the time-misaligned traffic state measurements of lanes. +After that, to capture the temporal dependency within irregular traffic state +sequence, a learnable personalized time encoding is devised to embed the +continuous time for each lane. Then we propose a Transformable Time-aware +Convolution Network that learns meta-filters to derive time-aware convolution +filters with transformable filter sizes for efficient temporal convolution on +the irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network +consisting of a state evolution unit and a semiautoregressive predictor is +designed to effectively and efficiently predict variable-length traffic state +sequences. Extensive experiments on two real-world datasets demonstrate the +effectiveness of ASeer in six metrics. + +
+
+
+
+
+ + ♻ ☆ Test Time Embedding Normalization for Popularity Bias Mitigation CIKM 2023 + + +
+ Popularity bias is a widespread problem in the field of recommender systems,
+where popular items tend to dominate recommendation results. In this work, we
+propose 'Test Time Embedding Normalization' as a simple yet effective strategy
+for mitigating popularity bias, which surpasses the performance of previous
+mitigation approaches by a significant margin. Our approach utilizes the
+normalized item embedding during the inference stage to control the influence
+of embedding magnitude, which is highly correlated with item popularity.
+Through extensive experiments, we show that our method combined with the
+sampled softmax loss effectively reduces popularity bias compared to previous
+approaches for bias mitigation. We further investigate the relationship between
+user and item embeddings and find that the angular similarity between
+embeddings distinguishes preferable and non-preferable items regardless of
+their popularity. The analysis explains the mechanism behind the success of our
+approach in eliminating the impact of popularity bias. Our code is available at
+https://github.com/ml-postech/TTEN.
+
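+ The inference-time change described above is small enough to sketch directly;
+a hedged illustration (tensor names and shapes are assumptions) of scoring with
+L2-normalized item embeddings while leaving training untouched:
+
+    import torch
+    import torch.nn.functional as F
+
+    def tten_scores(user_emb: torch.Tensor, item_emb: torch.Tensor) -> torch.Tensor:
+        """Relevance scores with item embeddings normalized at test time.
+
+        Normalization removes the magnitude component of item embeddings,
+        which the paper reports is highly correlated with item popularity.
+        user_emb: (n_users, d); item_emb: (n_items, d).
+        """
+        item_dir = F.normalize(item_emb, dim=-1)
+        return user_emb @ item_dir.t()  # (n_users, n_items)
+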
+
+ comment: 5 pages, CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Improving Differentiable Architecture Search via Self-Distillation + + +
+ Differentiable Architecture Search (DARTS) is a simple yet efficient Neural +Architecture Search (NAS) method. During the search stage, DARTS trains a +supernet by jointly optimizing architecture parameters and network parameters. +During the evaluation stage, DARTS discretizes the supernet to derive the +optimal architecture based on architecture parameters. However, recent research +has shown that during the training process, the supernet tends to converge +towards sharp minima rather than flat minima. This is evidenced by the higher +sharpness of the loss landscape of the supernet, which ultimately leads to a +performance gap between the supernet and the optimal architecture. In this +paper, we propose Self-Distillation Differentiable Neural Architecture Search +(SD-DARTS) to alleviate the discretization gap. We utilize self-distillation to +distill knowledge from previous steps of the supernet to guide its training in +the current step, effectively reducing the sharpness of the supernet's loss and +bridging the performance gap between the supernet and the optimal architecture. +Furthermore, we introduce the concept of voting teachers, where multiple +previous supernets are selected as teachers, and their output probabilities are +aggregated through voting to obtain the final teacher prediction. Experimental +results on real datasets demonstrate the advantages of our novel +self-distillation-based NAS method compared to state-of-the-art alternatives. + +
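+ A minimal sketch of the voting-teacher self-distillation term as described in
+the abstract; the snapshot count, temperature, and loss weight are assumptions,
+and the architecture-parameter updates of the actual DARTS loop are omitted.
+
+    import collections
+    import copy
+    import torch
+    import torch.nn.functional as F
+
+    class VotingTeacherDistiller:
+        """Distill the supernet from snapshots of its own previous steps."""
+
+        def __init__(self, k: int = 3, temperature: float = 2.0, weight: float = 0.5):
+            self.snapshots = collections.deque(maxlen=k)
+            self.t, self.weight = temperature, weight
+
+        def loss(self, model: torch.nn.Module, x: torch.Tensor, targets: torch.Tensor):
+            logits = model(x)
+            total = F.cross_entropy(logits, targets)
+            if self.snapshots:
+                with torch.no_grad():
+                    votes = [F.softmax(m(x) / self.t, dim=1) for m in self.snapshots]
+                    teacher = torch.stack(votes).mean(dim=0)  # "voting" teacher
+                kl = F.kl_div(F.log_softmax(logits / self.t, dim=1), teacher,
+                              reduction="batchmean") * self.t ** 2
+                total = total + self.weight * kl
+            self.snapshots.append(copy.deepcopy(model).eval())
+            return total
+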
+
+ comment: Accepted by Neural Networks +
+
+
+
+
+ + ♻ ☆ Identifying Generalized Neural Representation Across Hamiltonian + Manifolds via Meta-learning + + +
+ Recent advancements in deep learning for physics have focused on discovering +shared representations of target systems by incorporating physics priors or +inductive biases into neural networks. However, these approaches are +system-specific and do not allow for easy adaptation to new physical systems +governed by different laws. For example, a neural network trained on a +mass-spring system cannot accurately predict the behavior of a two-body system +or any other system with different governing physics. In this work, we model +our system with a graph neural network and employ a meta-learning algorithm to +enable the model to gain experience over a distribution of tasks and make it +adapt to new physics. Our approach aims to learn a general representation +across the various Hamiltonian manifolds, which is a common feature of the data +distribution of Hamiltonian systems. We train our model using a dataset of +different physical systems, each governed by its own inherent dynamics, and +evaluate its performance on a new type of dynamical system with unknown +physics. Our results show that the meta-trained model effectively adapts to the +new system, which was unseen during the meta-training phase. Furthermore, we +analyze the representation learned by the meta-trained neural network to +identify a generalizable representation of Hamilton's equation that is shared +across various physical systems. Our findings suggest that the meta-learned +model can capture the generalizable representation across Hamiltonian manifolds +inherent in dynamical systems. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Improving 3D Imaging with Pre-Trained Perpendicular 2D Diffusion Models ICCV23 + + +
+ Diffusion models have become a popular approach for image generation and +reconstruction due to their numerous advantages. However, most diffusion-based +inverse problem-solving methods only deal with 2D images, and even recently +published 3D methods do not fully exploit the 3D distribution prior. To address +this, we propose a novel approach using two perpendicular pre-trained 2D +diffusion models to solve the 3D inverse problem. By modeling the 3D data +distribution as a product of 2D distributions sliced in different directions, +our method effectively addresses the curse of dimensionality. Our experimental +results demonstrate that our method is highly effective for 3D medical image +reconstruction tasks, including MRI Z-axis super-resolution, compressed sensing +MRI, and sparse-view CT. Our method can generate high-quality voxel volumes +suitable for medical applications. + +
+
+ comment: ICCV23 poster. 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ FAM: fast adaptive federated meta-learning + + +
+ In this work, we propose a fast adaptive federated meta-learning (FAM)
+framework for collaboratively learning a single global model, which can then be
+personalized locally on individual clients. Federated learning enables multiple
+clients to collaborate to train a model without sharing data. Clients with
+insufficient data or data diversity participate in federated learning to learn
+a model with superior performance. Nonetheless, learning suffers when data
+distributions diverge. There is therefore a need to learn a global model that
+can be adapted using client-specific information to create personalized models
+on individual clients. MRI data suffers from this problem: one, due to data
+acquisition challenges, local data at a site is insufficient for training an
+accurate model; two, data sharing is restricted due to privacy concerns; and
+three, there is a need for personalization of a learnt shared global model on
+account of domain shift across client sites. The global model is sparse and
+captures the common features in the MRI. This skeleton network is grown on each
+client to train a personalized model by learning additional client-specific
+parameters from local data. Experimental results show that the personalization
+process at each client quickly converges using a limited number of epochs. The
+personalized client models outperformed the locally trained models,
+demonstrating the efficacy of the FAM mechanism. Additionally, the sparse
+parameter set to be communicated during federated learning drastically reduced
+communication overhead, which makes the scheme viable for networks with limited
+resources.
+
+
+ comment: 13 Pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Stochastic Configuration Machines for Industrial Artificial Intelligence + + +
+ Real-time predictive modelling with desired accuracy is highly expected in
+industrial artificial intelligence (IAI), where neural networks play a key
+role. Neural networks in IAI require powerful, high-performance computing
+devices to process large amounts of floating-point data. Based on stochastic
+configuration networks (SCNs), this paper proposes a new randomized learner
+model, termed stochastic configuration machines (SCMs), to emphasize effective
+modelling and data size saving, which are useful and valuable for industrial
+applications. Compared to SCNs and random vector functional-link (RVFL) nets
+with binarized implementation, the model storage of SCMs can be significantly
+compressed while retaining favourable prediction performance. Besides the
+architecture of the SCM learner model and its learning algorithm, as an
+important part of this contribution, we also provide a theoretical basis for
+the learning capacity of SCMs by analysing the model's complexity. Experimental
+studies are carried out over some benchmark datasets and three industrial
+applications. The results demonstrate that SCMs have great potential for
+dealing with industrial data analytics.
+
+
+ comment: 23 pages, 7 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Rational kernel-based interpolation for complex-valued frequency + response functions + + +
+ This work is concerned with the kernel-based approximation of a +complex-valued function from data, where the frequency response function of a +partial differential equation in the frequency domain is of particular +interest. In this setting, kernel methods are employed more and more +frequently, however, standard kernels do not perform well. Moreover, the role +and mathematical implications of the underlying pair of kernels, which arises +naturally in the complex-valued case, remain to be addressed. We introduce new +reproducing kernel Hilbert spaces of complex-valued functions, and formulate +the problem of complex-valued interpolation with a kernel pair as minimum norm +interpolation in these spaces. Moreover, we combine the interpolant with a +low-order rational function, where the order is adaptively selected based on a +new model selection criterion. Numerical results on examples from different +fields, including electromagnetics and acoustic examples, illustrate the +performance of the method, also in comparison to available rational +approximation methods. + +
+
+ comment: 26 pages main paper, 6 pages supplement +
+
+
+
+
+ + ♻ ☆ Calibrated Explanations for Regression + + +
+ Artificial Intelligence (AI) is often an integral part of modern decision +support systems (DSSs). The best-performing predictive models used in AI-based +DSSs lack transparency. Explainable Artificial Intelligence (XAI) aims to +create AI systems that can explain their rationale to human users. Local +explanations in XAI can provide information about the causes of individual +predictions in terms of feature importance. However, a critical drawback of +existing local explanation methods is their inability to quantify the +uncertainty associated with a feature's importance. This paper introduces an +extension of a feature importance explanation method, Calibrated Explanations +(CE), previously only supporting classification, with support for standard +regression and probabilistic regression, i.e., the probability that the target +is above an arbitrary threshold. The extension for regression keeps all the +benefits of CE, such as calibration of the prediction from the underlying model +with confidence intervals, uncertainty quantification of feature importance, +and allows both factual and counterfactual explanations. CE for standard +regression provides fast, reliable, stable, and robust explanations. CE for +probabilistic regression provides an entirely new way of creating probabilistic +explanations from any ordinary regression model and with a dynamic selection of +thresholds. The performance of CE for probabilistic regression regarding +stability and speed is comparable to LIME. The method is model agnostic with +easily understood conditional rules. An implementation in Python is freely +available on GitHub and for installation using pip making the results in this +paper easily replicable. + +
+
+ comment: 30 pages, 11 figures (replaced due to omitted author, which is the + only change made) +
+
+
+
+
+ + ♻ ☆ Fairness in Preference-based Reinforcement Learning ICML + + +
+ In this paper, we address the issue of fairness in preference-based +reinforcement learning (PbRL) in the presence of multiple objectives. The main +objective is to design control policies that can optimize multiple objectives +while treating each objective fairly. Toward this objective, we design a new +fairness-induced preference-based reinforcement learning or FPbRL. The main +idea of FPbRL is to learn vector reward functions associated with multiple +objectives via new welfare-based preferences rather than reward-based +preference in PbRL, coupled with policy learning via maximizing a generalized +Gini welfare function. Finally, we provide experiment studies on three +different environments to show that the proposed FPbRL approach can achieve +both efficiency and equity for learning effective and fair policies. + +
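+ To make the welfare objective concrete, here is a small sketch of a
+generalized Gini welfare function over per-objective returns; the geometric
+weight decay is an assumption and the paper's exact weights may differ.
+
+    import numpy as np
+
+    def generalized_gini_welfare(returns: np.ndarray, decay: float = 0.5) -> float:
+        """Weight the worst-off objectives most heavily.
+
+        Returns are sorted ascending and combined with decreasing weights, so
+        maximizing this value trades total return against fairness.
+        """
+        weights = decay ** np.arange(len(returns))  # w_1 >= w_2 >= ...
+        weights = weights / weights.sum()
+        return float(np.sort(returns) @ weights)
+
+    # A balanced policy scores higher than an unbalanced one with the same sum.
+    print(generalized_gini_welfare(np.array([5.0, 5.0])))  # 5.0
+    print(generalized_gini_welfare(np.array([9.0, 1.0])))  # about 3.67
+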
+
+ comment: Accepted to The Many Facets of Preference Learning Workshop at the + International Conference on Machine Learning (ICML) +
+
+
+
+
+ + ♻ ☆ Task Aware Dreamer for Task Generalization in Reinforcement Learning + + +
+ A long-standing goal of reinforcement learning is to acquire agents that can +learn on training tasks and generalize well on unseen tasks that may share a +similar dynamic but with different reward functions. A general challenge is to +quantitatively measure the similarities between these different tasks, which is +vital for analyzing the task distribution and further designing algorithms with +stronger generalization. To address this, we present a novel metric named Task +Distribution Relevance (TDR) via optimal Q functions of different tasks to +capture the relevance of the task distribution quantitatively. In the case of +tasks with a high TDR, i.e., the tasks differ significantly, we show that the +Markovian policies cannot differentiate them, leading to poor performance. +Based on this insight, we encode all historical information into policies for +distinguishing different tasks and propose Task Aware Dreamer (TAD), which +extends world models into our reward-informed world models to capture invariant +latent features over different tasks. In TAD, we calculate the corresponding +variational lower bound of the data log-likelihood, including a novel term to +distinguish different tasks via states, to optimize reward-informed world +models. Extensive experiments in both image-based control tasks and state-based +control tasks demonstrate that TAD can significantly improve the performance of +handling different tasks simultaneously, especially for those with high TDR, +and demonstrate a strong generalization ability to unseen tasks. + +
+
+
+
+
+ + ♻ ☆ FedDD: Toward Communication-efficient Federated Learning with + Differential Parameter Dropout + + +
+ Federated Learning (FL) requires frequent exchange of model parameters, which
+leads to long communication delays, especially when the network environments of
+clients vary greatly. Moreover, the parameter server needs to wait for the
+slowest client (i.e., straggler, which may have the largest model size, lowest
+computing capability or worst network condition) to upload parameters, which
+may significantly degrade the communication efficiency. Commonly-used client
+selection methods such as partial client selection would lead to the waste of
+computing resources and weaken the generalization of the global model. To
+tackle this problem, along a different line, in this paper, we advocate the
+approach of model parameter dropout instead of client selection, and
+accordingly propose a novel framework of Federated learning scheme with
+Differential parameter Dropout (FedDD). FedDD consists of two key modules:
+dropout rate allocation and uploaded parameter selection, which optimize the
+model parameter uploading ratios tailored to different clients' heterogeneous
+conditions and also select the proper set of important model parameters for
+uploading subject to clients' dropout rate constraints. Specifically, the
+dropout rate allocation is formulated as a convex optimization problem, taking
+system heterogeneity, data heterogeneity, and model heterogeneity among clients
+into consideration. The uploaded parameter selection strategy prioritizes
+important parameters for uploading to speed up convergence. Furthermore, we
+theoretically analyze the convergence of the proposed FedDD scheme. Extensive
+performance evaluations demonstrate that the proposed FedDD scheme can achieve
+outstanding performance in both communication efficiency and model convergence,
+and also possesses a strong generalization capability to data of rare classes.
+
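+ A hedged sketch of the uploaded-parameter-selection step: the magnitude of the
+local update is used here as the importance score, which is an assumption, and
+the per-client dropout rate would come from the convex allocation problem
+described above.
+
+    import torch
+
+    def select_upload_parameters(delta: dict, dropout_rate: float) -> dict:
+        """Pick the most important model updates for one client to upload.
+
+        delta maps parameter names to (local - global) update tensors; a client
+        with dropout rate r uploads only the largest-magnitude (1 - r) fraction
+        of entries, sent as sparse (indices, values) pairs.
+        """
+        flat = torch.cat([d.flatten().abs() for d in delta.values()])
+        k = max(1, int(flat.numel() * (1.0 - dropout_rate)))
+        threshold = torch.topk(flat, k).values.min()
+        upload = {}
+        for name, d in delta.items():
+            mask = d.abs() >= threshold
+            upload[name] = (mask.nonzero(as_tuple=False), d[mask])
+        return upload
+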
+
+
+
+
+ + ♻ ☆ C3: Cross-instance guided Contrastive Clustering BMVC-23 + + +
+ Clustering is the task of gathering similar data samples into clusters
+without using any predefined labels. It has been widely studied in machine
+learning literature, and recent advancements in deep learning have revived
+interest in this field. Contrastive clustering (CC) models are a staple of deep
+clustering in which positive and negative pairs of each data instance are
+generated through data augmentation. CC models aim to learn a feature space
+where instance-level and cluster-level representations of positive pairs are
+grouped together. Despite improving the SOTA, these algorithms ignore the
+cross-instance patterns, which carry essential information for improving
+clustering performance. This increases the false-negative-pair rate of the
+model while decreasing its true-positive-pair rate. In this paper, we propose a
+novel contrastive clustering method, Cross-instance guided Contrastive
+Clustering (C3), that considers the cross-sample relationships to increase the
+number of positive pairs and mitigate the impact of false-negative, noisy, and
+anomalous samples on the learned representation of the data. In particular, we
+define a new loss function that identifies similar instances using the
+instance-level representation and encourages them to aggregate together.
+Moreover, we propose a novel weighting method to select negative samples in a
+more efficient way. Extensive experimental evaluations show that our proposed
+method can outperform state-of-the-art algorithms on benchmark computer vision
+datasets: we improve the clustering accuracy by 6.6%, 3.3%, 5.0%, 1.3% and 0.3%
+on CIFAR-10, CIFAR-100, ImageNet-10, ImageNet-Dogs, and Tiny-ImageNet.
+
+
+ comment: Accepted for publication at the 34th British Machine Vision + Conference (BMVC-23) +
+
+
+
+
+ + ♻ ☆ Tango: rethinking quantization for graph neural network training on GPUs + + +
+ Graph Neural Networks (GNNs) are becoming increasingly popular due to their
+superior performance in critical graph-related tasks. While quantization is
+widely used to accelerate GNN computation, quantized training faces
+unprecedented challenges. Current quantized GNN training systems often have
+longer training times than their full-precision counterparts for two reasons:
+(i) addressing the accuracy challenge leads to excessive overhead, and (ii) the
+optimization potential exposed by quantization is not adequately leveraged.
+This paper introduces Tango, which rethinks quantization challenges and
+opportunities for graph neural network training on GPUs, with three
+contributions. First, we introduce efficient rules to maintain accuracy during
+quantized GNN training. Second, we design and implement quantization-aware
+primitives and inter-primitive optimizations that can speed up GNN training.
+Finally, we integrate Tango with the popular Deep Graph Library (DGL) system
+and demonstrate its superior performance over state-of-the-art approaches on
+various GNN models and datasets.
+
+
+
+
+
+ + ♻ ☆ SMDP-Based Dynamic Batching for Efficient Inference on GPU-Based + Platforms + + +
+ In up-to-date machine learning (ML) applications on cloud or edge computing +platforms, batching is an important technique for providing efficient and +economical services at scale. In particular, parallel computing resources on +the platforms, such as graphics processing units (GPUs), have higher +computational and energy efficiency with larger batch sizes. However, larger +batch sizes may also result in longer response time, and thus it requires a +judicious design. This paper aims to provide a dynamic batching policy that +strikes a balance between efficiency and latency. The GPU-based inference +service is modeled as a batch service queue with batch-size dependent +processing time. Then, the design of dynamic batching is a continuous-time +average-cost problem, and is formulated as a semi-Markov decision process +(SMDP) with the objective of minimizing the weighted sum of average response +time and average power consumption. The optimal policy is acquired by solving +an associated discrete-time Markov decision process (MDP) problem with finite +state approximation and "discretization". By introducing an abstract cost to +reflect the impact of "tail" states, the space complexity and the time +complexity of the procedure can decrease by 63.5% and 98%, respectively. Our +results show that the optimal policies potentially possess a control limit +structure. Numerical results also show that SMDP-based batching policies can +adapt to different traffic intensities and outperform other benchmark policies. +Furthermore, the proposed solution has notable flexibility in balancing power +consumption and latency. + +
+
+ comment: Accepted by 2023 IEEE International Conference on Communications + (ICC) +
+
+
+
+
+ + ♻ ☆ Simple and Efficient Heterogeneous Graph Neural Network AAAI 2023 + + +
+ Heterogeneous graph neural networks (HGNNs) have a powerful capability to
+embed rich structural and semantic information of a heterogeneous graph into
+node representations. Existing HGNNs inherit many mechanisms from graph neural
+networks (GNNs) over homogeneous graphs, especially the attention mechanism and
+the multi-layer structure. These mechanisms bring excessive complexity, but few
+works study whether they are really effective on heterogeneous graphs. This
+paper conducts an in-depth and detailed study of these mechanisms and proposes
+the Simple and Efficient Heterogeneous Graph Neural Network (SeHGNN). To easily
+capture structural information, SeHGNN pre-computes the neighbor aggregation
+using a light-weight mean aggregator, which reduces complexity by removing
+overused neighbor attention and avoiding repeated neighbor aggregation in every
+training epoch. To better utilize semantic information, SeHGNN adopts a
+single-layer structure with long metapaths to extend the receptive field, as
+well as a transformer-based semantic fusion module to fuse features from
+different metapaths. As a result, SeHGNN exhibits the characteristics of simple
+network structure, high prediction accuracy, and fast training speed. Extensive
+experiments on five real-world heterogeneous graphs demonstrate the superiority
+of SeHGNN over state-of-the-art methods in both accuracy and training speed.
+
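+ The pre-computation step is simple enough to sketch. A minimal, illustrative
+version follows, with row-normalized metapath adjacency matrices and a shared
+source feature matrix as simplifying assumptions; it runs once before training
+instead of in every epoch.
+
+    from typing import List
+    import torch
+
+    def precompute_metapath_features(features: torch.Tensor,
+                                     metapath_adjs: List[torch.Tensor]) -> List[torch.Tensor]:
+        """Mean-aggregate source features along each metapath exactly once.
+
+        metapath_adjs: row-normalized adjacency matrices (targets x sources),
+        one per metapath, so `adj @ features` is a neighbor mean.
+        """
+        with torch.no_grad():
+            return [adj @ features for adj in metapath_adjs]
+
+    # The per-metapath features would then be fused during training, e.g. by a
+    # small transformer or an MLP over their concatenation.
+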
+
+ comment: Accepted by AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Efficient and Explainable Graph Neural Architecture Search via + Monte-Carlo Tree Search + + +
+ Graph neural networks (GNNs) are powerful tools for performing data science
+tasks in various domains. Although GNNs are used in a wide range of application
+scenarios, it is a laborious task for researchers and practitioners to
+design/select optimal GNN architectures for diverse graphs. To save human
+effort and computational costs, graph neural architecture search (Graph NAS)
+has been used to search for a sub-optimal GNN architecture that combines
+existing components. However, there are no existing Graph NAS methods that
+satisfy explainability, efficiency, and adaptability to various graphs.
+Therefore, we propose an efficient and explainable Graph NAS method, called
+ExGNAS, which consists of (i) a simple search space that can adapt to various
+graphs and (ii) a search algorithm that makes the decision process explainable.
+The search space includes only fundamental functions that can handle homophilic
+and heterophilic graphs. The search algorithm efficiently searches for the best
+GNN architecture via Monte-Carlo tree search without neural models. The
+combination of our search space and algorithm makes it possible to find
+accurate GNN models and to identify the important functions within the search
+space. We comprehensively evaluate our method against twelve hand-crafted GNN
+architectures and three Graph NAS methods on four graphs. Our experimental
+results show that ExGNAS increases AUC by up to 3.6 and reduces run time by up
+to 78% compared with state-of-the-art Graph NAS methods. Furthermore, we show
+that ExGNAS is effective in analyzing the difference between GNN architectures
+on homophilic and heterophilic graphs.
+
+
+
+
+
+ + ♻ ☆ A New Multifractal-based Deep Learning Model for Text Mining + + +
+ In this world full of uncertainty, where the fabric of existence weaves
+patterns of complexity, multifractals emerge as beacons of insight, illuminating
+these patterns. As we delve into the realm of text mining that underpins various
+natural language processing applications and powers a range of intelligent
+services, we recognize that behind the veil of text lies a manifestation of
+human thought and cognition, intricately intertwined with the complexities.
+Building upon the foundation of perceiving text as a complex system, this study
+embarks on a journey to unravel the hidden treasures within, armed with the
+proposed multifractal method that deciphers the multifractal attributes
+embedded within the text landscape. This endeavor culminates in the birth of
+our novel model, which also harnesses the power of the proposed activation
+function to facilitate nonlinear information transmission within its neural
+network architecture. The success of experiments anchored in real-world
+technical reports, covering the extraction of technical terms and the
+classification of hazard events, stands as a testament to our endeavors. This
+research venture not only expands our understanding of text mining but also
+opens new horizons for knowledge discovery across various domains.
+
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D + Understanding, Generation, and Instruction Following + + +
+ We introduce Point-Bind, a 3D multi-modality model aligning point clouds with +2D image, language, audio, and video. Guided by ImageBind, we construct a joint +embedding space between 3D and multi-modalities, enabling many promising +applications, e.g., any-to-3D generation, 3D embedding arithmetic, and 3D +open-world understanding. On top of this, we further present Point-LLM, the +first 3D large language model (LLM) following 3D multi-modal instructions. By +parameter-efficient fine-tuning techniques, Point-LLM injects the semantics of +Point-Bind into pre-trained LLMs, e.g., LLaMA, which requires no 3D instruction +data, but exhibits superior 3D and multi-modal question-answering capacity. We +hope our work may cast a light on the community for extending 3D point clouds +to multi-modality applications. Code is available at +https://github.com/ZiyuGuo99/Point-Bind_Point-LLM. + +
+
+ comment: Work in progress. Code is available at + https://github.com/ZiyuGuo99/Point-Bind_Point-LLM +
+
+
+
+
+ + ☆ VideoGen: A Reference-Guided Latent Diffusion Approach for High + Definition Text-to-Video Generation + + +
+ In this paper, we present VideoGen, a text-to-video generation approach,
+which can generate a high-definition video with high frame fidelity and strong
+temporal consistency using reference-guided latent diffusion. We leverage an
+off-the-shelf text-to-image generation model, e.g., Stable Diffusion, to
+generate an image with high content quality from the text prompt, as a
+reference image to guide video generation. Then, we introduce an efficient
+cascaded latent diffusion module conditioned on both the reference image and
+the text prompt, for generating latent video representations, followed by a
+flow-based temporal upsampling step to improve the temporal resolution.
+Finally, we map latent video representations into a high-definition video
+through an enhanced video decoder. During training, we use the first frame of a
+ground-truth video as the reference image for training the cascaded latent
+diffusion module. The main characteristics of our approach include: the
+reference image generated by the text-to-image model improves the visual
+fidelity; using it as the condition makes the diffusion model focus more on
+learning the video dynamics; and the video decoder is trained over unlabeled
+video data, thus benefiting from high-quality, easily available videos.
+VideoGen sets a new state-of-the-art in text-to-video generation in terms of
+both qualitative and quantitative evaluation.
+
+
+ comment: 8pages, 8figures +
+
+
+
+
+ + ☆ Towards Contrastive Learning in Music Video Domain + + +
+ Contrastive learning is a powerful way of learning multimodal representations
+across various domains such as image-caption retrieval and audio-visual
+representation learning. In this work, we investigate if these findings
+generalize to the domain of music videos. Specifically, we create a dual
+encoder for the audio and video modalities and train it using a bidirectional
+contrastive loss. For the experiments, we use an industry dataset containing
+550 000 music videos as well as the public Million Song Dataset, and evaluate
+the quality of learned representations on the downstream tasks of music tagging
+and genre classification. Our results indicate that pre-trained networks
+without contrastive fine-tuning outperform our contrastive learning approach
+when evaluated on both tasks. To gain a better understanding of the reasons
+contrastive learning was not successful for music videos, we perform a
+qualitative analysis of the learned representations, revealing why contrastive
+learning might have difficulties uniting embeddings from two modalities. Based
+on these findings, we outline possible directions for future work. To
+facilitate the reproducibility of our results, we share our code and the
+pre-trained model.
+
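+ A minimal sketch of the bidirectional contrastive loss mentioned above, in its
+standard symmetric InfoNCE form; the temperature value and tensor names are
+assumptions rather than the paper's exact setup.
+
+    import torch
+    import torch.nn.functional as F
+
+    def bidirectional_contrastive_loss(audio_emb, video_emb, temperature=0.07):
+        """Symmetric audio-to-video and video-to-audio InfoNCE loss.
+
+        Matching audio/video clips from the same music video are positives;
+        all other clips in the batch serve as negatives.
+        """
+        a = F.normalize(audio_emb, dim=-1)
+        v = F.normalize(video_emb, dim=-1)
+        logits = a @ v.t() / temperature                  # (batch, batch)
+        targets = torch.arange(a.size(0), device=a.device)
+        return 0.5 * (F.cross_entropy(logits, targets) +
+                      F.cross_entropy(logits.t(), targets))
+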
+
+ comment: 6 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Human-Inspired Facial Sketch Synthesis with Dynamic Adaptation ICCV'23 + + +
+ Facial sketch synthesis (FSS) aims to generate a vivid sketch portrait from a
+given facial photo. Existing FSS methods merely rely on 2D representations of
+facial semantic or appearance. However, professional human artists usually use
+outlines or shadings to convey 3D geometry. Thus facial 3D geometry (e.g. depth
+map) is extremely important for FSS. Besides, different artists may use diverse
+drawing techniques and create multiple styles of sketches; but the style is
+globally consistent in a sketch. Inspired by such observations, in this paper,
+we propose a novel Human-Inspired Dynamic Adaptation (HIDA) method.
+Specifically, we propose to dynamically modulate neuron activations based on a
+joint consideration of both facial 3D geometry and 2D appearance, as well as
+globally consistent style control. Besides, we use deformable convolutions at
+coarse-scales to align deep features, for generating abstract and distinct
+outlines. Experiments show that HIDA can generate high-quality sketches in
+multiple styles, and significantly outperforms previous methods, over a large
+range of challenging faces. Besides, HIDA allows precise style control of the
+synthesized sketch, and generalizes well to natural scenes and other artistic
+styles. Our code and results have been released online at:
+https://github.com/AiArt-HDU/HIDA.
+
+
+ comment: To appear on ICCV'23 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 52 + +
+
+
+ + ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have created a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, thereby enabling LLMs to understand point clouds and offering +a new avenue beyond 2D visual data. PointLLM processes colored object point +clouds with human instructions and generates contextually appropriate +responses, illustrating its grasp of point clouds and common sense. +Specifically, it leverages a point cloud encoder with a powerful LLM to +effectively fuse geometric, appearance, and linguistic information. We collect +a novel dataset comprising 660K simple and 70K complex point-text instruction +pairs to enable a two-stage training strategy: initially aligning latent spaces +and subsequently instruction-tuning the unified model. To rigorously evaluate +our model's perceptual abilities and its generalization capabilities, we +establish two benchmarks: Generative 3D Object Classification and 3D Object +Captioning, assessed through three different methods, including human +evaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment +results show that PointLLM demonstrates superior performance over existing 2D +baselines. Remarkably, in human-evaluated object captioning tasks, PointLLM +outperforms human annotators in over 50% of the samples. Codes, datasets, and +benchmarks are available at https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 19 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", transformer architecture +has led to revolutionary advancements in NLP. The attention layer within the +transformer admits a sequence of input tokens $X$ and makes them interact +through pairwise similarities computed as softmax$(XQK^\top X^\top)$, where +$(K,Q)$ are the trainable key-query parameters. In this work, we establish a +formal equivalence between the optimization geometry of self-attention and a +hard-margin SVM problem that separates optimal input tokens from non-optimal +tokens using linear constraints on the outer-products of token pairs. This +formalism allows us to characterize the implicit bias of 1-layer transformers +optimized with gradient descent: (1) Optimizing the attention layer with +vanishing regularization, parameterized by $(K,Q)$, converges in direction to +an SVM solution minimizing the nuclear norm of the combined parameter +$W=KQ^\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm +objective. We characterize this convergence, highlighting that it can occur +toward locally-optimal directions rather than global ones. (2) Complementing +this, we prove the local/global directional convergence of gradient descent +under suitable geometric conditions. Importantly, we show that +over-parameterization catalyzes global convergence by ensuring the feasibility +of the SVM problem and by guaranteeing a benign optimization landscape devoid +of stationary points. (3) While our theory applies primarily to linear +prediction heads, we propose a more general SVM equivalence that predicts the +implicit bias with nonlinear heads. Our findings are applicable to arbitrary +datasets and their validity is verified via experiments. We also introduce +several open problems and research directions. We believe these findings +inspire the interpretation of transformers as a hierarchy of SVMs that +separates and selects optimal tokens. + +
+
+
+
+
+ + ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid +advancements, exhibiting a remarkable capacity for perceiving, understanding, +and processing visual information by connecting visual receptor with large +language models (LLMs). However, current assessments mainly focus on +recognizing and reasoning abilities, lacking direct evaluation of +conversational skills and neglecting visual storytelling abilities. In this +paper, we propose an evaluation method that uses strong LLMs as judges to +comprehensively evaluate the various abilities of LVLMs. Firstly, we construct +a comprehensive visual dialogue dataset TouchStone, consisting of open-world +images and questions, covering five major categories of abilities and 27 +subtasks. This dataset not only covers fundamental recognition and +comprehension but also extends to literary creation. Secondly, by integrating +detailed image annotations we effectively transform the multimodal input +content into a form understandable by LLMs. This enables us to employ advanced +LLMs for directly evaluating the quality of the multimodal dialogue without +requiring human intervention. Through validation, we demonstrate that powerful +LVLMs, such as GPT-4, can effectively score dialogue quality by leveraging +their textual capabilities alone, aligning with human preferences. We hope our +work can serve as a touchstone for LVLMs' evaluation and pave the way for +building stronger LVLMs. The evaluation code is available at +https://github.com/OFA-Sys/TouchStone. + +
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ☆ The Gender-GAP Pipeline: A Gender-Aware Polyglot Pipeline for Gender + Characterisation in 55 Languages + + +
+ Gender biases in language generation systems are challenging to mitigate. One +possible source for these biases is gender representation disparities in the +training and evaluation data. Despite recent progress in documenting this +problem and many attempts at mitigating it, we still lack shared methodology +and tooling to report gender representation in large datasets. Such +quantitative reporting will enable further mitigation, e.g., via data +augmentation. This paper describes the Gender-GAP Pipeline (for Gender-Aware +Polyglot Pipeline), an automatic pipeline to characterize gender representation +in large-scale datasets for 55 languages. The pipeline uses a multilingual +lexicon of gendered person-nouns to quantify the gender representation in text. +We showcase it to report gender representation in WMT training data and +development data for the News task, confirming that current data is skewed +towards masculine representation. Having unbalanced datasets may indirectly +optimize our systems towards outperforming one gender over the others. We +suggest introducing our gender quantification pipeline in current datasets and, +ideally, modifying them toward a balanced representation. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Can Programming Languages Boost Each Other via Instruction Tuning? + + +
+ When human programmers have mastered a programming language, it becomes
+easier for them to learn a new one. In this report, we focus on exploring
+whether programming languages can boost each other during the instruction
+fine-tuning phase of code large language models. We conduct extensive
+experiments of 8 popular programming languages (Python, JavaScript, TypeScript,
+C, C++, Java, Go, HTML) on StarCoder. Results demonstrate that programming
+languages can significantly improve each other. For example, CodeM-Python 15B
+trained on Python is able to increase Java by an absolute 17.95% pass@1 on
+HumanEval-X. More surprisingly, we found that CodeM-HTML 7B trained on the HTML
+corpus can improve Java by an absolute 15.24% pass@1. Our training data is
+released at https://github.com/NL2Code/CodeM.
+
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Simple LLM Prompting is State-of-the-Art for Robust and Multilingual + Dialogue Evaluation + + +
+ Despite significant research effort in the development of automatic dialogue +evaluation metrics, little thought is given to evaluating dialogues other than +in English. At the same time, ensuring metrics are invariant to semantically +similar responses is also an overlooked topic. In order to achieve the desired +properties of robustness and multilinguality for dialogue evaluation metrics, +we propose a novel framework that takes advantage of the strengths of current +evaluation models with the newly-established paradigm of prompting Large +Language Models (LLMs). Empirical results show our framework achieves state of +the art results in terms of mean Spearman correlation scores across several +benchmarks and ranks first place on both the Robust and Multilingual tasks of +the DSTC11 Track 4 "Automatic Evaluation Metrics for Open-Domain Dialogue +Systems", proving the evaluation capabilities of prompted LLMs. + +
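The meta-evaluation step behind "mean Spearman correlation scores" is simple to reproduce once metric scores and human ratings are available; the sketch below shows it with made-up placeholder scores, abstracting away the prompting step.

```python
# Sketch: correlating prompted-LLM quality scores with human ratings, as in
# DSTC11-style meta-evaluation. The score lists here are made-up placeholders.
from scipy.stats import spearmanr

llm_scores   = [4.5, 2.0, 3.5, 5.0, 1.5, 4.0]   # e.g., parsed from LLM judgments
human_scores = [4.0, 2.5, 3.0, 5.0, 1.0, 4.5]   # e.g., mean annotator ratings

rho, p_value = spearmanr(llm_scores, human_scores)
print(f"Spearman rho = {rho:.3f} (p = {p_value:.3f})")
```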
+
+ comment: DSTC11 best paper for Track 4 +
+
+
+
+
+ + ☆ Towards Multilingual Automatic Dialogue Evaluation SIGDIAL23 + + +
+ The main limiting factor in the development of robust multilingual dialogue +evaluation metrics is the lack of multilingual data and the limited +availability of open sourced multilingual dialogue systems. In this work, we +propose a workaround for this lack of data by leveraging a strong multilingual +pretrained LLM and augmenting existing English dialogue data using Machine +Translation. We empirically show that the naive approach of finetuning a +pretrained multilingual encoder model with translated data is insufficient to +outperform the strong baseline of finetuning a multilingual model with only +source data. Instead, the best approach consists in the careful curation of +translated data using MT Quality Estimation metrics, excluding low quality +translations that hinder its performance. + +
+
+ comment: SIGDIAL23 +
+
+
+
+
+ + ☆ Enhancing PLM Performance on Labour Market Tasks via Instruction-based + Finetuning and Prompt-tuning with Rules RecSys + + +
+ The increased digitization of the labour market has given researchers, +educators, and companies the means to analyze and better understand the labour +market. However, labour market resources, although available in high volumes, +tend to be unstructured, and as such, research towards methodologies for the +identification, linking, and extraction of entities becomes more and more +important. Against the backdrop of this quest for better labour market +representations, resource constraints and the unavailability of large-scale +annotated data cause a reliance on human domain experts. We demonstrate the +effectiveness of prompt-based tuning of pre-trained language models (PLM) in +labour market specific applications. Our results indicate that cost-efficient +methods such as PTR and instruction tuning without exemplars can significantly +increase the performance of PLMs on downstream labour market applications +without introducing additional model layers, manual annotations, and data +augmentation. + +
+
+ comment: accepted for publication at RecSys in HR 2023 +
+
+
+
+
+ + ☆ Ladder-of-Thought: Using Knowledge as Steps to Elevate Stance Detection + + +
+ Chain-of-Thought Prompting (CoT) reinforces the reasoning capabilities of Large Language Models (LLMs) through the generation of intermediate rationales. However, these enhancements predominantly benefit large-scale models, leaving small LMs without significant performance improvements when directly applying CoT. Despite the advanced reasoning capabilities of LLMs, CoT relies primarily on their pre-trained internal knowledge; external knowledge that is previously unknown to the model remains unexploited. This omission becomes pronounced in tasks such as stance detection, where external background knowledge plays a pivotal role. Additionally, the large-scale architecture of LLMs inevitably presents efficiency challenges during deployment. To address these challenges, we introduce the Ladder-of-Thought (LoT) for stance detection. Grounded in a dual-phase Cascaded Optimization framework, LoT directs the model to incorporate high-quality external knowledge, enhancing the intermediate rationales it generates. These bolstered rationales subsequently serve as the foundation for more precise predictions - akin to how a ladder facilitates reaching elevated goals. LoT achieves a balance between efficiency and accuracy, making it an adaptable and efficient framework for stance detection. Our empirical evaluations underscore LoT's effectiveness, marking a 16% improvement over ChatGPT and a 10% enhancement compared to ChatGPT with CoT.
+
+ comment: 5 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ CReHate: Cross-cultural Re-annotation of English Hate Speech Dataset + + +
+ English datasets predominantly reflect the perspectives of certain +nationalities, which can lead to cultural biases in models and datasets. This +is particularly problematic in tasks heavily influenced by subjectivity, such +as hate speech detection. To delve into how individuals from different +countries perceive hate speech, we introduce CReHate, a cross-cultural +re-annotation of the sampled SBIC dataset. This dataset includes annotations +from five distinct countries: Australia, Singapore, South Africa, the United +Kingdom, and the United States. Our thorough statistical analysis highlights +significant differences based on nationality, with only 59.4% of the samples +achieving consensus among all countries. We also introduce a culturally +sensitive hate speech classifier via transfer learning, adept at capturing +perspectives of different nationalities. These findings underscore the need to +re-evaluate certain aspects of NLP research, especially with regard to the +nuanced nature of hate speech in the English language. + +
+
+
+
+
+ + ☆ SpeechTokenizer: Unified Speech Tokenizer for Speech Large Language + Models + + +
+ Current speech large language models build upon discrete speech representations, which can be categorized into semantic tokens and acoustic tokens. However, existing speech tokens are not specifically designed for speech language modeling. To assess the suitability of speech tokens for building speech language models, we established the first benchmark, SLMTokBench. Our results indicate that neither semantic nor acoustic tokens are ideal for this purpose. Therefore, we propose SpeechTokenizer, a unified speech tokenizer for speech large language models. SpeechTokenizer adopts the Encoder-Decoder architecture with residual vector quantization (RVQ). Unifying semantic and acoustic tokens, SpeechTokenizer disentangles different aspects of speech information hierarchically across different RVQ layers. Furthermore, we construct a Unified Speech Language Model (USLM) leveraging SpeechTokenizer. Experiments show that SpeechTokenizer performs comparably to EnCodec in speech reconstruction and demonstrates strong performance on the SLMTokBench benchmark. Also, USLM outperforms VALL-E in zero-shot Text-to-Speech tasks. Code and models are available at https://github.com/ZhangXInFD/SpeechTokenizer/.
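The layered quantization at the heart of this design is residual vector quantization; a generic RVQ sketch with random codebooks is shown below. The real model learns codebooks jointly with an encoder-decoder and the disentanglement objectives described above.

```python
# Generic residual vector quantization (RVQ) sketch with random codebooks,
# illustrating the layered structure SpeechTokenizer builds on.
import numpy as np

rng = np.random.default_rng(0)
num_layers, codebook_size, dim = 4, 256, 64
codebooks = rng.normal(size=(num_layers, codebook_size, dim))

def rvq_encode(x, codebooks):
    """Quantize x layer by layer; each layer quantizes the previous residual."""
    residual, codes, quantized = x.copy(), [], np.zeros_like(x)
    for cb in codebooks:
        idx = np.argmin(((residual[:, None, :] - cb[None, :, :]) ** 2).sum(-1), axis=1)
        codes.append(idx)
        quantized += cb[idx]
        residual = x - quantized
    return np.stack(codes), quantized

frames = rng.normal(size=(10, dim))          # stand-in for encoder outputs
codes, recon = rvq_encode(frames, codebooks)
print(codes.shape, float(np.mean((frames - recon) ** 2)))
```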
+
+ comment: SpeechTokenizer project page is + https://0nutation.github.io/SpeechTokenizer.github.io/ +
+
+
+
+
+ + ☆ Using Large Language Models to Automate Category and Trend Analysis of + Scientific Articles: An Application in Ophthalmology + + +
+ Purpose: In this paper, we present an automated method for article classification, leveraging the power of Large Language Models (LLM). The primary focus is on the field of ophthalmology, but the model is extendable to other fields. Methods: We have developed a model based on Natural Language Processing (NLP) techniques, including advanced LLMs, to process and analyze the textual content of scientific papers. Specifically, we have employed zero-shot learning (ZSL) LLM models and compared them against Bidirectional and Auto-Regressive Transformers (BART) and its variants, and Bidirectional Encoder Representations from Transformers (BERT) and its variants, such as distilBERT, SciBERT, PubmedBERT, and BioBERT. Results: To evaluate the LLMs, we compiled a dataset (RenD) of 1000 ocular disease-related articles, which were expertly annotated by a panel of six specialists into 15 distinct categories. The classification results demonstrate the effectiveness of LLMs in categorizing a large number of ophthalmology papers without human intervention: the model achieved a mean accuracy of 0.86 and a mean F1 of 0.85 on the RenD dataset. Conclusion: The proposed framework achieves notable improvements in both accuracy and efficiency. Its application in the domain of ophthalmology showcases its potential for knowledge organization and retrieval in other domains too. We performed trend analysis that enables researchers and clinicians to easily categorize and retrieve relevant papers, saving time and effort in literature review and information gathering as well as identification of emerging scientific trends within different disciplines. Moreover, the extendibility of the model to other scientific fields broadens its impact in facilitating research and trend analysis across diverse disciplines.
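As a rough illustration of the zero-shot setup, the sketch below uses the HuggingFace zero-shot classification pipeline with an off-the-shelf NLI model; the candidate labels are illustrative, not the 15 expert-defined RenD categories.

```python
# Sketch of zero-shot article classification with an off-the-shelf NLI model
# via the transformers pipeline; labels below are illustrative placeholders.
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
abstract = "We report outcomes of anti-VEGF therapy in patients with diabetic macular edema."
labels = ["retina", "glaucoma", "cornea", "pediatric ophthalmology"]

result = classifier(abstract, candidate_labels=labels)
print(result["labels"][0], round(result["scores"][0], 3))
```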
+
+
+
+
+ + ☆ DictaBERT: A State-of-the-Art BERT Suite for Modern Hebrew + + +
+ We present DictaBERT, a new state-of-the-art pre-trained BERT model for modern Hebrew, outperforming existing models on most benchmarks. Additionally, we release two fine-tuned versions of the model, designed to perform two specific foundational tasks in the analysis of Hebrew texts: prefix segmentation and morphological tagging. These fine-tuned models allow any developer to perform prefix segmentation and morphological tagging of a Hebrew sentence with a single call to a HuggingFace model, without the need to integrate any additional libraries or code. In this paper we describe the details of the training as well as the results on the different benchmarks. We release the models to the community, along with sample code demonstrating their use. We release these models as part of our goal to help further research and development in Hebrew NLP.
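A hedged sketch of what "a single call to a HuggingFace model" can look like for the base model; the identifier dicta-il/dictabert and the [MASK] token convention are assumptions here, so substitute whatever the authors actually publish.

```python
# Hedged sketch: querying a Hebrew BERT through the HuggingFace fill-mask
# pipeline. The model id "dicta-il/dictabert" is an assumption; the fine-tuned
# segmentation/tagging variants are loaded the same way under their own ids.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="dicta-il/dictabert")
# "He went to [MASK] yesterday evening." -- assuming the standard BERT mask token.
for pred in fill_mask("הוא הלך ל[MASK] אתמול בערב.")[:3]:
    print(pred["token_str"], round(pred["score"], 3))
```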
+
+
+
+
+ + ☆ Developing a Scalable Benchmark for Assessing Large Language Models in + Knowledge Graph Engineering + + +
+ As the field of Large Language Models (LLMs) evolves at an accelerated pace, the critical need to assess and monitor their performance emerges. We introduce a benchmarking framework focused on knowledge graph engineering (KGE), accompanied by three challenges addressing syntax and error correction, fact extraction, and dataset generation. We show that, while useful tools, LLMs are not yet fit to assist in knowledge graph generation with zero-shot prompting. Consequently, our LLM-KG-Bench framework provides automatic evaluation and storage of LLM responses as well as statistical data and visualization tools to support tracking of prompt engineering and model performance.
+
+ comment: To be published in SEMANTICS 2023 poster track proceedings. SEMANTICS + 2023 EU: 19th International Conference on Semantic Systems, September 20-22, + 2023, Leipzig, Germany +
+
+
+
+
+ + ☆ Towards Spontaneous Style Modeling with Semi-supervised Pre-training for + Conversational Text-to-Speech Synthesis INTERSPEECH 2023 + + +
+ The spontaneous behavior that often occurs in conversations makes speech more human-like compared to reading-style speech. However, synthesizing spontaneous-style speech is challenging due to the lack of high-quality spontaneous datasets and the high cost of labeling spontaneous behavior. In this paper, we propose a semi-supervised pre-training method to increase the amount of spontaneous-style speech and spontaneous behavioral labels. In the process of semi-supervised learning, both text and speech information are considered for detecting spontaneous behavior labels in speech. Moreover, a linguistic-aware encoder is used to model the relationships between sentences in the conversation. Experimental results indicate that our proposed method achieves superior expressive speech synthesis performance with the ability to model spontaneous behavior in spontaneous-style speech and predict reasonable spontaneous behavior from text.
+
+ comment: Accepted by INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Interpreting Sentiment Composition with Latent Semantic Tree ACL2023 + + +
+ As the key to sentiment analysis, sentiment composition considers the classification of a constituent via classifications of its contained sub-constituents and rules operating on them. Such compositionality has been widely studied previously in the form of hierarchical trees, including untagged and sentiment ones, which are intrinsically suboptimal in our view. To address this, we propose the semantic tree, a new tree form capable of interpreting sentiment composition in a principled way. The semantic tree is a derivation of a context-free grammar (CFG) describing the specific composition rules for different semantic roles, designed carefully following previous linguistic conclusions. However, the semantic tree is a latent variable, since it is not annotated in regular datasets. Thus, in our method, it is marginalized out via the inside algorithm and learned to optimize classification performance. Quantitative and qualitative results demonstrate that our method not only achieves better or competitive results compared to baselines in regular and domain-adaptation classification settings, but also generates plausible tree explanations.
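Marginalizing a latent tree "via the inside algorithm" follows the classic CKY-style recursion. The sketch below runs the inside pass on a toy hand-written PCFG in Chomsky normal form; the paper's grammar and scores are learned rather than fixed like this.

```python
# Generic inside (CKY-style) algorithm for a toy PCFG in Chomsky normal form.
# It sums the probabilities of all parse trees of a sentence -- the same
# marginalization idea applied to the paper's latent semantic trees.
from collections import defaultdict

binary = {("S", ("NP", "VP")): 1.0,
          ("VP", ("V", "NP")): 1.0}
unary  = {("NP", "fish"): 0.5, ("NP", "people"): 0.5,
          ("V", "like"): 1.0}

def inside(words):
    n = len(words)
    chart = [[defaultdict(float) for _ in range(n + 1)] for _ in range(n + 1)]
    for i, w in enumerate(words):                      # leaf (unary) cells
        for (sym, word), p in unary.items():
            if word == w:
                chart[i][i + 1][sym] += p
    for span in range(2, n + 1):                       # longer spans bottom-up
        for i in range(n - span + 1):
            k = i + span
            for j in range(i + 1, k):
                for (parent, (left, right)), p in binary.items():
                    chart[i][k][parent] += p * chart[i][j][left] * chart[j][k][right]
    return chart[0][n]["S"]

print(inside(["people", "like", "fish"]))   # 0.5 * 1.0 * 0.5 = 0.25
```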
+
+ comment: Findings of ACL2023 +
+
+
+
+
+ + ☆ Unsupervised Text Style Transfer with Deep Generative Models + + +
+ We present a general framework for unsupervised text style transfer with deep +generative models. The framework models each sentence-label pair in the +non-parallel corpus as partially observed from a complete quadruplet which +additionally contains two latent codes representing the content and style, +respectively. These codes are learned by exploiting dependencies inside the +observed data. Then a sentence is transferred by manipulating them. Our +framework is able to unify previous embedding and prototype methods as two +special forms. It also provides a principled perspective to explain previously +proposed techniques in the field such as aligned encoder and adversarial +training. We further conduct experiments on three benchmarks. Both automatic +and human evaluation results show that our methods achieve better or +competitive results compared to several strong baselines. + +
+
+
+
+
+ + ☆ Improving Mandarin Prosodic Structure Prediction with Multi-level + Contextual Information + + +
+ For text-to-speech (TTS) synthesis, prosodic structure prediction (PSP) plays an important role in producing natural and intelligible speech. Although inter-utterance linguistic information can influence the speech interpretation of the target utterance, previous works on PSP mainly focus on utilizing intra-utterance linguistic information of the current utterance only. This work proposes to use inter-utterance linguistic information to improve the performance of PSP. Multi-level contextual information, which includes both inter-utterance and intra-utterance linguistic information, is extracted by a hierarchical encoder from the character, utterance and discourse levels of the input text. Then a multi-task learning (MTL) decoder predicts prosodic boundaries from multi-level contextual information. Objective evaluation results on two datasets show that our method achieves better F1 scores in predicting prosodic word (PW), prosodic phrase (PPH) and intonational phrase (IPH). It demonstrates the effectiveness of using multi-level contextual information for PSP. Subjective preference tests also indicate that the naturalness of the synthesized speech is improved.
+
+ comment: Accepted by Interspeech2022 +
+
+
+
+
+ + ☆ Thesis Distillation: Investigating The Impact of Bias in NLP Models on + Hate Speech Detection + + +
+ This paper is a summary of the work in my PhD thesis, in which I investigate the impact of bias in NLP models on the task of hate speech detection from three perspectives: explainability, offensive stereotyping bias, and fairness. I discuss the main takeaways from my thesis and how they can benefit the broader NLP community. Finally, I discuss important future research directions. The findings of my thesis suggest that bias in NLP models impacts the task of hate speech detection from all three perspectives, and that unless we start incorporating the social sciences in studying bias in NLP models, we will not effectively overcome the current limitations of measuring and mitigating bias in NLP models.
+
+
+
+
+ + ☆ Time-Varying Quasi-Closed-Phase Analysis for Accurate Formant Tracking + in Speech Signals + + +
+ In this paper, we propose a new method for the accurate estimation and +tracking of formants in speech signals using time-varying quasi-closed-phase +(TVQCP) analysis. Conventional formant tracking methods typically adopt a +two-stage estimate-and-track strategy wherein an initial set of formant +candidates are estimated using short-time analysis (e.g., 10--50 ms), followed +by a tracking stage based on dynamic programming or a linear state-space model. +One of the main disadvantages of these approaches is that the tracking stage, +however good it may be, cannot improve upon the formant estimation accuracy of +the first stage. The proposed TVQCP method provides a single-stage formant +tracking that combines the estimation and tracking stages into one. TVQCP +analysis combines three approaches to improve formant estimation and tracking: +(1) it uses temporally weighted quasi-closed-phase analysis to derive +closed-phase estimates of the vocal tract with reduced interference from the +excitation source, (2) it increases the residual sparsity by using the $L_1$ +optimization and (3) it uses time-varying linear prediction analysis over long +time windows (e.g., 100--200 ms) to impose a continuity constraint on the vocal +tract model and hence on the formant trajectories. Formant tracking experiments +with a wide variety of synthetic and natural speech signals show that the +proposed TVQCP method performs better than conventional and popular formant +tracking tools, such as Wavesurfer and Praat (based on dynamic programming), +the KARMA algorithm (based on Kalman filtering), and DeepFormants (based on +deep neural networks trained in a supervised manner). Matlab scripts for the +proposed method can be found at: https://github.com/njaygowda/ftrack + +
+
+
+
+
+ + ☆ The Smart Data Extractor, a Clinician Friendly Solution to Accelerate + and Improve the Data Collection During Clinical Trials + + +
+ In medical research, the traditional way to collect data, i.e., browsing patient files, has been proven to induce bias, errors, human labor and costs. We propose a semi-automated system able to extract every type of data, including notes. The Smart Data Extractor pre-populates clinical research forms by following rules. We performed a cross-testing experiment to compare semi-automated to manual data collection. Twenty target items had to be collected for 79 patients. The average time to complete one form was 6'81'' for manual data collection and 3'22'' with the Smart Data Extractor. There were also more mistakes during manual data collection (163 for the whole cohort) than with the Smart Data Extractor (46 for the whole cohort). We present an easy-to-use, understandable and agile solution to fill out clinical research forms. It reduces human effort and provides higher quality data, avoiding data re-entry and fatigue-induced errors.
+
+ comment: IOS Press, 2023, Studies in Health Technology and Informatics +
+
+
+
+
+ + ☆ Generalised Winograd Schema and its Contextuality + + +
+ Ambiguities in natural language give rise to probability distributions over +interpretations. The distributions are often over multiple ambiguous words at a +time; a multiplicity which makes them a suitable topic for sheaf-theoretic +models of quantum contextuality. Previous research showed that different +quantitative measures of contextuality correlate well with Psycholinguistic +research on lexical ambiguities. In this work, we focus on coreference +ambiguities and investigate the Winograd Schema Challenge (WSC), a test +proposed by Levesque in 2011 to evaluate the intelligence of machines. The WSC +consists of a collection of multiple-choice questions that require +disambiguating pronouns in sentences structured according to the Winograd +schema, in a way that makes it difficult for machines to determine the correct +referents but remains intuitive for human comprehension. In this study, we +propose an approach that analogously models the Winograd schema as an +experiment in quantum physics. However, we argue that the original Winograd +Schema is inherently too simplistic to facilitate contextuality. We introduce a +novel mechanism for generalising the schema, rendering it analogous to a +Bell-CHSH measurement scenario. We report an instance of this generalised +schema, complemented by the human judgements we gathered via a crowdsourcing +platform. The resulting model violates the Bell-CHSH inequality by 0.192, thus +exhibiting contextuality in a coreference resolution setting. + +
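Checking a Bell-CHSH violation from estimated correlations is a one-line computation; the sketch below uses placeholder correlation values, not the paper's crowdsourced data (which violates the inequality by 0.192).

```python
# Sketch: computing a Bell-CHSH value from pairwise correlations E(a, b) of two
# binary "measurement" choices per party; the numbers below are placeholders.
def chsh(E_ab, E_ab2, E_a2b, E_a2b2):
    return abs(E_ab + E_ab2 + E_a2b - E_a2b2)

S = chsh(0.62, 0.57, 0.60, -0.41)
print(f"CHSH value S = {S:.3f}; classical bound is 2, so violation = {S - 2:.3f}")
```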
+
+ comment: In Proceedings QPL 2023, arXiv:2308.15489 +
+
+
+
+
+ + ☆ Transformer Compression via Subspace Projection + + +
+ We propose TCSP, a novel method for compressing a transformer model by focusing on reducing the hidden size of the model. By projecting the whole transformer model into a subspace, we enable matrix operations between the weight matrices in the model and features in a reduced-dimensional space, leading to significant reductions in model parameters and computing resources. To establish this subspace, we decompose the feature matrix, derived from different layers of sampled data instances, into a projection matrix. For evaluation, TCSP is applied to compress T5 and BERT models on the GLUE and SQuAD benchmarks. Experimental results demonstrate that TCSP achieves a compression ratio of 44% with at most 1.6% degradation in accuracy, surpassing or matching prior compression methods. Furthermore, TCSP exhibits compatibility with other methods targeting filter and attention head size compression.
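One way to picture subspace projection: find principal feature directions via SVD and express a weight matrix in that reduced basis. The sketch below is a toy version with synthetic activations; TCSP's actual decomposition and its choice of which matrices to project may differ.

```python
# Toy sketch of hidden-size reduction by projecting onto the principal
# directions of sampled activations; TCSP's exact procedure may differ.
import numpy as np

rng = np.random.default_rng(0)
hidden, reduced = 768, 256
# Synthetic activations whose variance mostly lives in a low-dimensional subspace.
features = rng.normal(size=(4096, reduced)) @ rng.normal(size=(reduced, hidden))
W = rng.normal(size=(hidden, hidden)) / np.sqrt(hidden)   # one weight matrix to compress

_, _, Vt = np.linalg.svd(features, full_matrices=False)   # principal feature directions
P = Vt[:reduced].T                                        # projection matrix (hidden, reduced)

W_small = P.T @ W @ P                                     # weight expressed in the subspace
print(f"parameters: {W.size} -> {W_small.size} "
      f"({W_small.size / W.size:.1%} of the original)")
```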
+
+ comment: 21 pages, 1 figures +
+
+
+
+
+ + ☆ Enhancing Subtask Performance of Multi-modal Large Language Model + + +
+ Multi-modal Large Language Model (MLLM) refers to a model expanded from a Large Language Model (LLM) that possesses the capability to handle and infer multi-modal data. Current MLLMs typically begin by using LLMs to decompose tasks into multiple subtasks, then employing individual pre-trained models to complete specific subtasks, and ultimately utilizing LLMs to integrate the results of each subtask to obtain the result of the task. In real-world scenarios, when dealing with large projects, it is common practice to break down the project into smaller sub-projects, with different teams providing corresponding solutions or results. The project owner then decides which solution or result to use, ensuring the best possible outcome for each subtask and, consequently, for the entire project. Inspired by this, this study considers selecting multiple pre-trained models to complete the same subtask. By combining the results from multiple pre-trained models, the optimal subtask result is obtained, enhancing the performance of the MLLM. Specifically, this study first selects multiple pre-trained models focused on the same subtask based on distinct evaluation approaches, and then invokes these models in parallel to process input data and generate corresponding subtask results. Finally, the results from multiple pre-trained models for the same subtask are compared using the LLM, and the best result is chosen as the outcome for that subtask. Extensive experiments are conducted in this study using GPT-4-annotated datasets and human-annotated datasets. The results of various evaluation metrics adequately demonstrate the effectiveness of the proposed approach in this paper.
+
+
+
+
+ + ☆ Link Prediction for Wikipedia Articles as a Natural Language Inference + Task + + +
+ The link prediction task is vital to automatically understanding the structure of large knowledge bases. In this paper, we present our system to solve this task at the Data Science and Advanced Analytics 2023 Competition "Efficient and Effective Link Prediction" (DSAA-2023 Competition) with a corpus containing 948,233 training examples and 238,265 public testing examples. This paper introduces an approach to link prediction in Wikipedia articles by formulating it as a natural language inference (NLI) task. Drawing inspiration from recent advancements in natural language processing and understanding, we cast link prediction as an NLI task, wherein the presence of a link between two articles is treated as a premise, and the task is to determine whether this premise holds based on the information presented in the articles. We implemented our system based on the Sentence Pair Classification for Link Prediction for the Wikipedia Articles task. Our system achieved a 0.99996 Macro F1-score and a 1.00000 Macro F1-score on the public and private test sets, respectively. Our team UIT-NLP ranked 3rd in performance on the private test set, equal to the scores of the first and second places. Our code is publicly available for research purposes.
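The sentence-pair formulation maps directly onto a standard sequence-classification encoder. The sketch below uses a placeholder model name and label convention; the classification head would of course need fine-tuning on the competition pairs before its probabilities mean anything.

```python
# Sketch: treating link prediction as sentence-pair (NLI-style) classification.
# Model name, article texts, and label meaning are placeholders, not the
# competition system's exact configuration.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "bert-base-uncased"                       # placeholder encoder
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=2)

premise = "Article A: ... lead section of the source Wikipedia article ..."
hypothesis = "Article B: ... lead section of the candidate target article ..."
inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")

with torch.no_grad():
    probs = model(**inputs).logits.softmax(-1)
# Label 1 = "linked" by convention here; the head must be fine-tuned first.
print("P(link exists) =", float(probs[0, 1]))
```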
+
+ comment: Accepted at the 10th IEEE International Conference On Data Science + And Advanced Analytics (DSAA 2023) +
+
+
+
+
+ + ☆ Sparkles: Unlocking Chats Across Multiple Images for Multimodal + Instruction-Following Models + + +
+ Large language models exhibit enhanced zero-shot performance on various tasks +when fine-tuned with instruction-following data. Multimodal +instruction-following models extend these capabilities by integrating both text +and images. However, existing models such as MiniGPT-4 face challenges in +maintaining dialogue coherence in scenarios involving multiple images. A +primary reason is the lack of a specialized dataset for this critical +application. To bridge these gaps, we present SparklesChat, a multimodal +instruction-following model for open-ended dialogues across multiple images. To +support the training, we introduce SparklesDialogue, the first +machine-generated dialogue dataset tailored for word-level interleaved +multi-image and text interactions. Furthermore, we construct SparklesEval, a +GPT-assisted benchmark for quantitatively assessing a model's conversational +competence across multiple images and dialogue turns. Our experiments validate +the effectiveness of SparklesChat in understanding and reasoning across +multiple images and dialogue turns. Specifically, SparklesChat outperformed +MiniGPT-4 on established vision-and-language benchmarks, including the BISON +binary image selection task and the NLVR2 visual reasoning task. Moreover, +SparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding +MiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative +evaluations further demonstrate SparklesChat's generality in handling +real-world applications. All resources will be available at +https://github.com/HYPJUDY/Sparkles. + +
+
+
+
+
+ + ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ☆ Knowledge Distillation from Non-streaming to Streaming ASR Encoder using + Auxiliary Non-streaming Layer + + +
+ Streaming automatic speech recognition (ASR) models are restricted from +accessing future context, which results in worse performance compared to the +non-streaming models. To improve the performance of streaming ASR, knowledge +distillation (KD) from the non-streaming to streaming model has been studied, +mainly focusing on aligning the output token probabilities. In this paper, we +propose a layer-to-layer KD from the teacher encoder to the student encoder. To +ensure that features are extracted using the same context, we insert auxiliary +non-streaming branches to the student and perform KD from the non-streaming +teacher layer to the non-streaming auxiliary layer. We design a special KD loss +that leverages the autoregressive predictive coding (APC) mechanism to +encourage the streaming model to predict unseen future contexts. Experimental +results show that the proposed method can significantly reduce the word error +rate compared to previous token probability distillation methods. + +
+
+ comment: Accepted to Interspeech 2023 +
+
+
+
+
+ + ☆ LLM in the Shell: Generative Honeypots + + +
+ Honeypots are essential tools in cybersecurity. However, most of them (even +the high-interaction ones) lack the required realism to engage and fool human +attackers. This limitation makes them easily discernible, hindering their +effectiveness. This work introduces a novel method to create dynamic and +realistic software honeypots based on Large Language Models. Preliminary +results indicate that LLMs can create credible and dynamic honeypots capable of +addressing important limitations of previous honeypots, such as deterministic +responses, lack of adaptability, etc. We evaluated the realism of each command +by conducting an experiment with human attackers who needed to say if the +answer from the honeypot was fake or not. Our proposed honeypot, called shelLM, +reached an accuracy rate of 0.92. + +
+
+ comment: 5 pages. 1 figure 1 table +
+
+
+
+
+ + ☆ Construction Grammar and Artificial Intelligence + + +
+ In this chapter, we argue that it is highly beneficial for the contemporary +construction grammarian to have a thorough understanding of the strong +relationship between the research fields of construction grammar and artificial +intelligence. We start by unravelling the historical links between the two +fields, showing that their relationship is rooted in a common attitude towards +human communication and language. We then discuss the first direction of +influence, focussing in particular on how insights and techniques from the +field of artificial intelligence play an important role in operationalising, +validating and scaling constructionist approaches to language. We then proceed +to the second direction of influence, highlighting the relevance of +construction grammar insights and analyses to the artificial intelligence +endeavour of building truly intelligent agents. We support our case with a +variety of illustrative examples and conclude that the further elaboration of +this relationship will play a key role in shaping the future of the field of +construction grammar. + +
+
+ comment: Peer-reviewed author's draft of a chapter to appear in the Cambridge + Handbook of Construction Grammar (2024 - edited by Mirjam Fried and Kiki + Nikiforidou) +
+
+
+
+
+ + ☆ QS-TTS: Towards Semi-Supervised Text-to-Speech Synthesis via + Vector-Quantized Self-Supervised Speech Representation Learning + + +
+ This paper proposes a novel semi-supervised TTS framework, QS-TTS, to improve +TTS quality with lower supervised data requirements via Vector-Quantized +Self-Supervised Speech Representation Learning (VQ-S3RL) utilizing more +unlabeled speech audio. This framework comprises two VQ-S3R learners: first, +the principal learner aims to provide a generative Multi-Stage Multi-Codebook +(MSMC) VQ-S3R via the MSMC-VQ-GAN combined with the contrastive S3RL, while +decoding it back to the high-quality audio; then, the associate learner further +abstracts the MSMC representation into a highly-compact VQ representation +through a VQ-VAE. These two generative VQ-S3R learners provide profitable +speech representations and pre-trained models for TTS, significantly improving +synthesis quality with the lower requirement for supervised data. QS-TTS is +evaluated comprehensively under various scenarios via subjective and objective +tests in experiments. The results powerfully demonstrate the superior +performance of QS-TTS, winning the highest MOS over supervised or +semi-supervised baseline TTS approaches, especially in low-resource scenarios. +Moreover, comparing various speech representations and transfer learning +methods in TTS further validates the notable improvement of the proposed +VQ-S3RL to TTS, showing the best audio quality and intelligibility metrics. The +trend of slower decay in the synthesis quality of QS-TTS with decreasing +supervised data further highlights its lower requirements for supervised data, +indicating its great potential in low-resource scenarios. + +
+
+
+
+
+ + ☆ Large language models in medicine: the potentials and pitfalls + + +
+ Large language models (LLMs) have been applied to tasks in healthcare, +ranging from medical exam questions to responding to patient questions. With +increasing institutional partnerships between companies producing LLMs and +healthcare systems, real world clinical application is coming closer to +reality. As these models gain traction, it is essential for healthcare +practitioners to understand what LLMs are, their development, their current and +potential applications, and the associated pitfalls when utilized in medicine. +This review and accompanying tutorial aim to give an overview of these topics +to aid healthcare practitioners in understanding the rapidly changing landscape +of LLMs as applied to medicine. + +
+
+
+
+
+ + ☆ YaRN: Efficient Context Window Extension of Large Language Models + + +
+ Rotary Position Embeddings (RoPE) have been shown to effectively encode positional information in transformer-based language models. However, these models fail to generalize past the sequence length they were trained on. We present YaRN (Yet another RoPE extensioN method), a compute-efficient method to extend the context window of such models, requiring 10x fewer tokens and 2.5x fewer training steps than previous methods. Using YaRN, we show that LLaMA models can effectively utilize and extrapolate to context lengths much longer than their original pre-training would allow, while also surpassing the previous state-of-the-art in context window extension. In addition, we demonstrate that YaRN exhibits the capability to extrapolate beyond the limited context of a fine-tuning dataset. We publish the checkpoints of Llama 2 7B/13B fine-tuned using YaRN with 64k and 128k context windows at https://github.com/jquesnelle/yarn
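For intuition, the sketch below shows plain RoPE angle computation plus simple linear position interpolation, the baseline idea that YaRN improves on; YaRN's per-frequency interpolation and attention rescaling are not reproduced here.

```python
# Plain RoPE frequencies plus simple linear position interpolation -- the
# baseline that YaRN refines (YaRN additionally blends interpolation per
# frequency band and rescales attention; that part is omitted here).
import torch

def rope_angles(seq_len, dim, base=10000.0, scale=1.0):
    """Return the (seq_len, dim/2) rotation angles used by rotary embeddings.
    scale > 1 stretches positions so a longer context maps into the trained range."""
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    positions = torch.arange(seq_len).float() / scale
    return torch.outer(positions, inv_freq)

trained_ctx, extended_ctx, head_dim = 4096, 16384, 128
plain = rope_angles(extended_ctx, head_dim)                       # extrapolates past training
interp = rope_angles(extended_ctx, head_dim, scale=extended_ctx / trained_ctx)
print(plain.shape, float(plain[-1, 0]), float(interp[-1, 0]))     # last-position angles
```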
+
+
+
+
+ + ♻ ☆ Sensi-BERT: Towards Sensitivity Driven Fine-Tuning for + Parameter-Efficient BERT + + +
+ Large pre-trained language models have recently gained significant traction +due to their improved performance on various down-stream tasks like text +classification and question answering, requiring only few epochs of +fine-tuning. However, their large model sizes often prohibit their applications +on resource-constrained edge devices. Existing solutions of yielding +parameter-efficient BERT models largely rely on compute-exhaustive training and +fine-tuning. Moreover, they often rely on additional compute heavy models to +mitigate the performance gap. In this paper, we present Sensi-BERT, a +sensitivity driven efficient fine-tuning of BERT models that can take an +off-the-shelf pre-trained BERT model and yield highly parameter-efficient +models for downstream tasks. In particular, we perform sensitivity analysis to +rank each individual parameter tensor, that then is used to trim them +accordingly during fine-tuning for a given parameter or FLOPs budget. Our +experiments show the efficacy of Sensi-BERT across different downstream tasks +including MNLI, QQP, QNLI, SST-2 and SQuAD, showing better performance at +similar or smaller parameter budget compared to various alternatives. + +
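A hedged sketch of tensor-level sensitivity ranking: the proxy used here (the sum of |weight x gradient| on one calibration batch) is an assumption for illustration, and the paper's actual sensitivity measure and trimming schedule may differ.

```python
# Hedged sketch: rank a model's parameter tensors by a simple first-order
# sensitivity proxy computed on a small calibration batch. Sensi-BERT's
# actual measure and trimming procedure may differ from this toy version.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))  # stand-in for BERT
x, y = torch.randn(8, 16), torch.randint(0, 2, (8,))
loss = nn.functional.cross_entropy(model(x), y)
loss.backward()

stats = {
    name: ((param.detach() * param.grad).abs().sum().item(), param.numel())
    for name, param in model.named_parameters()
}
# Higher score = more sensitive; low-scoring tensors are candidates for trimming
# under a given parameter or FLOPs budget.
for name, (score, numel) in sorted(stats.items(), key=lambda kv: -kv[1][0]):
    print(f"{name:10s} sensitivity={score:8.3f} numel={numel}")
```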
+
+ comment: 6 pages, 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Deanthropomorphising NLP: Can a Language Model Be Conscious? + + +
+ This work is intended as a voice in the discussion over previous claims that +a pretrained large language model (LLM) based on the Transformer model +architecture can be sentient. Such claims have been made concerning the LaMDA +model and also concerning the current wave of LLM-powered chatbots, such as +ChatGPT. This claim, if confirmed, would have serious ramifications in the +Natural Language Processing (NLP) community due to wide-spread use of similar +models. However, here we take the position that such a large language model +cannot be sentient, or conscious, and that LaMDA in particular exhibits no +advances over other similar models that would qualify it. We justify this by +analysing the Transformer architecture through Integrated Information Theory of +consciousness. We see the claims of sentience as part of a wider tendency to +use anthropomorphic language in NLP reporting. Regardless of the veracity of +the claims, we consider this an opportune moment to take stock of progress in +language modelling and consider the ethical implications of the task. In order +to make this work helpful for readers outside the NLP community, we also +present the necessary background in language modelling. + +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach to labelling a comment as hate speech involves a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment and grounding the interwoven fusion +layers that combine individual comments' text and image embeddings instead of +processing modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude with future work for multimodal solutions to deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ "It Felt Like Having a Second Mind": Investigating Human-AI + Co-creativity in Prewriting with Large Language Models + + +
+ Prewriting is the process of discovering and developing ideas before a first +draft, which requires divergent thinking and often implies unstructured +strategies such as diagramming, outlining, free-writing, etc. Although large +language models (LLMs) have been demonstrated to be useful for a variety of +tasks including creative writing, little is known about how users would +collaborate with LLMs to support prewriting. The preferred collaborative role +and initiative of LLMs during such a creativity process is also unclear. To +investigate human-LLM collaboration patterns and dynamics during prewriting, we +conducted a three-session qualitative study with 15 participants in two +creative tasks: story writing and slogan writing. The findings indicated that +during collaborative prewriting, there appears to be a three-stage iterative +Human-AI Co-creativity process that includes Ideation, Illumination, and +Implementation stages. This collaborative process champions the human in a +dominant role, in addition to mixed and shifting levels of initiative that +exist between humans and LLMs. This research also reports on collaboration +breakdowns that occur during this process, user perceptions of using existing +LLMs during Human-AI Co-creativity, and discusses design implications to +support this co-creativity process. + +
+
+ comment: Under Review; 25 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable tools for users to navigate through the vast amount of content available on platforms like daily news websites and book recommendation services. However, existing recommenders face significant challenges in understanding the content of items. Large language models (LLMs), which possess deep semantic comprehension and extensive knowledge from pretraining, have proven to be effective in various natural language processing tasks. In this study, we explore the potential of leveraging both open- and closed-source LLMs to enhance content-based recommendation. With open-source LLMs, we utilize their deep layers as content encoders, enriching the representation of content at the embedding level. For closed-source LLMs, we employ prompting techniques to enrich the training data at the token level. Through comprehensive experiments, we demonstrate the high effectiveness of both types of LLMs and show the synergistic relationship between them. Notably, we observed a significant relative improvement of up to 19.32% compared to existing state-of-the-art recommendation models. These findings highlight the immense potential of both open- and closed-source LLMs in enhancing content-based recommendation systems. We will make our code and LLM-generated data available for other researchers to reproduce our results.
+
+
+
+
+ + ♻ ☆ Playing with Words: Comparing the Vocabulary and Lexical Richness of + ChatGPT and Humans + + +
+ The introduction of Artificial Intelligence (AI) generative language models such as GPT (Generative Pre-trained Transformer) and tools such as ChatGPT has triggered a revolution that can transform how text is generated. This has many implications; for example, as AI-generated text becomes a significant fraction of all text, would this have an effect on the language capabilities of readers and also on the training of newer AI tools? Would it affect the evolution of languages? Focusing on one specific aspect of language, words: will the use of tools such as ChatGPT increase or reduce the vocabulary used or the lexical richness? This has implications for words, as those not included in AI-generated content will tend to be less and less popular and may eventually be lost. In this work, we perform an initial comparison of the vocabulary and lexical richness of ChatGPT and humans when performing the same tasks. In more detail, we use two datasets containing the answers to different types of questions answered by ChatGPT and humans, and a third dataset in which ChatGPT paraphrases sentences and questions. The analysis shows that ChatGPT tends to use fewer distinct words and lower lexical richness than humans. These results are very preliminary, and additional datasets and ChatGPT configurations have to be evaluated to extract more general conclusions. Therefore, further research is needed to understand how the use of ChatGPT and, more broadly, generative AI tools will affect the vocabulary and lexical richness of different types of text and languages.
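A minimal version of the lexical measurements involved is shown below: distinct-word counts and type-token ratio (TTR), computed on two toy text samples rather than the paper's datasets.

```python
# Minimal lexical-richness sketch: distinct-word count and type-token ratio
# (TTR) for two text samples; the paper uses larger corpora and further measures.
import re

def lexical_stats(text):
    tokens = re.findall(r"[a-z']+", text.lower())
    types = set(tokens)
    return {"tokens": len(tokens), "types": len(types),
            "ttr": len(types) / len(tokens) if tokens else 0.0}

human = "The quick brown fox jumps over the lazy dog while the cat sleeps."
model = "The fox jumps over the dog and the fox jumps over the dog again."
print("human:", lexical_stats(human))
print("model:", lexical_stats(model))
```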
+
+
+
+
+ + ♻ ☆ CARE-MI: Chinese Benchmark for Misinformation Evaluation in Maternity + and Infant Care + + +
+ The recent advances in natural language processing (NLP), have led to a new +trend of applying large language models (LLMs) to real-world scenarios. While +the latest LLMs are astonishingly fluent when interacting with humans, they +suffer from the misinformation problem by unintentionally generating factually +false statements. This can lead to harmful consequences, especially when +produced within sensitive contexts, such as healthcare. Yet few previous works +have focused on evaluating misinformation in the long-form (LF) generation of +LLMs, especially for knowledge-intensive topics. Moreover, although LLMs have +been shown to perform well in different languages, misinformation evaluation +has been mostly conducted in English. To this end, we present a benchmark, +CARE-MI, for evaluating LLM misinformation in: 1) a sensitive topic, +specifically the maternity and infant care domain; and 2) a language other than +English, namely Chinese. Most importantly, we provide an innovative paradigm +for building LF generation evaluation benchmarks that can be transferred to +other knowledge-intensive domains and low-resourced languages. Our proposed +benchmark fills the gap between the extensive usage of LLMs and the lack of +datasets for assessing the misinformation generated by these models. It +contains 1,612 expert-checked questions, accompanied with human-selected +references. Using our benchmark, we conduct extensive experiments and found +that current Chinese LLMs are far from perfect in the topic of maternity and +infant care. In an effort to minimize the reliance on human resources for +performance evaluation, we offer off-the-shelf judgment models for +automatically assessing the LF output of LLMs given benchmark questions. +Moreover, we compare potential solutions for LF generation evaluation and +provide insights for building better automated metrics. + +
+
+
+
+
+ + ♻ ☆ DocPrompt: Large-scale continue pretrain for zero-shot and few-shot + document question answering + + +
+ In this paper, we propose Docprompt for document question answering tasks, with powerful zero-shot and few-shot performance. We propose a novel weakly supervised data generation method, a novel multi-stage training method, and a novel ensemble method that combines an understanding model and a generation model. We achieve state-of-the-art performance on 4 document question answering tasks. This method greatly improves the delivery efficiency and model performance of document question answering customer projects, reducing annotation and labor costs. Our demo can be found at https://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.
+
+
+
+
+ + ♻ ☆ Exploring Large Language Models for Knowledge Graph Completion + + +
+ Knowledge graphs play a vital role in numerous artificial intelligence tasks, +yet they frequently face the issue of incompleteness. In this study, we explore +utilizing Large Language Models (LLM) for knowledge graph completion. We +consider triples in knowledge graphs as text sequences and introduce an +innovative framework called Knowledge Graph LLM (KG-LLM) to model these +triples. Our technique employs entity and relation descriptions of a triple as +prompts and utilizes the response for predictions. Experiments on various +benchmark knowledge graphs demonstrate that our method attains state-of-the-art +performance in tasks such as triple classification and relation prediction. We +also find that fine-tuning relatively smaller models (e.g., LLaMA-7B, +ChatGLM-6B) outperforms recent ChatGPT and GPT-4. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ OLISIA: a Cascade System for Spoken Dialogue State Tracking + + +
+ Though Dialogue State Tracking (DST) is a core component of spoken dialogue systems, recent work on this task mostly deals with chat corpora, disregarding the discrepancies between spoken and written language. In this paper, we propose OLISIA, a cascade system which integrates an Automatic Speech Recognition (ASR) model and a DST model. We introduce several adaptations in the ASR and DST modules to improve integration and robustness to spoken conversations. With these adaptations, our system ranked first in DSTC11 Track 3, a benchmark to evaluate spoken DST. We conduct an in-depth analysis of the results and find that normalizing the ASR outputs and adapting the DST inputs through data augmentation, along with increasing the pre-trained model size, all play an important role in reducing the performance discrepancy between written and spoken conversations.
+
+
+
+
+ + ♻ ☆ Improving Non-autoregressive Translation Quality with Pretrained + Language Model, Embedding Distillation and Upsampling Strategy for CTC + + +
+ Non-autoregressive approaches aim to improve the inference speed of translation models, particularly those that generate output in a one-pass forward manner. However, these approaches often suffer from a significant drop in translation quality compared to autoregressive models. This paper introduces a series of innovative techniques to enhance the translation quality of Non-Autoregressive Translation (NAT) models while maintaining a substantial acceleration in inference speed. We propose fine-tuning Pretrained Multilingual Language Models (PMLMs) with the CTC loss to train NAT models effectively. Furthermore, we adopt the MASK insertion scheme for up-sampling instead of token duplication, and we present an embedding distillation method to further enhance performance. In our experiments, our model outperforms the baseline autoregressive model (Transformer base) on multiple datasets, including WMT'14 DE$\leftrightarrow$EN, WMT'16 RO$\leftrightarrow$EN, and IWSLT'14 DE$\leftrightarrow$EN. Notably, our model achieves better performance than the baseline autoregressive model on the IWSLT'14 En$\leftrightarrow$De and WMT'16 En$\leftrightarrow$Ro datasets, even without using distillation data during training. It is worth highlighting that on the IWSLT'14 DE$\rightarrow$EN dataset, our model achieves an impressive BLEU score of 39.59, setting a new state-of-the-art performance. Additionally, our model exhibits a remarkable speed improvement of 16.35 times compared to the autoregressive model.
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue + Evaluation ACL2023 + + +
+ Existing reference-free turn-level evaluation metrics for chatbots +inadequately capture the interaction between the user and the system. +Consequently, they often correlate poorly with human evaluations. To address +this issue, we propose a novel model-agnostic approach that leverages +Conditional Pointwise Mutual Information (C-PMI) to measure the turn-level +interaction between the system and the user based on a given evaluation +dimension. Experimental results on the widely used FED dialogue evaluation +dataset demonstrate that our approach significantly improves the correlation +with human judgment compared with existing evaluation systems. By replacing the +negative log-likelihood-based scorer with our proposed C-PMI scorer, we achieve +a relative 60.5% higher Spearman correlation on average for the FED evaluation +metric. Our code is publicly available at https://github.com/renll/C-PMI. + +
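The underlying quantity is a pointwise mutual information between response and context estimated from language-model probabilities. The sketch below computes a plain PMI-style turn score with GPT-2 as the scorer; C-PMI additionally conditions on the evaluation dimension as described in the paper, which is not reproduced here.

```python
# Sketch of a PMI-style turn-level score from causal-LM log-probabilities:
# score = log p(response | context) - log p(response). GPT-2 is used only as a
# convenient scorer; the C-PMI formulation conditions on more than this.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def response_logprob(response, context=""):
    """Sum of token log-probs of `response`, optionally conditioned on `context`."""
    ctx_ids = tok(context, return_tensors="pt").input_ids if context else None
    resp_ids = tok(response, return_tensors="pt").input_ids
    ids = torch.cat([ctx_ids, resp_ids], dim=1) if ctx_ids is not None else resp_ids
    with torch.no_grad():
        logits = lm(ids).logits.log_softmax(-1)
    start = ids.shape[1] - resp_ids.shape[1]          # first response token position
    lp = 0.0
    for pos in range(start, ids.shape[1]):
        if pos == 0:
            continue          # the very first token of an unconditioned text can't be scored
        lp += logits[0, pos - 1, ids[0, pos]].item()
    return lp

context = "User: I just moved to a new city and I don't know anyone."
response = "That sounds tough. Have you tried joining any local clubs?"
pmi = response_logprob(response, context) - response_logprob(response)
print(f"PMI-style score: {pmi:.2f}")
```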
+
+ comment: Published at ACL2023 DialDoc Workshop; Updated Results +
+
+
+
+
+ + ♻ ☆ Is the U.S. Legal System Ready for AI's Challenges to Human Values? + + +
+ Our interdisciplinary study investigates how effectively U.S. laws confront the challenges posed by Generative AI to human values. Through an analysis of diverse hypothetical scenarios crafted during an expert workshop, we have identified notable gaps and uncertainties within the existing legal framework regarding the protection of fundamental values, such as privacy, autonomy, dignity, diversity, equity, and physical/mental well-being. Constitutional and civil rights, it appears, may not provide sufficient protection against AI-generated discriminatory outputs. Furthermore, even if we exclude the liability shield provided by Section 230, proving causation for defamation and product liability claims is a challenging endeavor due to the intricate and opaque nature of AI systems. To address the unique and unforeseeable threats posed by Generative AI, we advocate for legal frameworks that evolve to recognize new threats and provide proactive, auditable guidelines to industry stakeholders. Addressing these issues requires deep interdisciplinary collaborations to identify harms, values, and mitigation strategies.
+
+ comment: 26 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Foundation Models utilizing extensive image-text paired data have demonstrated unprecedented image-text association capabilities, achieving remarkable results across various downstream tasks. A critical challenge is how to make use of existing large-scale pre-trained VLMs, which are trained on common objects, to perform the domain-specific transfer for accomplishing domain-related downstream tasks. In this paper, we propose a new framework that includes the Domain Foundation Model (DFM), bridging the gap between the General Foundation Model (GFM) and domain-specific downstream tasks. Moreover, we present an image-text paired dataset in the field of remote sensing (RS), RS5M, which has 5 million RS images with English descriptions. The dataset is obtained by filtering publicly available image-text paired datasets and captioning label-only RS datasets with a pre-trained VLM. These constitute the first large-scale RS image-text paired dataset. Additionally, we tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the DFM. Experimental results show that our proposed dataset is highly effective for various tasks, improving upon the baseline by 8% to 16% in zero-shot classification tasks, and obtaining good results in both Vision-Language Retrieval and Semantic Localization tasks. The dataset is available at https://github.com/om-ai-lab/RS5M
+
+ comment: RS5M dataset v4 +
+
+
+
+
+ + ♻ ☆ Unveiling Gender Bias in Terms of Profession Across LLMs: Analyzing and + Addressing Sociological Implications + + +
+ Gender bias in artificial intelligence (AI) and natural language processing +has garnered significant attention due to its potential impact on societal +perceptions and biases. This research paper aims to analyze gender bias in +Large Language Models (LLMs) with a focus on multiple comparisons between GPT-2 +and GPT-3.5, some prominent language models, to better understand its +implications. Through a comprehensive literature review, the study examines +existing research on gender bias in AI language models and identifies gaps in +the current knowledge. The methodology involves collecting and preprocessing +data from GPT-2 and GPT-3.5, and employing in-depth quantitative analysis +techniques to evaluate gender bias in the generated text. The findings shed +light on gendered word associations, language usage, and biased narratives +present in the outputs of these Large Language Models. The discussion explores +the ethical implications of gender bias and its potential consequences on +social perceptions and marginalized communities. Additionally, the paper +presents strategies for reducing gender bias in LLMs, including algorithmic +approaches and data augmentation techniques. The research highlights the +importance of interdisciplinary collaborations and the role of sociological +studies in mitigating gender bias in AI models. By addressing these issues, we +can pave the way for more inclusive and unbiased AI systems that have a +positive impact on society. + +
+
+
+
+
+ + ♻ ☆ AspectCSE: Sentence Embeddings for Aspect-based Semantic Textual + Similarity Using Contrastive Learning and Structured Knowledge + + +
+ Generic sentence embeddings provide a coarse-grained approximation of +semantic textual similarity but ignore specific aspects that make texts +similar. Conversely, aspect-based sentence embeddings provide similarities +between texts based on certain predefined aspects. Thus, similarity predictions +of texts are more targeted to specific requirements and more easily +explainable. In this paper, we present AspectCSE, an approach for aspect-based +contrastive learning of sentence embeddings. Results indicate that AspectCSE +achieves an average improvement of 3.97% on information retrieval tasks across +multiple aspects compared to the previous best results. We also propose using +Wikidata knowledge graph properties to train models of multi-aspect sentence +embeddings in which multiple specific aspects are simultaneously considered +during similarity predictions. We demonstrate that multi-aspect embeddings +outperform single-aspect embeddings on aspect-specific information retrieval +tasks. Finally, we examine the aspect-based sentence embedding space and +demonstrate that embeddings of semantically similar aspect labels are often +close, even without explicit similarity training between different aspect +labels. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
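For the AspectCSE entry above, the abstract describes aspect-based contrastive learning of sentence embeddings without giving the exact objective. A minimal sketch of a generic in-batch contrastive (InfoNCE-style) loss, assuming pairs of sentences that share the same aspect; the temperature and normalisation are illustrative choices, not the paper's:

```python
import torch
import torch.nn.functional as F

def info_nce(anchor, positive, temperature=0.05):
    """In-batch contrastive loss over L2-normalised sentence embeddings.
    anchor, positive: (batch, dim) tensors from sentences sharing the same aspect."""
    a = F.normalize(anchor, dim=-1)
    p = F.normalize(positive, dim=-1)
    logits = a @ p.t() / temperature        # (batch, batch) similarity matrix
    labels = torch.arange(a.size(0), device=a.device)
    return F.cross_entropy(logits, labels)  # diagonal entries are the positives
```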
+
+
+
+
+ + ♻ ☆ Efficient Benchmarking (of Language Models) + + +
+ The increasing versatility of language models (LMs) has given rise to a new +class of benchmarks that comprehensively assess a broad range of capabilities. +Such benchmarks are associated with massive computational costs, reaching +thousands of GPU hours per model. However, the efficiency aspect of these +evaluation efforts has raised little discussion in the literature. In this work, +we present the problem of Efficient Benchmarking, namely intelligently reducing +the computation costs of LM evaluation without compromising reliability. Using +the HELM benchmark as a test case, we investigate how different benchmark design +choices affect the computation-reliability tradeoff. We propose to evaluate the +reliability of such decisions by using a new measure, Decision Impact on +Reliability (DIoR for short). We find, for example, that the current leader on HELM +may change by merely removing a low-ranked model from the benchmark, and observe +that a handful of examples suffice to obtain the correct benchmark ranking. +Conversely, a slightly different choice of HELM scenarios varies the ranking widely. +Based on our findings, we outline a set of concrete recommendations for more +efficient benchmark design and utilization practices, leading to dramatic cost +savings with minimal loss of benchmark reliability, often reducing computation +by x100 or more. +
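The DIoR measure itself is not defined in the abstract above. As a loosely related, hypothetical illustration of the kind of reliability check it motivates, the sketch below estimates how often the benchmark leader flips when scores are recomputed on random subsets of examples; the array layout, subset fraction, and number of resamples are assumptions:

```python
import numpy as np

def ranking(scores):
    """Return model indices sorted best-to-worst by score."""
    return list(np.argsort(-np.asarray(scores)))

def leader_flip_rate(per_example_scores, n_subsets=100, subset_frac=0.1, seed=0):
    """Fraction of random example subsets whose top-ranked model differs from the
    full benchmark's leader. per_example_scores: (n_models, n_examples) array."""
    rng = np.random.default_rng(seed)
    scores = np.asarray(per_example_scores)
    full_leader = ranking(scores.mean(axis=1))[0]
    n_examples = scores.shape[1]
    k = max(1, int(subset_frac * n_examples))
    flips = 0
    for _ in range(n_subsets):
        idx = rng.choice(n_examples, size=k, replace=False)
        if ranking(scores[:, idx].mean(axis=1))[0] != full_leader:
            flips += 1
    return flips / n_subsets
```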
+
+
+
+
+ + ♻ ☆ Prompting GPT-3.5 for Text-to-SQL with De-semanticization and Skeleton + Retrieval + + +
+ Text-to-SQL is a task that converts a natural language question into a +structured query language (SQL) to retrieve information from a database. Large +language models (LLMs) work well in natural language generation tasks, but they +are not specifically pre-trained to understand the syntax and semantics of SQL +commands. In this paper, we propose an LLM-based framework for Text-to-SQL +which retrieves helpful demonstration examples to prompt LLMs. However, +questions with different database schemas can vary widely, even if the +intentions behind them are similar and the corresponding SQL queries exhibit +similarities. Consequently, it becomes crucial to identify the appropriate SQL +demonstrations that align with our requirements. We design a de-semanticization +mechanism that extracts question skeletons, allowing us to retrieve similar +examples based on their structural similarity. We also model the relationships +between question tokens and database schema items (i.e., tables and columns) to +filter out schema-related information. Our framework adapts the range of the +database schema in prompts to balance length and valuable information. A +fallback mechanism allows for a more detailed schema to be provided if the +generated SQL query fails. Our framework outperforms state-of-the-art models and +demonstrates strong generalization ability on three cross-domain Text-to-SQL +benchmarks. +
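To make the de-semanticization idea in the entry above concrete, here is a deliberately simplified, hypothetical sketch that masks schema-mentioning tokens and literal values to obtain a question skeleton for retrieval; the paper's mechanism, which models token-schema relationships, is more involved than this placeholder substitution:

```python
def question_skeleton(question: str, schema_terms: set, values: set) -> str:
    """Replace tokens that mention schema items or literal values with placeholders,
    leaving only the structural skeleton used to retrieve similar demonstrations."""
    tokens = question.lower().rstrip("?").split()
    out = []
    for tok in tokens:
        if tok in schema_terms:
            out.append("[COL]")
        elif tok in values or tok.isdigit():
            out.append("[VAL]")
        else:
            out.append(tok)
    return " ".join(out)

# e.g. question_skeleton("How many singers are older than 30?", {"singers"}, set())
# -> "how many [COL] are older than [VAL]"
```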
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 146 + +
+
+
+ + ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have created a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, thereby enabling LLMs to understand point clouds and offering +a new avenue beyond 2D visual data. PointLLM processes colored object point +clouds with human instructions and generates contextually appropriate +responses, illustrating its grasp of point clouds and common sense. +Specifically, it leverages a point cloud encoder with a powerful LLM to +effectively fuse geometric, appearance, and linguistic information. We collect +a novel dataset comprising 660K simple and 70K complex point-text instruction +pairs to enable a two-stage training strategy: initially aligning latent spaces +and subsequently instruction-tuning the unified model. To rigorously evaluate +our model's perceptual abilities and its generalization capabilities, we +establish two benchmarks: Generative 3D Object Classification and 3D Object +Captioning, assessed through three different methods, including human +evaluation, GPT-4/ChatGPT evaluation, and traditional metrics. Experiment +results show that PointLLM demonstrates superior performance over existing 2D +baselines. Remarkably, in human-evaluated object captioning tasks, PointLLM +outperforms human annotators in over 50% of the samples. Codes, datasets, and +benchmarks are available at https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 19 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ☆ StyleInV: A Temporal Style Modulated Inversion Network for Unconditional + Video Generation ICCV 2023 + + +
+ Unconditional video generation is a challenging task that involves +synthesizing high-quality videos that are both coherent and of extended +duration. To address this challenge, researchers have used pretrained StyleGAN +image generators for high-quality frame synthesis and focused on motion +generator design. The motion generator is trained in an autoregressive manner +using heavy 3D convolutional discriminators to ensure motion coherence during +video generation. In this paper, we introduce a novel motion generator design +that uses a learning-based inversion network for GAN. The encoder in our method +captures rich and smooth priors from encoding images to latents, and given the +latent of an initially generated frame as guidance, our method can generate +smooth future latent by modulating the inversion encoder temporally. Our method +enjoys the advantage of sparse training and naturally constrains the generation +space of our motion generator with the inversion network guided by the initial +frame, eliminating the need for heavy discriminators. Moreover, our method +supports style transfer with simple fine-tuning when the encoder is paired with +a pretrained StyleGAN generator. Extensive experiments conducted on various +benchmarks demonstrate the superiority of our method in generating long and +high-resolution videos with decent single-frame quality and temporal +consistency. + +
+
+ comment: ICCV 2023. Code: https://github.com/johannwyh/StyleInV Project page: + https://www.mmlab-ntu.com/project/styleinv/index.html +
+
+
+
+
+ + ☆ Fine-Grained Cross-View Geo-Localization Using a Correlation-Aware + Homography Estimator + + +
+ In this paper, we introduce a novel approach to fine-grained cross-view +geo-localization. Our method aligns a warped ground image with a corresponding +GPS-tagged satellite image covering the same area using homography estimation. +We first employ a differentiable spherical transform, adhering to geometric +principles, to accurately align the perspective of the ground image with the +satellite map. This transformation effectively places ground and aerial images +in the same view and on the same plane, reducing the task to an image alignment +problem. To address challenges such as occlusion, small overlapping range, and +seasonal variations, we propose a robust correlation-aware homography estimator +to align similar parts of the transformed ground image with the satellite +image. Our method achieves sub-pixel resolution and meter-level GPS accuracy by +mapping the center point of the transformed ground image to the satellite image +using a homography matrix and determining the orientation of the ground camera +using a point above the central axis. Operating at a speed of 30 FPS, our +method outperforms state-of-the-art techniques, reducing the mean metric +localization error by 21.3% and 32.4% in same-area and cross-area +generalization tasks on the VIGOR benchmark, respectively, and by 34.4% on the +KITTI benchmark in same-area evaluation. + +
+
+ comment: 19 pages. Reducing the cross-view geo-localization problem to a 2D + image alignment problem by utilizing BEV transformation, and completing the + alignment process with a correlation-aware homography estimator. Code: + https://github.com/xlwangDev/HC-Net +
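The final localisation step described in the entry above maps the centre of the transformed ground image into the satellite image with the estimated homography. A minimal sketch of that projection step; the function name and the follow-on geo-referencing comment are assumptions, not the paper's code:

```python
import numpy as np

def project_center(H: np.ndarray, width: int, height: int):
    """Map the centre pixel of the transformed ground image into satellite-image
    coordinates using a 3x3 homography H (illustrative of the localisation step)."""
    center = np.array([width / 2.0, height / 2.0, 1.0])
    u, v, w = H @ center
    return u / w, v / w   # pixel location in the satellite image

# the matched satellite pixel can then be converted to GPS via the image's geo-reference
```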
+
+
+
+
+ + ☆ InterDiff: Generating 3D Human-Object Interactions with Physics-Informed + Diffusion ICCV 2023 + + +
+ This paper addresses a novel task of anticipating 3D human-object +interactions (HOIs). Most existing research on HOI synthesis lacks +comprehensive whole-body interactions with dynamic objects, e.g., often limited +to manipulating small or static objects. Our task is significantly more +challenging, as it requires modeling dynamic objects with various shapes, +capturing whole-body motion, and ensuring physically valid interactions. To +this end, we propose InterDiff, a framework comprising two key steps: (i) +interaction diffusion, where we leverage a diffusion model to encode the +distribution of future human-object interactions; (ii) interaction correction, +where we introduce a physics-informed predictor to correct denoised HOIs in a +diffusion step. Our key insight is to inject prior knowledge that the +interactions under reference with respect to contact points follow a simple +pattern and are easily predictable. Experiments on multiple human-object +interaction datasets demonstrate the effectiveness of our method for this task, +capable of producing realistic, vivid, and remarkably long-term 3D HOI +predictions. + +
+
+ comment: ICCV 2023; Project Page: https://sirui-xu.github.io/InterDiff/ +
+
+
+
+
+ + ☆ PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic + Occupancy Prediction + + +
+ Semantic segmentation in autonomous driving has been undergoing an evolution +from sparse point segmentation to dense voxel segmentation, where the objective +is to predict the semantic occupancy of each voxel in the concerned 3D space. +The dense nature of the prediction space has rendered existing efficient +2D-projection-based methods (e.g., bird's eye view, range view, etc.) +ineffective, as they can only describe a subspace of the 3D scene. To address +this, we propose a cylindrical tri-perspective view to represent point clouds +effectively and comprehensively and a PointOcc model to process them +efficiently. Considering the distance distribution of LiDAR point clouds, we +construct the tri-perspective view in the cylindrical coordinate system for +more fine-grained modeling of nearer areas. We employ spatial group pooling to +maintain structural details during projection and adopt 2D backbones to +efficiently process each TPV plane. Finally, we obtain the features of each +point by aggregating its projected features on each of the processed TPV planes +without the need for any post-processing. Extensive experiments on both 3D +occupancy prediction and LiDAR segmentation benchmarks demonstrate that the +proposed PointOcc achieves state-of-the-art performance with much faster speed. +Specifically, despite only using LiDAR, PointOcc significantly outperforms all +other methods, including multi-modal methods, with a large margin on the +OpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc. + +
+
+ comment: Code is available at https://github.com/wzzheng/PointOcc +
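The entry above builds its tri-perspective view in cylindrical coordinates to model nearer regions more finely. A small sketch of the Cartesian-to-cylindrical conversion that such a representation starts from; axis conventions are assumed:

```python
import numpy as np

def to_cylindrical(points_xyz: np.ndarray) -> np.ndarray:
    """Convert LiDAR points (N, 3) from Cartesian (x, y, z) to cylindrical
    (rho, phi, z), the coordinate system used to build the tri-perspective view."""
    x, y, z = points_xyz[:, 0], points_xyz[:, 1], points_xyz[:, 2]
    rho = np.sqrt(x ** 2 + y ** 2)    # radial distance from the sensor
    phi = np.arctan2(y, x)            # azimuth angle in [-pi, pi]
    return np.stack([rho, phi, z], axis=1)
```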
+
+
+
+
+ + ☆ EMDB: The Electromagnetic Database of Global 3D Human Pose and Shape in + the Wild ICCV 2023 + + +
+ We present EMDB, the Electromagnetic Database of Global 3D Human Pose and +Shape in the Wild. EMDB is a novel dataset that contains high-quality 3D SMPL +pose and shape parameters with global body and camera trajectories for +in-the-wild videos. We use body-worn, wireless electromagnetic (EM) sensors and +a hand-held iPhone to record a total of 58 minutes of motion data, distributed +over 81 indoor and outdoor sequences and 10 participants. Together with +accurate body poses and shapes, we also provide global camera poses and body +root trajectories. To construct EMDB, we propose a multi-stage optimization +procedure, which first fits SMPL to the 6-DoF EM measurements and then refines +the poses via image observations. To achieve high-quality results, we leverage +a neural implicit avatar model to reconstruct detailed human surface geometry +and appearance, which allows for improved alignment and smoothness via a dense +pixel-level objective. Our evaluations, conducted with a multi-view volumetric +capture system, indicate that EMDB has an expected accuracy of 2.3 cm +positional and 10.6 degrees angular error, surpassing the accuracy of previous +in-the-wild datasets. We evaluate existing state-of-the-art monocular RGB +methods for camera-relative and global pose estimation on EMDB. EMDB is +publicly available under https://ait.ethz.ch/emdb + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Language-Conditioned Path Planning + + +
+ Contact is at the core of robotic manipulation. At times, it is desired (e.g. +manipulation and grasping), and at times, it is harmful (e.g. when avoiding +obstacles). However, traditional path planning algorithms focus solely on +collision-free paths, limiting their applicability in contact-rich tasks. To +address this limitation, we propose the domain of Language-Conditioned Path +Planning, where contact-awareness is incorporated into the path planning +problem. As a first step in this domain, we propose Language-Conditioned +Collision Functions (LACO) a novel approach that learns a collision function +using only a single-view image, language prompt, and robot configuration. LACO +predicts collisions between the robot and the environment, enabling flexible, +conditional path planning without the need for manual object annotations, point +cloud data, or ground-truth object meshes. In both simulation and the real +world, we demonstrate that LACO can facilitate complex, nuanced path plans that +allow for interaction with objects that are safe to collide, rather than +prohibiting any collision. + +
+
+ comment: Conference on Robot Learning, 2023 +
+
+
+
+
+ + ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ☆ TouchStone: Evaluating Vision-Language Models by Language Models + + +
+ Large vision-language models (LVLMs) have recently witnessed rapid +advancements, exhibiting a remarkable capacity for perceiving, understanding, +and processing visual information by connecting visual receptor with large +language models (LLMs). However, current assessments mainly focus on +recognizing and reasoning abilities, lacking direct evaluation of +conversational skills and neglecting visual storytelling abilities. In this +paper, we propose an evaluation method that uses strong LLMs as judges to +comprehensively evaluate the various abilities of LVLMs. Firstly, we construct +a comprehensive visual dialogue dataset TouchStone, consisting of open-world +images and questions, covering five major categories of abilities and 27 +subtasks. This dataset not only covers fundamental recognition and +comprehension but also extends to literary creation. Secondly, by integrating +detailed image annotations we effectively transform the multimodal input +content into a form understandable by LLMs. This enables us to employ advanced +LLMs for directly evaluating the quality of the multimodal dialogue without +requiring human intervention. Through validation, we demonstrate that powerful +LVLMs, such as GPT-4, can effectively score dialogue quality by leveraging +their textual capabilities alone, aligning with human preferences. We hope our +work can serve as a touchstone for LVLMs' evaluation and pave the way for +building stronger LVLMs. The evaluation code is available at +https://github.com/OFA-Sys/TouchStone. + +
+
+ comment: https://github.com/OFA-Sys/TouchStone +
+
+
+
+
+ + ☆ Text2Scene: Text-driven Indoor Scene Stylization with Part-aware Details CVPR 2023 + + +
+ We propose Text2Scene, a method to automatically create realistic textures +for virtual scenes composed of multiple objects. Guided by a reference image +and text descriptions, our pipeline adds detailed texture on labeled 3D +geometries in the room such that the generated colors respect the hierarchical +structure or semantic parts that are often composed of similar materials. +Instead of applying flat stylization on the entire scene at a single step, we +obtain weak semantic cues from geometric segmentation, which are further +clarified by assigning initial colors to segmented parts. Then we add texture +details for individual objects such that their projections on image space +exhibit feature embedding aligned with the embedding of the input. The +decomposition makes the entire pipeline tractable to a moderate amount of +computation resources and memory. As our framework utilizes the existing +resources of image and text embedding, it does not require dedicated datasets +with high-quality textures designed by skillful artists. To the best of our +knowledge, it is the first practical and scalable approach that can create +detailed and realistic textures of the desired style that maintain structural +context for scenes with multiple objects. + +
+
+ comment: Accepted to CVPR 2023 +
+
+
+
+
+ + ☆ SportsSloMo: A New Benchmark and Baselines for Human-centric Video Frame + Interpolation + + +
+ Human-centric video frame interpolation has great potential for improving +people's entertainment experiences and finding commercial applications in the +sports analysis industry, e.g., synthesizing slow-motion videos. Although there +are multiple benchmark datasets available in the community, none of them is +dedicated for human-centric scenarios. To bridge this gap, we introduce +SportsSloMo, a benchmark consisting of more than 130K video clips and 1M video +frames of high-resolution ($\geq$720p) slow-motion sports videos crawled from +YouTube. We re-train several state-of-the-art methods on our benchmark, and the +results show a decrease in their accuracy compared to other datasets. It +highlights the difficulty of our benchmark and suggests that it poses +significant challenges even for the best-performing methods, as human bodies +are highly deformable and occlusions are frequent in sports videos. To improve +the accuracy, we introduce two loss terms considering the human-aware priors, +where we add auxiliary supervision to panoptic segmentation and human keypoints +detection, respectively. The loss terms are model agnostic and can be easily +plugged into any video frame interpolation approaches. Experimental results +validate the effectiveness of our proposed loss terms, leading to consistent +performance improvement over 5 existing models, which establish strong baseline +models on our benchmark. The dataset and code can be found at: +https://neu-vi.github.io/SportsSlomo/. + +
+
+ comment: Project Page: https://neu-vi.github.io/SportsSlomo/ +
+
+
+
+
+ + ☆ Holistic Processing of Colour Images Using Novel Quaternion-Valued + Wavelets on the Plane + + +
+ We investigate the applicability of quaternion-valued wavelets on the plane +to holistic colour image processing. We present a methodology for decomposing +and reconstructing colour images using quaternionic wavelet filters associated +to recently developed quaternion-valued wavelets on the plane. We consider +compression, enhancement, segmentation, and denoising techniques to demonstrate +quaternion-valued wavelets as a promising tool for holistic colour image +processing. + +
+
+
+
+
+ + ☆ Self-pruning Graph Neural Network for Predicting Inflammatory Disease + Activity in Multiple Sclerosis from Brain MR Images + + +
+ Multiple Sclerosis (MS) is a severe neurological disease characterized by +inflammatory lesions in the central nervous system. Hence, predicting +inflammatory disease activity is crucial for disease assessment and treatment. +However, MS lesions can occur throughout the brain and vary in shape, size and +total count among patients. The high variance in lesion load and locations +makes it challenging for machine learning methods to learn a globally effective +representation of whole-brain MRI scans to assess and predict disease. +Technically it is non-trivial to incorporate essential biomarkers such as +lesion load or spatial proximity. Our work represents the first attempt to +utilize graph neural networks (GNN) to aggregate these biomarkers for a novel +global representation. We propose a two-stage MS inflammatory disease activity +prediction approach. First, a 3D segmentation network detects lesions, and a +self-supervised algorithm extracts their image features. Second, the detected +lesions are used to build a patient graph. The lesions act as nodes in the +graph and are initialized with image features extracted in the first stage. +Finally, the lesions are connected based on their spatial proximity and the +inflammatory disease activity prediction is formulated as a graph +classification task. Furthermore, we propose a self-pruning strategy to +auto-select the most critical lesions for prediction. Our proposed method +outperforms the existing baseline by a large margin (AUCs of 0.67 vs. 0.61 and +0.66 vs. 0.60 for one-year and two-year inflammatory disease activity, +respectively). Finally, our proposed method enjoys inherent explainability by +assigning an importance score to each lesion for the overall prediction. Code +is available at https://github.com/chinmay5/ms_ida.git + +
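As a rough sketch of the graph-construction step described above (one node per detected lesion, edges between spatially close lesions), assuming lesion centroids in millimetres and a hypothetical distance threshold; the paper's exact connectivity rule and self-pruning strategy are not reproduced here:

```python
import numpy as np

def lesion_graph(centroids: np.ndarray, features: np.ndarray, radius: float = 20.0):
    """Build a patient graph: one node per detected lesion, edges between lesions
    whose centroids are closer than `radius`. Returns (node_features, edge_index)."""
    diff = centroids[:, None, :] - centroids[None, :, :]
    dist = np.linalg.norm(diff, axis=-1)
    src, dst = np.nonzero((dist < radius) & (dist > 0))   # no self-loops
    edge_index = np.stack([src, dst])                      # (2, num_edges)
    return features, edge_index
```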
+
+
+
+
+ + ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very +promising class of generative models, achieving high performance in natural +image generation. However, their performance relative to non-natural images, +like radar-based satellite data, remains largely unknown. Generating large +amounts of synthetic (and especially labelled) satellite data is crucial to +implement deep-learning approaches for the processing and analysis of +(interferometric) satellite aperture radar data. Here, we leverage PDMs to +generate several radar-based satellite image datasets. We show that PDMs +succeed in generating images with complex and realistic structures, but that +sampling time remains an issue. Indeed, accelerated sampling strategies, which +work well on simple image datasets like MNIST, fail on our radar datasets. We +provide a simple and versatile open-source codebase, +https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation, to train, sample and +evaluate PDMs using any dataset on a single GPU. +
+
+
+
+
+ + ☆ Coarse-to-Fine Amodal Segmentation with Shape Prior ICCV 2023 + + +
+ Amodal object segmentation is a challenging task that involves segmenting +both visible and occluded parts of an object. In this paper, we propose a novel +approach, called Coarse-to-Fine Segmentation (C2F-Seg), that addresses this +problem by progressively modeling the amodal segmentation. C2F-Seg initially +reduces the learning space from the pixel-level image space to the +vector-quantized latent space. This enables us to better handle long-range +dependencies and learn a coarse-grained amodal segment from visual features and +visible segments. However, this latent space lacks detailed information about +the object, which makes it difficult to provide a precise segmentation +directly. To address this issue, we propose a convolution refine module to +inject fine-grained information and provide a more precise amodal object +segmentation based on visual features and coarse-predicted segmentation. To +help the studies of amodal object segmentation, we create a synthetic amodal +dataset, named as MOViD-Amodal (MOViD-A), which can be used for both image and +video amodal object segmentation. We extensively evaluate our model on two +benchmark datasets: KINS and COCO-A. Our empirical results demonstrate the +superiority of C2F-Seg. Moreover, we exhibit the potential of our approach for +video amodal object segmentation tasks on FISHBOWL and our proposed MOViD-A. +Project page at: http://jianxgao.github.io/C2F-Seg. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ BTSeg: Barlow Twins Regularization for Domain Adaptation in Semantic + Segmentation + + +
+ Semantic image segmentation is a critical component in many computer vision +systems, such as autonomous driving. In such applications, adverse conditions +(heavy rain, night time, snow, extreme lighting) on the one hand pose specific +challenges, yet are typically underrepresented in the available datasets. +Generating more training data is cumbersome and expensive, and the process +itself is error-prone due to the inherent aleatoric uncertainty. To address +this challenging problem, we propose BTSeg, which exploits image-level +correspondences as weak supervision signal to learn a segmentation model that +is agnostic to adverse conditions. To this end, our approach uses the Barlow +twins loss from the field of unsupervised learning and treats images taken at +the same location but under different adverse conditions as "augmentations" of +the same unknown underlying base image. This allows the training of a +segmentation model that is robust to appearance changes introduced by different +adverse conditions. We evaluate our approach on ACDC and the new challenging +ACG benchmark to demonstrate its robustness and generalization capabilities. +Our approach performs favorably when compared to the current state-of-the-art +methods, while also being simpler to implement and train. The code will be +released upon acceptance. + +
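The entry above reuses the Barlow Twins objective, treating co-located images under different adverse conditions as "augmentations" of each other. A compact sketch of the standard Barlow Twins loss; how BTSeg applies it to dense segmentation features, and the weighting constant, are not specified in the abstract:

```python
import torch

def barlow_twins_loss(z_a, z_b, lam: float = 5e-3):
    """Barlow Twins redundancy-reduction loss between embeddings of the same scene
    under two different conditions. z_a, z_b: (batch, dim) tensors."""
    n, _ = z_a.shape
    z_a = (z_a - z_a.mean(0)) / (z_a.std(0) + 1e-6)
    z_b = (z_b - z_b.mean(0)) / (z_b.std(0) + 1e-6)
    c = (z_a.t() @ z_b) / n                                  # (dim, dim) cross-correlation
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()           # push diagonal towards 1
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()  # decorrelate the rest
    return on_diag + lam * off_diag
```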
+
+
+
+
+ + ☆ Multiscale Residual Learning of Graph Convolutional Sequence Chunks for + Human Motion Prediction + + +
+ A new method is proposed for human motion prediction by learning temporal and +spatial dependencies. Recently, multiscale graphs have been developed to model +the human body at higher abstraction levels, resulting in more stable motion +prediction. Current methods however predetermine scale levels and combine +spatially proximal joints to generate coarser scales based on human priors, +even though movement patterns in different motion sequences vary and do not +fully comply with a fixed graph of spatially connected joints. Another problem +with graph convolutional methods is mode collapse, in which predicted poses +converge around a mean pose with no discernible movements, particularly in +long-term predictions. To tackle these issues, we propose ResChunk, an +end-to-end network which explores dynamically correlated body components based +on the pairwise relationships between all joints in individual sequences. +ResChunk is trained to learn the residuals between target sequence chunks in an +autoregressive manner to enforce the temporal connectivities between +consecutive chunks. It is hence a sequence-to-sequence prediction network which +considers dynamic spatio-temporal features of sequences at multiple levels. Our +experiments on two challenging benchmark datasets, CMU Mocap and Human3.6M, +demonstrate that our proposed method is able to effectively model the sequence +information for motion prediction and outperform other techniques to set a new +state-of-the-art. Our code is available at +https://github.com/MohsenZand/ResChunk. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models + + +
+ Zero-shot referring image segmentation is a challenging task because it aims +to find an instance segmentation mask based on the given referring +descriptions, without training on this type of paired data. Current zero-shot +methods mainly focus on using pre-trained discriminative models (e.g., CLIP). +However, we have observed that generative models (e.g., Stable Diffusion) have +potentially understood the relationships between various visual elements and +text descriptions, which are rarely investigated in this task. In this work, we +introduce a novel Referring Diffusional segmentor (Ref-Diff) for this task, +which leverages the fine-grained multi-modal information from generative +models. We demonstrate that without a proposal generator, a generative model +alone can achieve comparable performance to existing SOTA weakly-supervised +models. When we combine both generative and discriminative models, our Ref-Diff +outperforms these competing methods by a significant margin. This indicates +that generative models are also beneficial for this task and can complement +discriminative models for better referring segmentation. Our code is publicly +available at https://github.com/kodenii/Ref-Diff. + +
+
+
+
+
+ + ☆ Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation + Using only Images ICCV 2023 + + +
+ Generating 3D faces from textual descriptions has a multitude of +applications, such as gaming, movies, and robotics. Recent progress has +demonstrated the success of unconditional 3D face generation and text-to-3D +shape generation. However, due to the limited text-3D face data pairs, +text-driven 3D face generation remains an open problem. In this paper, we +propose a text-guided 3D face generation method, referred to as TG-3DFace, for +generating realistic 3D faces using text guidance. Specifically, we adopt an +unconditional 3D face generation framework and equip it with text conditions, +which learns the text-guided 3D face generation with only text-2D face data. On +top of that, we propose two text-to-face cross-modal alignment techniques, +including the global contrastive learning and the fine-grained alignment +module, to facilitate high semantic consistency between generated 3D faces and +input texts. Besides, we present directional classifier guidance during the +inference process, which encourages creativity for out-of-domain generations. +Compared to the existing methods, TG-3DFace creates more realistic and +aesthetically pleasing 3D faces, boosting 9% multi-view consistency (MVIC) over +Latent3D. The rendered face images generated by TG-3DFace achieve higher FID +and CLIP score than text-to-2D face/image generation models, demonstrating our +superiority in generating realistic and semantic-consistent textures. +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Unsupervised CT Metal Artifact Reduction by Plugging Diffusion Priors in + Dual Domains + + +
+ During the process of computed tomography (CT), metallic implants often cause +disruptive artifacts in the reconstructed images, impeding accurate diagnosis. +Several supervised deep learning-based approaches have been proposed for +reducing metal artifacts (MAR). However, these methods heavily rely on training +with simulated data, as obtaining paired metal artifact CT and clean CT data in +clinical settings is challenging. This limitation can lead to decreased +performance when applying these methods in clinical practice. Existing +unsupervised MAR methods, whether based on learning or not, typically operate +within a single domain, either in the image domain or the sinogram domain. In +this paper, we propose an unsupervised MAR method based on the diffusion model, +a generative model with a high capacity to represent data distributions. +Specifically, we first train a diffusion model using CT images without metal +artifacts. Subsequently, we iteratively utilize the priors embedded within the +pre-trained diffusion model in both the sinogram and image domains to restore +the degraded portions caused by metal artifacts. This dual-domain processing +empowers our approach to outperform existing unsupervised MAR methods, +including another MAR method based on the diffusion model, which we have +qualitatively and quantitatively validated using synthetic datasets. Moreover, +our method demonstrates superior visual results compared to both supervised and +unsupervised methods on clinical datasets. + +
+
+
+
+
+ + ☆ Socratis: Are large multimodal models emotionally aware? ICCV 2023 + + +
+ Existing emotion prediction benchmarks contain coarse emotion labels which do +not consider the diversity of emotions that an image and text can elicit in +humans due to various reasons. Learning diverse reactions to multimodal content +is important as intelligent machines take a central role in generating and +delivering content to society. To address this gap, we propose Socratis, a +\underline{soc}ietal \underline{r}e\underline{a}c\underline{ti}on\underline{s} +benchmark, where each image-caption (IC) pair is annotated with multiple +emotions and the reasons for feeling them. Socratis contains 18K free-form +reactions for 980 emotions on 2075 image-caption pairs from 5 widely-read news +and image-caption (IC) datasets. We benchmark the capability of +state-of-the-art multimodal large language models to generate the reasons for +feeling an emotion given an IC pair. Based on a preliminary human study, we +observe that humans prefer human-written reasons over 2 times more often than +machine-generated ones. This shows our task is harder than standard generation +tasks because it starkly contrasts recent findings where humans cannot tell +apart machine vs human-written news articles, for instance. We further see that +current captioning metrics based on large vision-language models also fail to +correlate with human preferences. We hope that these findings and our benchmark +will inspire further research on training emotionally aware models. + +
+
+ comment: ICCV 2023 WECIA +
+
+
+
+
+ + ☆ Parsing is All You Need for Accurate Gait Recognition in the Wild ACM MM 2023 + + +
+ Binary silhouettes and keypoint-based skeletons have dominated human gait +recognition studies for decades since they are easy to extract from video +frames. Despite their success in gait recognition for in-the-lab environments, +they usually fail in real-world scenarios due to their low information entropy +for gait representations. To achieve accurate gait recognition in the wild, +this paper presents a novel gait representation, named Gait Parsing Sequence +(GPS). GPSs are sequences of fine-grained human segmentation, i.e., human +parsing, extracted from video frames, so they have much higher information +entropy to encode the shapes and dynamics of fine-grained human parts during +walking. Moreover, to effectively explore the capability of the GPS +representation, we propose a novel human parsing-based gait recognition +framework, named ParsingGait. ParsingGait contains a Convolutional Neural +Network (CNN)-based backbone and two light-weighted heads. The first head +extracts global semantic features from GPSs, while the other one learns mutual +information of part-level features through Graph Convolutional Networks to +model the detailed dynamics of human walking. Furthermore, due to the lack of +suitable datasets, we build the first parsing-based dataset for gait +recognition in the wild, named Gait3D-Parsing, by extending the large-scale and +challenging Gait3D dataset. Based on Gait3D-Parsing, we comprehensively +evaluate our method and existing gait recognition methods. The experimental +results show a significant improvement in accuracy brought by the GPS +representation and the superiority of ParsingGait. The code and dataset are +available at https://gait3d.github.io/gait3d-parsing-hp . + +
+
+ comment: 16 pages, 14 figures, ACM MM 2023 accepted, project page: + https://gait3d.github.io/gait3d-parsing-hp +
+
+
+
+
+ + ☆ US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for + Cervical Lymph Node Lesions Diagnoses in Ultrasound Images + + +
+ Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph +node lesions. However, the diagnoses of these images largely hinge on the +expertise of medical practitioners, rendering the process susceptible to +misdiagnoses. Although rapidly developing deep learning has substantially +improved the diagnoses of diverse ultrasound images, there remains a +conspicuous research gap concerning cervical lymph nodes. The objective of our +work is to accurately diagnose cervical lymph node lesions by leveraging a deep +learning model. To this end, we first collected 3392 images containing normal +lymph nodes, benign lymph node lesions, malignant primary lymph node lesions, +and malignant metastatic lymph node lesions. Given that ultrasound images are +generated by the reflection and scattering of sound waves across varied bodily +tissues, we proposed the Conv-FFT Block. It integrates convolutional operations +with the fast Fourier transform to more astutely model the images. Building +upon this foundation, we designed a novel architecture, named US-SFNet. This +architecture not only discerns variances in ultrasound images from the spatial +domain but also adeptly captures microstructural alterations across various +lesions in the frequency domain. To ascertain the potential of US-SFNet, we +benchmarked it against 12 popular architectures through five-fold +cross-validation. The results show that US-SFNet is SOTA and can achieve 92.89% +accuracy, 90.46% precision, 89.95% sensitivity and 97.49% specificity, +respectively. + +
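The Conv-FFT Block in the entry above combines convolutional operations with the fast Fourier transform; its exact design is not given in the abstract. A toy, assumption-laden sketch of one way to pair a spatial convolution branch with a frequency-domain branch and fuse them:

```python
import torch
import torch.nn as nn

class ConvFFTBlock(nn.Module):
    """Toy spatial + frequency block: a conv branch plus a branch that filters the
    2D FFT magnitude, then fuses both. The actual Conv-FFT Block design may differ."""
    def __init__(self, channels: int):
        super().__init__()
        self.spatial = nn.Conv2d(channels, channels, 3, padding=1)
        self.freq = nn.Conv2d(channels, channels, 1)
        self.fuse = nn.Conv2d(2 * channels, channels, 1)

    def forward(self, x):
        spec = torch.fft.fft2(x, norm="ortho")    # complex spectrum of each feature map
        freq_feat = self.freq(torch.abs(spec))    # operate on the magnitude
        return self.fuse(torch.cat([self.spatial(x), freq_feat], dim=1))
```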
+
+
+
+
+ + ☆ Post-Deployment Adaptation with Access to Source Data via Federated + Learning and Source-Target Remote Gradient Alignment MICCAI 2023 + + +
+ Deployment of Deep Neural Networks in medical imaging is hindered by +distribution shift between training data and data processed after deployment, +causing performance degradation. Post-Deployment Adaptation (PDA) addresses +this by tailoring a pre-trained, deployed model to the target data distribution +using limited labelled or entirely unlabelled target data, while assuming no +access to source training data as they cannot be deployed with the model due to +privacy concerns and their large size. This makes reliable adaptation +challenging due to limited learning signal. This paper challenges this +assumption and introduces FedPDA, a novel adaptation framework that brings the +utility of learning from remote data from Federated Learning into PDA. FedPDA +enables a deployed model to obtain information from source data via remote +gradient exchange, while aiming to optimize the model specifically for the +target domain. Tailored for FedPDA, we introduce a novel optimization method +StarAlign (Source-Target Remote Gradient Alignment) that aligns gradients +between source-target domain pairs by maximizing their inner product, to +facilitate learning a target-specific model. We demonstrate the method's +effectiveness using multi-center databases for the tasks of cancer metastases +detection and skin lesion classification, where our method compares favourably +to previous work. Code is available at: https://github.com/FelixWag/StarAlign + +
+
+ comment: This version was accepted for the Machine Learning in Medical Imaging + (MLMI 2023) workshop at MICCAI 2023 +
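StarAlign, described above, aligns source and target gradients by maximising their inner product. A bare-bones sketch of that inner-product term for a single source-target pair; the federated gradient exchange and the pairing scheme are omitted, and adding the negated term to the training objective is one assumed way to use it:

```python
import torch

def gradient_alignment(loss_source, loss_target, params):
    """Inner product between source- and target-domain gradients; maximising it
    encourages updates that help both domains."""
    g_s = torch.autograd.grad(loss_source, params, retain_graph=True, create_graph=True)
    g_t = torch.autograd.grad(loss_target, params, retain_graph=True, create_graph=True)
    return sum((a * b).sum() for a, b in zip(g_s, g_t))
```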
+
+
+
+
+ + ☆ Terrain Diffusion Network: Climatic-Aware Terrain Generation with + Geological Sketch Guidance + + +
+ Sketch-based terrain generation seeks to create realistic landscapes for +virtual environments in various applications such as computer games, animation +and virtual reality. Recently, deep learning based terrain generation has +emerged, notably those based on generative adversarial networks (GAN). +However, these methods often struggle to fulfill the requirements of flexible +user control and maintain generative diversity for realistic terrain. +Therefore, we propose a novel diffusion-based method, namely terrain diffusion +network (TDN), which actively incorporates user guidance for enhanced +controllability, taking into account terrain features like rivers, ridges, +basins, and peaks. Instead of adhering to a conventional monolithic denoising +process, which often compromises the fidelity of terrain details or the +alignment with user control, a multi-level denoising scheme is proposed to +generate more realistic terrains by taking into account fine-grained details, +particularly those related to climatic patterns influenced by erosion and +tectonic activities. Specifically, three terrain synthesisers are designed for +structural, intermediate, and fine-grained level denoising purposes, which +allow each synthesiser to concentrate on a distinct terrain aspect. Moreover, to +maximise the efficiency of our TDN, we further introduce terrain and sketch +latent spaces for the synthesizers with pre-trained terrain autoencoders. +Comprehensive experiments on a new dataset constructed from NASA Topology +Images clearly demonstrate the effectiveness of our proposed method, achieving +the state-of-the-art performance. Our code and dataset will be publicly +available. +
+
+
+
+
+ + ☆ Towards Vehicle-to-everything Autonomous Driving: A Survey on + Collaborative Perception + + +
+ Vehicle-to-everything (V2X) autonomous driving opens up a promising direction +for developing a new generation of intelligent transportation systems. +Collaborative perception (CP) as an essential component to achieve V2X can +overcome the inherent limitations of individual perception, including occlusion +and long-range perception. In this survey, we provide a comprehensive review of +CP methods for V2X scenarios, bringing a profound and in-depth understanding to +the community. Specifically, we first introduce the architecture and workflow +of typical V2X systems, which affords a broader perspective to understand the +entire V2X system and the role of CP within it. Then, we thoroughly summarize +and analyze existing V2X perception datasets and CP methods. Particularly, we +introduce numerous CP methods from various crucial perspectives, including +collaboration stages, roadside sensors placement, latency compensation, +performance-bandwidth trade-off, attack/defense, pose alignment, etc. Moreover, +we conduct extensive experimental analyses to compare and examine current CP +methods, revealing some essential and unexplored insights. Specifically, we +analyze the performance changes of different methods under different +bandwidths, providing a deep insight into the performance-bandwidth trade-off +issue. Also, we examine methods under different LiDAR ranges. To study the +model robustness, we further investigate the effects of various simulated +real-world noises on the performance of different CP methods, covering +communication latency, lossy communication, localization errors, and mixed +noises. In addition, we look into the sim-to-real generalization ability of +existing CP methods. At last, we thoroughly discuss issues and challenges, +highlighting promising directions for future efforts. Our codes for +experimental analysis will be public at +https://github.com/memberRE/Collaborative-Perception. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ ViLTA: Enhancing Vision-Language Pre-training through Textual + Augmentation + + +
+ Vision-language pre-training (VLP) methods are blossoming recently, and their +crucial goal is to jointly learn visual and textual features via a +transformer-based architecture, demonstrating promising improvements on a +variety of vision-language tasks. Prior arts usually focus on how to align +visual and textual features, but strategies for improving the robustness of the +model and speeding up model convergence are left insufficiently explored. + In this paper, we propose a novel method ViLTA, comprising two components +to further facilitate the model to learn fine-grained representations among +image-text pairs. For Masked Language Modeling (MLM), we propose a +cross-distillation method to generate soft labels to enhance the robustness of the +model, which alleviates the problem of treating synonyms of masked words as +negative samples in one-hot labels. For Image-Text Matching (ITM), we leverage +the current language encoder to synthesize hard negatives based on the context +of language input, encouraging the model to learn high-quality representations +by increasing the difficulty of the ITM task. By leveraging the above +techniques, our ViLTA can achieve better performance on various vision-language +tasks. Extensive experiments on benchmark datasets demonstrate the +effectiveness of ViLTA and its promising potential for vision-language +pre-training. +
+
+ comment: 15 pages, 5 figures +
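For the ViLTA entry above, the MLM cross-distillation replaces one-hot targets with soft labels so that synonyms of masked words are not treated as negatives. A generic distillation-style sketch of such a soft-label objective; the temperature and exact formulation are assumptions, not the paper's:

```python
import torch.nn.functional as F

def soft_mlm_loss(student_logits, teacher_logits, temperature: float = 1.0):
    """Soft-label MLM objective: instead of one-hot targets for masked tokens,
    match the teacher's softened distribution over the vocabulary."""
    t = temperature
    soft_targets = F.softmax(teacher_logits / t, dim=-1)
    log_probs = F.log_softmax(student_logits / t, dim=-1)
    return F.kl_div(log_probs, soft_targets, reduction="batchmean") * (t ** 2)
```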
+
+
+
+
+ + ☆ Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor + Attack + + +
+ The vulnerabilities to backdoor attacks have recently threatened the +trustworthiness of machine learning models in practical applications. +Conventional wisdom suggests that not everyone can be an attacker since the +process of designing the trigger generation algorithm often involves +significant effort and extensive experimentation to ensure the attack's +stealthiness and effectiveness. Alternatively, this paper shows that there +exists a more severe backdoor threat: anyone can exploit an easily-accessible +algorithm for silent backdoor attacks. Specifically, this attacker can employ +the widely-used lossy image compression from a plethora of compression tools to +effortlessly inject a trigger pattern into an image without leaving any +noticeable trace; i.e., the generated triggers are natural artifacts. One does +not require extensive knowledge to click on the "convert" or "save as" button +while using tools for lossy image compression. Via this attack, the adversary +does not need to design a trigger generator as seen in prior works and only +requires poisoning the data. Empirically, the proposed attack consistently +achieves 100% attack success rate in several benchmark datasets such as MNIST, +CIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still +achieve almost 100% attack success rate with very small (approximately 10%) +poisoning rates in the clean label setting. The generated trigger of the +proposed attack using one lossy compression algorithm is also transferable +across other related compression algorithms, exacerbating the severity of this +backdoor threat. This work takes another crucial step toward understanding the +extensive risks of backdoor attacks in practice, urging practitioners to +investigate similar attacks and relevant backdoor mitigation methods. + +
+
+ comment: 14 pages. This paper shows everyone can mount a powerful and stealthy + backdoor attack with the widely-used lossy image compression +
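The attack described above uses off-the-shelf lossy compression as the trigger. A small illustrative sketch of producing such a trigger with standard JPEG encoding; the quality setting is an assumed value, and labelling/poisoning of the resulting images is left out:

```python
from io import BytesIO
from PIL import Image

def compression_trigger(img: Image.Image, quality: int = 10) -> Image.Image:
    """Apply aggressive JPEG compression; the resulting compression artifacts act as
    a natural-looking trigger pattern when such images are injected into training data."""
    buf = BytesIO()
    img.convert("RGB").save(buf, format="JPEG", quality=quality)
    buf.seek(0)
    return Image.open(buf).copy()
```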
+
+
+
+
+ + ☆ Diffusion Inertial Poser: Human Motion Reconstruction from Arbitrary + Sparse IMU Configurations + + +
+ Motion capture from a limited number of inertial measurement units (IMUs) has +important applications in health, human performance, and virtual reality. +Real-world limitations and application-specific goals dictate different IMU +configurations (i.e., number of IMUs and chosen attachment body segments), +trading off accuracy and practicality. Although recent works were successful in +accurately reconstructing whole-body motion from six IMUs, these systems only +work with a specific IMU configuration. Here we propose a single diffusion +generative model, Diffusion Inertial Poser (DiffIP), which reconstructs human +motion in real-time from arbitrary IMU configurations. We show that DiffIP has +the benefit of flexibility with respect to the IMU configuration while being as +accurate as the state-of-the-art for the commonly used six IMU configuration. +Our system enables selecting an optimal configuration for different +applications without retraining the model. For example, when only four IMUs are +available, DiffIP found that the configuration that minimizes errors in joint +kinematics instruments the thighs and forearms. However, global translation +reconstruction is better when instrumenting the feet instead of the thighs. +Although our approach is agnostic to the underlying model, we built DiffIP +based on physiologically realistic musculoskeletal models to enable use in +biomedical research and health applications. + +
+
+
+
+
+ + ☆ SoccerNet 2023 Tracking Challenge -- 3rd place MOT4MOT Team Technical + Report + + +
+ The SoccerNet 2023 tracking challenge requires the detection and tracking of +soccer players and the ball. In this work, we present our approach to tackle +these tasks separately. We employ a state-of-the-art online multi-object +tracker and a contemporary object detector for player tracking. To overcome the +limitations of our online approach, we incorporate a post-processing stage +using interpolation and appearance-free track merging. Additionally, an +appearance-based track merging technique is used to handle the termination and +creation of tracks far from the image boundaries. Ball tracking is formulated +as single object detection, and a fine-tuned YOLOv8l detector with proprietary +filtering improves the detection precision. Our method achieves 3rd place on +the SoccerNet 2023 tracking challenge with a HOTA score of 66.27. + +
+
+ comment: 3 pages, 1 figure +
+
+
+
+
+ + ☆ Learning with Multi-modal Gradient Attention for Explainable Composed + Image Retrieval + + +
+ We consider the problem of composed image retrieval that takes an input query +consisting of an image and a modification text indicating the desired changes +to be made on the image and retrieves images that match these changes. Current +state-of-the-art techniques that address this problem use global features for +the retrieval, resulting in incorrect localization of the regions of interest +to be modified because of the global nature of the features, more so in cases +of real-world, in-the-wild images. Since modifier texts usually correspond to +specific local changes in an image, it is critical that models learn local +features to be able to both localize and retrieve better. To this end, our key +novelty is a new gradient-attention-based learning objective that explicitly +forces the model to focus on the local regions of interest being modified in +each retrieval step. We achieve this by first proposing a new visual image +attention computation technique, which we call multi-modal gradient attention +(MMGrad) that is explicitly conditioned on the modifier text. We next +demonstrate how MMGrad can be incorporated into an end-to-end model training +strategy with a new learning objective that explicitly forces these MMGrad +attention maps to highlight the correct local regions corresponding to the +modifier text. By training retrieval models with this new loss function, we +show improved grounding by means of better visual attention maps, leading to +better explainability of the models as well as competitive quantitative +retrieval performance on standard benchmark datasets. + +
+
+
+
+
+ + ☆ Generate Your Own Scotland: Satellite Image Generation Conditioned on + Maps + + +
+ Despite recent advancements in image generation, diffusion models still +remain largely underexplored in Earth Observation. In this paper we show that +state-of-the-art pretrained diffusion models can be conditioned on cartographic +data to generate realistic satellite images. We provide two large datasets of +paired OpenStreetMap images and satellite views over the region of Mainland +Scotland and the Central Belt. We train a ControlNet model and qualitatively +evaluate the results, demonstrating that both image quality and map fidelity +are possible. Finally, we provide some insights on the opportunities and +challenges of applying these models for remote sensing. Our model weights and +code for creating the dataset are publicly available at +https://github.com/miquel-espinosa/map-sat. + +
+
+ comment: 13 pages, 6 figures. preprint +
+
+
+
+
+ + ☆ Learning Channel Importance for High Content Imaging with Interpretable + Deep Input Channel Mixing + + +
+ Uncovering novel drug candidates for treating complex diseases remains one of +the most challenging tasks in early discovery research. To tackle this +challenge, biopharma research established a standardized high content imaging +protocol that tags different cellular compartments per image channel. In order +to judge the experimental outcome, the scientist requires knowledge about the +channel importance with respect to a certain phenotype for decoding the +underlying biology. In contrast to traditional image analysis approaches, such +experiments are nowadays preferably analyzed by deep learning based approaches +which, however, lack crucial information about the channel importance. To +overcome this limitation, we present a novel approach which utilizes +multi-spectral information of high content images to interpret a certain aspect +of cellular biology. To this end, we base our method on image blending concepts +with alpha compositing for an arbitrary number of channels. More specifically, +we introduce DCMIX, a lightweight, scalable and end-to-end trainable mixing +layer which enables interpretable predictions in high content imaging while +retaining the benefits of deep learning based methods. We employ an extensive +set of experiments on both MNIST and RXRX1 datasets, demonstrating that DCMIX +learns the biologically relevant channel importance without sacrificing +prediction performance. +
+
+ comment: Accepted @ DAGM German Conference on Pattern Recognition (GCPR) 2023 +
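DCMIX, described above, is an interpretable mixing layer based on alpha-compositing-style blending of image channels. A toy sketch of a learnable, softmax-normalised channel-mixing layer whose weights expose channel importance; the actual DCMIX design may differ:

```python
import torch
import torch.nn as nn

class ChannelMix(nn.Module):
    """Toy interpretable mixing layer: softmax-normalised weights blend a multi-channel
    image into fewer maps; the learned weights expose per-channel importance."""
    def __init__(self, in_channels: int, out_maps: int = 1):
        super().__init__()
        self.logits = nn.Parameter(torch.zeros(out_maps, in_channels))

    def forward(self, x):                        # x: (batch, in_channels, H, W)
        w = torch.softmax(self.logits, dim=-1)   # rows sum to 1, like alpha compositing
        return torch.einsum("oc,bchw->bohw", w, x)

    def channel_importance(self):
        return torch.softmax(self.logits, dim=-1).detach()
```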
+
+
+
+
+ + ☆ MFR-Net: Multi-faceted Responsive Listening Head Generation via + Denoising Diffusion Model ACM MM 2023 + + +
+ Face-to-face communication is a common scenario including roles of speakers +and listeners. Most existing research methods focus on producing speaker +videos, while the generation of listener heads remains largely overlooked. +Responsive listening head generation is an important task that aims to model +face-to-face communication scenarios by generating a listener head video given +a speaker video and a listener head image. An ideal generated responsive +listening video should respond to the speaker with attitude or viewpoint +expressing while maintaining diversity in interaction patterns and accuracy in +listener identity information. To achieve this goal, we propose the +\textbf{M}ulti-\textbf{F}aceted \textbf{R}esponsive Listening Head Generation +Network (MFR-Net). Specifically, MFR-Net employs the probabilistic denoising +diffusion model to predict diverse head pose and expression features. In order +to perform multi-faceted response to the speaker video, while maintaining +accurate listener identity preservation, we design the Feature Aggregation +Module to boost listener identity features and fuse them with other +speaker-related features. Finally, a renderer finetuned with identity +consistency loss produces the final listening head videos. Our extensive +experiments demonstrate that MFR-Net not only achieves multi-faceted responses +in diversity and speaker identity information but also in attitude and +viewpoint expression. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Semi-Supervised SAR ATR Framework with Transductive Auxiliary + Segmentation + + +
+ Convolutional neural networks (CNNs) have achieved high performance in synthetic aperture radar (SAR) automatic target recognition (ATR). However, the performance of CNNs depends heavily on a large amount of training data. The insufficiency of labeled training SAR images limits the recognition performance and even invalidates some ATR methods; with only a few labeled training samples, many existing CNNs become ineffective. To address these challenges, we propose a Semi-supervised SAR ATR Framework with transductive Auxiliary Segmentation (SFAS). The proposed framework focuses on exploiting transductive generalization on the available unlabeled samples with an auxiliary loss serving as a regularizer. Through auxiliary segmentation of unlabeled SAR samples and an information residue loss (IRL) during training, the framework can employ the proposed training loop and gradually exploit the information compilation of recognition and segmentation to construct a helpful inductive bias and achieve high performance. Experiments conducted on the MSTAR dataset show the effectiveness of the proposed SFAS for few-shot learning. A recognition accuracy of 94.18\% is achieved with 20 training samples per class, together with simultaneous accurate segmentation results. Facing variances of EOCs, the recognition ratios remain higher than 88.00\% with 10 training samples per class.
+
+
+
+
+ + ☆ 3D-STMN: Dependency-Driven Superpoint-Text Matching Network for + End-to-End 3D Referring Expression Segmentation + + +
+ In 3D Referring Expression Segmentation (3D-RES), earlier approaches adopt a two-stage paradigm, extracting segmentation proposals and then matching them with referring expressions. However, this conventional paradigm encounters significant challenges, most notably lackluster initial proposals and a pronounced deceleration in inference speed. Recognizing these limitations, we introduce an innovative end-to-end Superpoint-Text Matching Network (3D-STMN) that is enriched by dependency-driven insights. One of the keystones of our model is the Superpoint-Text Matching (STM) mechanism. Unlike traditional methods that navigate through instance proposals, STM directly correlates linguistic indications with their respective superpoints, clusters of semantically related points. This architectural decision empowers our model to efficiently harness cross-modal semantic relationships, primarily leveraging densely annotated superpoint-text pairs, as opposed to the sparser instance-text pairs. To enhance the role of text in guiding the segmentation process, we further incorporate the Dependency-Driven Interaction (DDI) module to deepen the network's semantic comprehension of referring expressions. Using dependency trees as a beacon, this module discerns the intricate relationships between primary terms and their associated descriptors in expressions, thereby elevating both the localization and segmentation capacities of our model. Comprehensive experiments on the ScanRefer benchmark reveal that our model not only sets new performance standards, registering an mIoU gain of 11.7 points, but also achieves a striking enhancement in inference speed, surpassing traditional methods by 95.7 times. The code and models are available at https://github.com/sosppxo/3D-STMN.
+
+
+
+
+ + ☆ Neural Gradient Regularizer + + +
+ Owing to its significant success, the prior imposed on gradient maps has +consistently been a subject of great interest in the field of image processing. +Total variation (TV), one of the most representative regularizers, is known for +its ability to capture the sparsity of gradient maps. Nonetheless, TV and its +variants often underestimate the gradient maps, leading to the weakening of +edges and details whose gradients should not be zero in the original image. +Recently, total deep variation (TDV) has been introduced, assuming the sparsity +of feature maps, which provides a flexible regularization learned from +large-scale datasets for a specific task. However, TDV requires retraining when +the image or task changes, limiting its versatility. In this paper, we propose +a neural gradient regularizer (NGR) that expresses the gradient map as the +output of a neural network. Unlike existing methods, NGR does not rely on the +sparsity assumption, thereby avoiding the underestimation of gradient maps. NGR +is applicable to various image types and different image processing tasks, +functioning in a zero-shot learning fashion, making it a versatile and +plug-and-play regularizer. Extensive experimental results demonstrate the +superior performance of NGR over state-of-the-art counterparts for a range of +different tasks, further validating its effectiveness and versatility. + +
+
+
+
+
+ + ☆ Detecting Out-of-Context Image-Caption Pairs in News: A + Counter-Intuitive Method + + +
+ The growth of misinformation and re-contextualized media in social media and news leads to an increasing need for fact-checking methods. Concurrently, the advancement in generative models makes cheapfakes and deepfakes both easier to make and harder to detect. In this paper, we present a novel approach that turns generative image models to our advantage for detecting Out-of-Context (OOC) use of image-caption pairs in news. We present two new datasets with a total of $6800$ images generated using two different generative models: (1) DALL-E 2 and (2) Stable Diffusion. We are confident that the method proposed in this paper can advance research on generative models in the field of cheapfake detection, and that the resulting datasets can be used to train and evaluate new models aimed at detecting cheapfakes. We run a preliminary qualitative and quantitative analysis to evaluate the performance of each image generation model for this task, and evaluate a handful of methods for computing image similarity.
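+ One of the building blocks mentioned above, scoring the similarity between a news photo and an image regenerated from its caption, can be sketched with CLIP embeddings. This is a generic illustration of one plausible similarity measure, not necessarily the exact metrics evaluated in the paper, and the file names are placeholders.
+ import torch
+ from PIL import Image
+ from transformers import CLIPModel, CLIPProcessor
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ def image_similarity(path_a: str, path_b: str) -> float:
+     """Cosine similarity between CLIP embeddings of two images."""
+     images = [Image.open(path_a).convert("RGB"), Image.open(path_b).convert("RGB")]
+     inputs = processor(images=images, return_tensors="pt")
+     with torch.no_grad():
+         feats = model.get_image_features(**inputs)
+     feats = feats / feats.norm(dim=-1, keepdim=True)
+     return float(feats[0] @ feats[1])
+
+ # A low similarity between the news photo and an image generated from its
+ # caption may indicate out-of-context (OOC) use.
+ print(image_similarity("news_photo.jpg", "generated_from_caption.png"))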
+
+ comment: ACM International Conference on Content-Based Multimedia Indexing + (CBMI '23) +
+
+
+
+
+ + ☆ Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation + + +
+ Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential +role in the early diagnosis and treatment of liver cancer. Deep learning models +backboned by fully convolutional neural networks (FCNNs) have become the +dominant model for segmenting 3D computerized tomography (CT) scans. However, +since their convolution layers suffer from limited kernel size, they are not +able to capture long-range dependencies and global context. To tackle this +restriction, vision transformers have been introduced to solve FCNN's locality +of receptive fields. Although transformers can capture long-range features, +their segmentation performance decreases with various tumor sizes due to the +model sensitivity to the input patch size. While finding an optimal patch size +improves the performance of vision transformer-based models on segmentation +tasks, it is a time-consuming and challenging procedure. This paper proposes a +technique to select the vision transformer's optimal input multi-resolution +image patch size based on the average volume size of metastasis lesions. We +further validated our suggested framework using a transfer-learning technique, +demonstrating that the highest Dice similarity coefficient (DSC) performance +was obtained by pre-training on training data with a larger tumour volume using +the suggested ideal patch size and then training with a smaller one. We +experimentally evaluate this idea through pre-training our model on a +multi-resolution public dataset. Our model showed consistent and improved +results when applied to our private multi-resolution mCRC dataset with a +smaller average tumor volume. This study lays the groundwork for optimizing +semantic segmentation of small objects using vision transformers. The +implementation source code is available +at:https://github.com/Ramtin-Mojtahedi/OVTPS. + +
+
+
+
+
+ + ☆ Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size + HD Images + + +
+ Stable diffusion, a generative model used in text-to-image synthesis, +frequently encounters resolution-induced composition problems when generating +images of varying sizes. This issue primarily stems from the model being +trained on pairs of single-scale images and their corresponding text +descriptions. Moreover, direct training on images of unlimited sizes is +unfeasible, as it would require an immense number of text-image pairs and +entail substantial computational expenses. To overcome these challenges, we +propose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to +efficiently generate well-composed images of any size, while minimizing the +need for high-memory GPU resources. Specifically, the initial stage, dubbed Any +Ratio Adaptability Diffusion (ARAD), leverages a selected set of images with a +restricted range of ratios to optimize the text-conditional diffusion model, +thereby improving its ability to adjust composition to accommodate diverse +image sizes. To support the creation of images at any desired size, we further +introduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the +subsequent stage. This method allows for the rapid enlargement of the ASD +output to any high-resolution size, avoiding seaming artifacts or memory +overloads. Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks +demonstrate that ASD can produce well-structured images of arbitrary sizes, +cutting down the inference time by 2x compared to the traditional tiled +algorithm. + +
+
+
+
+
+ + ☆ GHuNeRF: Generalizable Human NeRF from a Monocular Video + + +
+ In this paper, we tackle the challenging task of learning a generalizable +human NeRF model from a monocular video. Although existing generalizable human +NeRFs have achieved impressive results, they require muti-view images or videos +which might not be always available. On the other hand, some works on +free-viewpoint rendering of human from monocular videos cannot be generalized +to unseen identities. In view of these limitations, we propose GHuNeRF to learn +a generalizable human NeRF model from a monocular video of the human performer. +We first introduce a visibility-aware aggregation scheme to compute vertex-wise +features, which is used to construct a 3D feature volume. The feature volume +can only represent the overall geometry of the human performer with +insufficient accuracy due to the limited resolution. To solve this, we further +enhance the volume feature with temporally aligned point-wise features using an +attention mechanism. Finally, the enhanced feature is used for predicting +density and color for each sampled point. A surface-guided sampling strategy is +also introduced to improve the efficiency for both training and inference. We +validate our approach on the widely-used ZJU-MoCap dataset, where we achieve +comparable performance with existing multi-view video based approaches. We also +test on the monocular People-Snapshot dataset and achieve better performance +than existing works when only monocular video is used. + +
+
+
+
+
+ + ☆ Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for + Semi-Supervised Medical Image Segmentation + + +
+ Medical image segmentation methods often rely on fully supervised approaches +to achieve excellent performance, which is contingent upon having an extensive +set of labeled images for training. However, annotating medical images is both +expensive and time-consuming. Semi-supervised learning offers a solution by +leveraging numerous unlabeled images alongside a limited set of annotated ones. +In this paper, we introduce a semi-supervised medical image segmentation method +based on the mean-teacher model, referred to as Dual-Decoder Consistency via +Pseudo-Labels Guided Data Augmentation (DCPA). This method combines consistency +regularization, pseudo-labels, and data augmentation to enhance the efficacy of +semi-supervised segmentation. Firstly, the proposed model comprises both +student and teacher models with a shared encoder and two distinct decoders +employing different up-sampling strategies. Minimizing the output discrepancy +between decoders enforces the generation of consistent representations, serving +as regularization during student model training. Secondly, we introduce mixup +operations to blend unlabeled data with labeled data, creating mixed data and +thereby achieving data augmentation. Lastly, pseudo-labels are generated by the +teacher model and utilized as labels for mixed data to compute unsupervised +loss. We compare the segmentation results of the DCPA model with six +state-of-the-art semi-supervised methods on three publicly available medical +datasets. Beyond classical 10\% and 20\% semi-supervised settings, we +investigate performance with less supervision (5\% labeled data). Experimental +outcomes demonstrate that our approach consistently outperforms existing +semi-supervised medical image segmentation methods across the three +semi-supervised settings. + +
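+ The mixup-with-pseudo-labels step described above can be sketched roughly as follows; this is an illustrative simplification written for this digest (the loss choice and the Beta-sampled mixing coefficient are assumptions), not the authors' DCPA code.
+ import torch
+ import torch.nn.functional as F
+
+ def mixup_pseudo_label_step(student, teacher, x_labeled, y_onehot, x_unlabeled, alpha=0.5):
+     """One illustrative semi-supervised step: blend labeled and unlabeled images,
+     blend the ground-truth and teacher pseudo-labels the same way, and train the
+     student on the mixed targets. x_*: (B, C, H, W); y_onehot: (B, K, H, W)."""
+     lam = torch.distributions.Beta(alpha, alpha).sample().item()
+     with torch.no_grad():
+         pseudo = torch.softmax(teacher(x_unlabeled), dim=1)  # soft pseudo-labels
+     x_mixed = lam * x_labeled + (1.0 - lam) * x_unlabeled
+     y_mixed = lam * y_onehot + (1.0 - lam) * pseudo
+     log_probs = F.log_softmax(student(x_mixed), dim=1)
+     return -(y_mixed * log_probs).sum(dim=1).mean()  # soft cross-entropy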
+
+
+
+
+ + ☆ CL-MAE: Curriculum-Learned Masked Autoencoders + + +
+ Masked image modeling has been demonstrated as a powerful pretext task for +generating robust representations that can be effectively generalized across +multiple downstream tasks. Typically, this approach involves randomly masking +patches (tokens) in input images, with the masking strategy remaining unchanged +during training. In this paper, we propose a curriculum learning approach that +updates the masking strategy to continually increase the complexity of the +self-supervised reconstruction task. We conjecture that, by gradually +increasing the task complexity, the model can learn more sophisticated and +transferable representations. To facilitate this, we introduce a novel +learnable masking module that possesses the capability to generate masks of +different complexities, and integrate the proposed module into masked +autoencoders (MAE). Our module is jointly trained with the MAE, while adjusting +its behavior during training, transitioning from a partner to the MAE +(optimizing the same reconstruction loss) to an adversary (optimizing the +opposite loss), while passing through a neutral state. The transition between +these behaviors is smooth, being regulated by a factor that is multiplied with +the reconstruction loss of the masking module. The resulting training procedure +generates an easy-to-hard curriculum. We train our Curriculum-Learned Masked +Autoencoder (CL-MAE) on ImageNet and show that it exhibits superior +representation learning capabilities compared to MAE. The empirical results on +five downstream tasks confirm our conjecture, demonstrating that curriculum +learning can be successfully used to self-supervise masked autoencoders. + +
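+ The partner-to-adversary transition can be pictured as a single scalar factor multiplying the masking module's reconstruction loss. The cosine schedule below is an illustrative guess at such a factor, not the exact schedule used for CL-MAE.
+ import math
+
+ def curriculum_factor(step: int, total_steps: int) -> float:
+     """Smooth schedule from +1 (partner) through 0 (neutral) to -1 (adversary)."""
+     progress = min(max(step / total_steps, 0.0), 1.0)
+     return math.cos(math.pi * progress)
+
+ def masking_module_objective(reconstruction_loss, step, total_steps):
+     # Positive factor: the mask generator cooperates with the MAE (same loss).
+     # Negative factor: it becomes an adversary (maximizes the reconstruction loss).
+     return curriculum_factor(step, total_steps) * reconstruction_loss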
+
+
+
+
+ + ☆ Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based + Approach + + +
+ In the rapidly evolving digital era, the analysis of document layouts plays a +pivotal role in automated information extraction and interpretation. In our +work, we have trained MViTv2 transformer model architecture with cascaded mask +R-CNN on BaDLAD dataset to extract text box, paragraphs, images and tables from +a document. After training on 20365 document images for 36 epochs in a 3 phase +cycle, we achieved a training loss of 0.2125 and a mask loss of 0.19. Our work +extends beyond training, delving into the exploration of potential enhancement +avenues. We investigate the impact of rotation and flip augmentation, the +effectiveness of slicing input images pre-inference, the implications of +varying the resolution of the transformer backbone, and the potential of +employing a dual-pass inference to uncover missed text-boxes. Through these +explorations, we observe a spectrum of outcomes, where some modifications +result in tangible performance improvements, while others offer unique insights +for future endeavors. + +
+
+
+
+
+ + ☆ Shape of my heart: Cardiac models through learned signed distance + functions + + +
+ The efficient construction of an anatomical model is one of the major +challenges of patient-specific in-silico models of the human heart. Current +methods frequently rely on linear statistical models, allowing no advanced +topological changes, or requiring medical image segmentation followed by a +meshing pipeline, which strongly depends on image resolution, quality, and +modality. These approaches are therefore limited in their transferability to +other imaging domains. In this work, the cardiac shape is reconstructed by +means of three-dimensional deep signed distance functions with Lipschitz +regularity. For this purpose, the shapes of cardiac MRI reconstructions are +learned from public databases to model the spatial relation of multiple +chambers in Cartesian space. We demonstrate that this approach is also capable +of reconstructing anatomical models from partial data, such as point clouds +from a single ventricle, or modalities different from the trained MRI, such as +electroanatomical mapping, and in addition, allows us to generate new +anatomical shapes by randomly sampling latent vectors. + +
+
+
+
+
+ + ☆ ScrollNet: Dynamic Weight Importance for Continual Learning ICCV2023 + + +
+ The principle underlying most existing continual learning (CL) methods is to +prioritize stability by penalizing changes in parameters crucial to old tasks, +while allowing for plasticity in other parameters. The importance of weights +for each task can be determined either explicitly through learning a +task-specific mask during training (e.g., parameter isolation-based approaches) +or implicitly by introducing a regularization term (e.g., regularization-based +approaches). However, all these methods assume that the importance of weights +for each task is unknown prior to data exposure. In this paper, we propose +ScrollNet as a scrolling neural network for continual learning. ScrollNet can +be seen as a dynamic network that assigns the ranking of weight importance for +each task before data exposure, thus achieving a more favorable +stability-plasticity tradeoff during sequential task learning by reassigning +this ranking for different tasks. Additionally, we demonstrate that ScrollNet +can be combined with various CL methods, including regularization-based and +replay-based approaches. Experimental results on CIFAR100 and TinyImagenet +datasets show the effectiveness of our proposed method. We release our code at +https://github.com/FireFYF/ScrollNet.git. + +
+
+ comment: Accepted at Visual Continual Learning workshop (ICCV2023) +
+
+
+
+
+ + ☆ MoMA: Momentum Contrastive Learning with Multi-head Attention-based + Knowledge Distillation for Histopathology Image Analysis + + +
+ There is no doubt that advanced artificial intelligence models and high +quality data are the keys to success in developing computational pathology +tools. Although the overall volume of pathology data keeps increasing, a lack +of quality data is a common issue when it comes to a specific task due to +several reasons including privacy and ethical issues with patient data. In this +work, we propose to exploit knowledge distillation, i.e., utilize the existing +model to learn a new, target model, to overcome such issues in computational +pathology. Specifically, we employ a student-teacher framework to learn a +target model from a pre-trained, teacher model without direct access to source +data and distill relevant knowledge via momentum contrastive learning with +multi-head attention mechanism, which provides consistent and context-aware +feature representations. This enables the target model to assimilate +informative representations of the teacher model while seamlessly adapting to +the unique nuances of the target data. The proposed method is rigorously +evaluated across different scenarios where the teacher model was trained on the +same, relevant, and irrelevant classification tasks with the target model. +Experimental results demonstrate the accuracy and robustness of our approach in +transferring knowledge to different domains and tasks, outperforming other +related methods. Moreover, the results provide a guideline on the learning +strategy for different types of tasks and scenarios in computational pathology. +Code is available at: \url{https://github.com/trinhvg/MoMA}. + +
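+ The momentum side of momentum contrastive learning boils down to an exponential moving average of the online encoder's weights; the standard MoCo-style update below is shown only to make that mechanism concrete and is not the authors' full MoMA pipeline.
+ import torch
+
+ @torch.no_grad()
+ def momentum_update(online_encoder: torch.nn.Module,
+                     momentum_encoder: torch.nn.Module, m: float = 0.999) -> None:
+     """EMA update: theta_k <- m * theta_k + (1 - m) * theta_q, applied after each step."""
+     for p_q, p_k in zip(online_encoder.parameters(), momentum_encoder.parameters()):
+         p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)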
+
+ comment: Preprint +
+
+
+
+
+ + ☆ E3CM: Epipolar-Constrained Cascade Correspondence Matching + + +
+ Accurate and robust correspondence matching is of utmost importance for +various 3D computer vision tasks. However, traditional explicit +programming-based methods often struggle to handle challenging scenarios, and +deep learning-based methods require large well-labeled datasets for network +training. In this article, we introduce Epipolar-Constrained Cascade +Correspondence (E3CM), a novel approach that addresses these limitations. +Unlike traditional methods, E3CM leverages pre-trained convolutional neural +networks to match correspondence, without requiring annotated data for any +network training or fine-tuning. Our method utilizes epipolar constraints to +guide the matching process and incorporates a cascade structure for progressive +refinement of matches. We extensively evaluate the performance of E3CM through +comprehensive experiments and demonstrate its superiority over existing +methods. To promote further research and facilitate reproducibility, we make +our source code publicly available at https://mias.group/E3CM. + +
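+ The epipolar constraint that guides the matching can be checked with the Sampson distance, given a known fundamental matrix F between the two views; the snippet below is a generic geometric filter, not the cascade architecture itself, and the threshold is arbitrary.
+ import numpy as np
+
+ def sampson_distance(F: np.ndarray, x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
+     """First-order epipolar error for correspondences x1 <-> x2 (each (N, 2) pixels)."""
+     ones = np.ones((x1.shape[0], 1))
+     p1 = np.hstack([x1, ones])  # homogeneous coordinates, (N, 3)
+     p2 = np.hstack([x2, ones])
+     Fx1 = p1 @ F.T              # rows are F @ x1_i
+     Ftx2 = p2 @ F               # rows are F^T @ x2_i
+     num = np.sum(p2 * Fx1, axis=1) ** 2
+     den = Fx1[:, 0] ** 2 + Fx1[:, 1] ** 2 + Ftx2[:, 0] ** 2 + Ftx2[:, 1] ** 2
+     return num / den
+
+ def filter_matches(F, x1, x2, thresh: float = 1.0):
+     keep = sampson_distance(F, x1, x2) < thresh  # keep geometrically consistent matches
+     return x1[keep], x2[keep]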
+
+ comment: accepted to Neurocomputing +
+
+
+
+
+ + ☆ Prompt-enhanced Hierarchical Transformer Elevating Cardiopulmonary + Resuscitation Instruction via Temporal Action Segmentation + + +
+ Most people who suffer unexpected cardiac arrest receive cardiopulmonary resuscitation (CPR) from passersby in a desperate attempt to restore life, but these endeavors often turn out to be fruitless because the rescuers lack proper qualification. Fortunately, many studies show that disciplined training helps to elevate the success rate of resuscitation, and such training continually calls for novel techniques to yield further advancement. To this end, we collect a custom CPR video dataset in which trainees independently perform resuscitation on mannequins in adherence to approved guidelines, and we devise an auxiliary toolbox that uses modern deep learning methodologies to assist supervision and rectify potential issues during practice. Our research empirically views this problem as a temporal action segmentation (TAS) task in computer vision, which aims to segment an untrimmed video at a frame-wise level. Here, we propose a Prompt-enhanced hierarchical Transformer (PhiTrans) that integrates three indispensable modules, including a textual prompt-based Video Features Extractor (VFE), a transformer-based Action Segmentation Executor (ASE), and a regression-based Prediction Refinement Calibrator (PRC). The model backbone builds on approaches validated on three public datasets collected for TAS tasks (GTEA, 50Salads, and Breakfast), which underpins the development of the segmentation pipeline on the CPR dataset. Overall, we probe into a feasible pipeline that genuinely elevates CPR instruction quality via action segmentation in conjunction with cutting-edge deep learning techniques. Associated experiments support our implementation, with multiple metrics surpassing 91.0%.
+
+ comment: Transformer for Cardiopulmonary Resuscitation +
+
+
+
+
+ + ☆ Object Detection for Caries or Pit and Fissure Sealing Requirement in + Children's First Permanent Molars + + +
+ Dental caries is one of the most common oral diseases that, if left untreated, can lead to a variety of oral problems. It mainly occurs inside the pits and fissures on the occlusal/buccal/palatal surfaces of molars, and children are a high-risk group for pit and fissure caries in permanent molars. Pit and fissure sealing is one of the most effective and widely used methods for preventing pit and fissure caries. However, current detection of pits and fissures or caries depends primarily on experienced dentists, expertise that ordinary parents do not have, and children may miss remedial treatment without timely detection. To address this issue, we present a method to automatically detect caries and pit and fissure sealing requirements using oral photos taken with smartphones. We use the YOLOv5 and YOLOX models and adopt a tiling strategy to reduce information loss during image pre-processing. The best result for the YOLOXs model with the tiling strategy is 72.3 mAP@0.5, while the best result without tiling is 71.2. The YOLOv5s6 model attains 70.9/67.9 mAP@0.5 with/without tiling, respectively. We deploy the pre-trained network to mobile devices as a WeChat applet, allowing in-home detection by parents or children's guardians.
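+ The tiling pre-processing mentioned above can be sketched simply; the tile size and overlap below are arbitrary illustrative values, not the settings used in the paper.
+ import numpy as np
+
+ def tile_image(img: np.ndarray, tile: int = 640, overlap: int = 64):
+     """Yield (x_offset, y_offset, crop) windows covering an HxWxC image with overlap,
+     so that small caries regions are not lost when the detector resizes its input."""
+     h, w = img.shape[:2]
+     stride = tile - overlap
+     for y in range(0, max(h - overlap, 1), stride):
+         for x in range(0, max(w - overlap, 1), stride):
+             yield x, y, img[y:y + tile, x:x + tile]
+
+ # Detections from each crop are shifted back by (x_offset, y_offset) and merged
+ # over the full photo with non-maximum suppression.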
+
+
+
+
+ + ☆ Decoupled Local Aggregation for Point Cloud Learning + + +
+ The unstructured nature of point clouds demands that local aggregation be adaptive to different local structures. Previous methods meet this requirement by explicitly embedding spatial relations into each aggregation process. Although this coupled approach has been shown to be effective in generating clear semantics, aggregation can be greatly slowed down due to repeated relation learning and redundant computation to mix directional and point features. In this work, we propose to decouple the explicit modelling of spatial relations from local aggregation. We theoretically prove that basic neighbor pooling operations can also function without loss of clarity in feature fusion, so long as essential spatial information has been encoded in the point features. As an instantiation of decoupled local aggregation, we present DeLA, a lightweight point network in which, at each learning stage, relative spatial encodings are first formed, and local aggregation is then performed using only pointwise convolutions plus edge max-pooling. Further, a regularization term is employed to reduce potential ambiguity through the prediction of relative coordinates. Though conceptually simple, DeLA achieves state-of-the-art performance with reduced or comparable latency, as demonstrated by experimental results on five classic benchmarks. Specifically, DeLA achieves over 90\% overall accuracy on ScanObjectNN and 74\% mIoU on S3DIS Area 5. Our code is available at https://github.com/Matrix-ASC/DeLA .
+
+
+
+
+ + ☆ Privacy-Preserving Medical Image Classification through Deep Learning + and Matrix Decomposition + + +
+ Deep learning (DL)-based solutions have been extensively researched in the +medical domain in recent years, enhancing the efficacy of diagnosis, planning, +and treatment. Since the usage of health-related data is strictly regulated, +processing medical records outside the hospital environment for developing and +using DL models demands robust data protection measures. At the same time, it +can be challenging to guarantee that a DL solution delivers a minimum level of +performance when being trained on secured data, without being specifically +designed for the given task. Our approach uses singular value decomposition +(SVD) and principal component analysis (PCA) to obfuscate the medical images +before employing them in the DL analysis. The capability of DL algorithms to +extract relevant information from secured data is assessed on a task of +angiographic view classification based on obfuscated frames. The security level +is probed by simulated artificial intelligence (AI)-based reconstruction +attacks, considering two threat actors with different prior knowledge of the +targeted data. The degree of privacy is quantitatively measured using +similarity indices. Although a trade-off between privacy and accuracy should be +considered, the proposed technique allows for training the angiographic view +classifier exclusively on secured data with satisfactory performance and with +no computational overhead, model adaptation, or hyperparameter tuning. While +the obfuscated medical image content is well protected against human +perception, the hypothetical reconstruction attack proved that it is also +difficult to recover the complete information of the original frames. + +
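+ A minimal sketch of SVD-based obfuscation (a generic rank-k truncation, not the exact protocol of the paper) looks as follows; the rank and the random stand-in frame are illustrative only.
+ import numpy as np
+
+ def svd_obfuscate(frame: np.ndarray, k: int = 16) -> np.ndarray:
+     """Rank-k approximation of a 2D grayscale frame via truncated SVD.
+     Discarding higher-order components removes fine, human-recognizable detail
+     while preserving coarse structure that a classifier can still exploit."""
+     U, s, Vt = np.linalg.svd(frame.astype(np.float64), full_matrices=False)
+     return (U[:, :k] * s[:k]) @ Vt[:k, :]
+
+ frame = np.random.rand(512, 512)  # stand-in for an angiographic frame
+ obfuscated = svd_obfuscate(frame, k=16)
+ print(obfuscated.shape)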
+
+ comment: 6 pages, 9 figures, Published in: 2023 31st Mediterranean Conference + on Control and Automation (MED) +
+
+
+
+
+ + ☆ SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded + Objects + + +
+ To enable meaningful robotic manipulation of objects in the real-world, 6D +pose estimation is one of the critical aspects. Most existing approaches have +difficulties to extend predictions to scenarios where novel object instances +are continuously introduced, especially with heavy occlusions. In this work, we +propose a few-shot pose estimation (FSPE) approach called SA6D, which uses a +self-adaptive segmentation module to identify the novel target object and +construct a point cloud model of the target object using only a small number of +cluttered reference images. Unlike existing methods, SA6D does not require +object-centric reference images or any additional object information, making it +a more generalizable and scalable solution across categories. We evaluate SA6D +on real-world tabletop object datasets and demonstrate that SA6D outperforms +existing FSPE methods, particularly in cluttered scenes with occlusions, while +requiring fewer reference images. + +
+
+
+
+
+ + ☆ Unsupervised Recognition of Unknown Objects for Open-World Object + Detection + + +
+ Open-World Object Detection (OWOD) extends object detection problem to a +realistic and dynamic scenario, where a detection model is required to be +capable of detecting both known and unknown objects and incrementally learning +newly introduced knowledge. Current OWOD models, such as ORE and OW-DETR, focus +on pseudo-labeling regions with high objectness scores as unknowns, whose +performance relies heavily on the supervision of known objects. While they can +detect the unknowns that exhibit similar features to the known objects, they +suffer from a severe label bias problem that they tend to detect all regions +(including unknown object regions) that are dissimilar to the known objects as +part of the background. To eliminate the label bias, this paper proposes a +novel approach that learns an unsupervised discriminative model to recognize +true unknown objects from raw pseudo labels generated by unsupervised region +proposal methods. The resulting model can be further refined by a +classification-free self-training method which iteratively extends pseudo +unknown objects to the unlabeled regions. Experimental results show that our +method 1) significantly outperforms the prior SOTA in detecting unknown objects +while maintaining competitive performance of detecting known object classes on +the MS COCO dataset, and 2) achieves better generalization ability on the LVIS +and Objects365 datasets. + +
+
+
+
+
+ + ☆ MS23D: A 3D Object Detection Method Using Multi-Scale Semantic Feature + Points to Construct 3D Feature Layers + + +
+ Lidar point clouds, as a type of data with accurate distance perception, can +effectively represent the motion and posture of objects in three-dimensional +space. However, the sparsity and disorderliness of point clouds make it +challenging to extract features directly from them. Many studies have addressed +this issue by transforming point clouds into regular voxel representations. +However, these methods often lead to the loss of fine-grained local feature +information due to downsampling. Moreover, the sparsity of point clouds poses +difficulties in efficiently aggregating features in 3D feature layers using +voxel-based two-stage methods. To address these issues, this paper proposes a +two-stage 3D detection framework called MS$^{2}$3D. In MS$^{2}$3D, we utilize +small-sized voxels to extract fine-grained local features and large-sized +voxels to capture long-range local features. Additionally, we propose a method +for constructing 3D feature layers using multi-scale semantic feature points, +enabling the transformation of sparse 3D feature layers into more compact +representations. Furthermore, we compute the offset between feature points in +the 3D feature layers and the centroid of objects, aiming to bring them as +close as possible to the object's center. It significantly enhances the +efficiency of feature aggregation. To validate the effectiveness of our method, +we evaluated our method on the KITTI dataset and ONCE dataset together. + +
+
+
+
+
+ + ☆ MVDream: Multi-view Diffusion for 3D Generation + + +
+ We propose MVDream, a multi-view diffusion model that is able to generate +geometrically consistent multi-view images from a given text prompt. By +leveraging image diffusion models pre-trained on large-scale web datasets and a +multi-view dataset rendered from 3D assets, the resulting multi-view diffusion +model can achieve both the generalizability of 2D diffusion and the consistency +of 3D data. Such a model can thus be applied as a multi-view prior for 3D +generation via Score Distillation Sampling, where it greatly improves the +stability of existing 2D-lifting methods by solving the 3D consistency problem. +Finally, we show that the multi-view diffusion model can also be fine-tuned +under a few shot setting for personalized 3D generation, i.e. DreamBooth3D +application, where the consistency can be maintained after learning the subject +identity. + +
+
+ comment: Our project page is https://MV-Dream.github.io +
+
+
+
+
+ + ☆ Robust GAN inversion + + +
+ Recent advancements in real image editing have been attributed to the +exploration of Generative Adversarial Networks (GANs) latent space. However, +the main challenge of this procedure is GAN inversion, which aims to map the +image to the latent space accurately. Existing methods that work on extended +latent space $W+$ are unable to achieve low distortion and high editability +simultaneously. To address this issue, we propose an approach which works in +native latent space $W$ and tunes the generator network to restore missing +image details. We introduce a novel regularization strategy with learnable +coefficients obtained by training randomized StyleGAN 2 model - WRanGAN. This +method outperforms traditional approaches in terms of reconstruction quality +and computational efficiency, achieving the lowest distortion with 4 times +fewer parameters. Furthermore, we observe a slight improvement in the quality +of constructing hyperplanes corresponding to binary image attributes. We +demonstrate the effectiveness of our approach on two complex datasets: +Flickr-Faces-HQ and LSUN Church. + +
+
+ comment: 22 pages, 28 figures +
+
+
+
+
+ + ☆ Latent Painter + + +
+ Latent diffusers have revolutionized generative AI and inspired creative art. When denoising the latent, the predicted original image at each step collectively animates the formation of the final output. However, the animation is limited by the denoising nature of the diffuser and only renders a sharpening process. This work presents Latent Painter, which uses the latent as the canvas and the diffuser predictions as the plan to generate painting animations. Latent Painter can also transition one generated image into another, including between images generated from two different sets of checkpoints.
+
+
+
+
+ + ☆ Illumination Distillation Framework for Nighttime Person + Re-Identification and A New Benchmark + + +
+ Nighttime person Re-ID (person re-identification in the nighttime) is a very +important and challenging task for visual surveillance but it has not been +thoroughly investigated. Under the low illumination condition, the performance +of person Re-ID methods usually sharply deteriorates. To address the low +illumination challenge in nighttime person Re-ID, this paper proposes an +Illumination Distillation Framework (IDF), which utilizes illumination +enhancement and illumination distillation schemes to promote the learning of +Re-ID models. Specifically, IDF consists of a master branch, an illumination +enhancement branch, and an illumination distillation module. The master branch +is used to extract the features from a nighttime image. The illumination +enhancement branch first estimates an enhanced image from the nighttime image +using a nonlinear curve mapping method and then extracts the enhanced features. +However, nighttime and enhanced features usually contain data noise due to +unstable lighting conditions and enhancement failures. To fully exploit the +complementary benefits of nighttime and enhanced features while suppressing +data noise, we propose an illumination distillation module. In particular, the +illumination distillation module fuses the features from two branches through a +bottleneck fusion model and then uses the fused features to guide the learning +of both branches in a distillation manner. In addition, we build a real-world +nighttime person Re-ID dataset, named Night600, which contains 600 identities +captured from different viewpoints and nighttime illumination conditions under +complex outdoor environments. Experimental results demonstrate that our IDF can +achieve state-of-the-art performance on two nighttime person Re-ID datasets +(i.e., Night600 and Knight ). We will release our code and dataset at +https://github.com/Alexadlu/IDF. + +
+
+ comment: Accepted by TMM +
+
+
+
+
+ + ☆ Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning + + +
+ Affordable 3D scanners often produce sparse and non-uniform point clouds that negatively impact downstream applications in robotic systems. While existing point cloud upsampling architectures have demonstrated promising results on standard benchmarks, they tend to experience significant performance drops when the test data have different distributions from the training data. To address this issue, this paper proposes a test-time adaptation approach to enhance the generality of point cloud upsampling models. The proposed approach leverages meta-learning to explicitly learn network parameters for test-time adaptation. Our method does not require any prior information about the test data. During meta-training, the model parameters are learned from a collection of instance-level tasks, each of which consists of a sparse-dense pair of point clouds from the training data. During meta-testing, the trained model is fine-tuned with a few gradient updates to produce a unique set of network parameters for each test instance. The updated model is then used for the final prediction. Our framework is generic and can be applied in a plug-and-play manner with existing backbone networks in point cloud upsampling. Extensive experiments demonstrate that our approach improves the performance of state-of-the-art models.
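+ At inference, the per-instance adaptation amounts to a few gradient updates on a self-supervised objective before predicting. The sketch below is generic: the self-supervised loss is passed in as an assumed callable, and it does not reproduce the meta-learned initialization itself.
+ import copy
+ import torch
+
+ def test_time_adapt(model, x_sparse, self_supervised_loss, steps: int = 3, lr: float = 1e-4):
+     """Fine-tune a copy of the meta-trained model on a single test instance,
+     then run the final prediction with the adapted weights."""
+     adapted = copy.deepcopy(model)  # keep the meta-trained weights intact
+     opt = torch.optim.Adam(adapted.parameters(), lr=lr)
+     adapted.train()
+     for _ in range(steps):
+         opt.zero_grad()
+         loss = self_supervised_loss(adapted, x_sparse)  # assumed instance-level objective
+         loss.backward()
+         opt.step()
+     adapted.eval()
+     with torch.no_grad():
+         return adapted(x_sparse)  # upsampled (dense) point cloud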
+
+
+
+
+ + ☆ Point-TTA: Test-Time Adaptation for Point Cloud Registration Using + Multitask Meta-Auxiliary Learning + + +
+ We present Point-TTA, a novel test-time adaptation framework for point cloud +registration (PCR) that improves the generalization and the performance of +registration models. While learning-based approaches have achieved impressive +progress, generalization to unknown testing environments remains a major +challenge due to the variations in 3D scans. Existing methods typically train a +generic model and the same trained model is applied on each instance during +testing. This could be sub-optimal since it is difficult for the same model to +handle all the variations during testing. In this paper, we propose a test-time +adaptation approach for PCR. Our model can adapt to unseen distributions at +test-time without requiring any prior knowledge of the test data. Concretely, +we design three self-supervised auxiliary tasks that are optimized jointly with +the primary PCR task. Given a test instance, we adapt our model using these +auxiliary tasks and the updated model is used to perform the inference. During +training, our model is trained using a meta-auxiliary learning approach, such +that the adapted model via auxiliary tasks improves the accuracy of the primary +task. Experimental results demonstrate the effectiveness of our approach in +improving generalization of point cloud registration and outperforming other +state-of-the-art approaches. + +
+
+
+
+
+ + ☆ PivotNet: Vectorized Pivot Learning for End-to-end HD Map Construction ICCV2023 + + +
+ Vectorized high-definition map online construction has garnered considerable +attention in the field of autonomous driving research. Most existing approaches +model changeable map elements using a fixed number of points, or predict local +maps in a two-stage autoregressive manner, which may miss essential details and +lead to error accumulation. Towards precise map element learning, we propose a +simple yet effective architecture named PivotNet, which adopts unified +pivot-based map representations and is formulated as a direct set prediction +paradigm. Concretely, we first propose a novel Point-to-Line Mask module to +encode both the subordinate and geometrical point-line priors in the network. +Then, a well-designed Pivot Dynamic Matching module is proposed to model the +topology in dynamic point sequences by introducing the concept of sequence +matching. Furthermore, to supervise the position and topology of the vectorized +point predictions, we propose a Dynamic Vectorized Sequence loss. Extensive +experiments and ablations show that PivotNet is remarkably superior to other +SOTAs by 5.9 mAP at least. The code will be available soon. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Self-Sampling Meta SAM: Enhancing Few-shot Medical Image Segmentation + with Meta-Learning + + +
+ While the Segment Anything Model (SAM) excels in semantic segmentation for +general-purpose images, its performance significantly deteriorates when applied +to medical images, primarily attributable to insufficient representation of +medical images in its training dataset. Nonetheless, gathering comprehensive +datasets and training models that are universally applicable is particularly +challenging due to the long-tail problem common in medical images. To address +this gap, here we present a Self-Sampling Meta SAM (SSM-SAM) framework for +few-shot medical image segmentation. Our innovation lies in the design of three +key modules: 1) An online fast gradient descent optimizer, further optimized by +a meta-learner, which ensures swift and robust adaptation to new tasks. 2) A +Self-Sampling module designed to provide well-aligned visual prompts for +improved attention allocation; and 3) A robust attention-based decoder +specifically designed for medical few-shot learning to capture relationship +between different slices. Extensive experiments on a popular abdominal CT +dataset and an MRI dataset demonstrate that the proposed method achieves +significant improvements over state-of-the-art methods in few-shot +segmentation, with an average improvements of 10.21% and 1.80% in terms of DSC, +respectively. In conclusion, we present a novel approach for rapid online +adaptation in interactive image segmentation, adapting to a new organ in just +0.83 minutes. Code is publicly available on GitHub upon acceptance. + +
+
+
+
+
+ + ☆ Sparkles: Unlocking Chats Across Multiple Images for Multimodal + Instruction-Following Models + + +
+ Large language models exhibit enhanced zero-shot performance on various tasks +when fine-tuned with instruction-following data. Multimodal +instruction-following models extend these capabilities by integrating both text +and images. However, existing models such as MiniGPT-4 face challenges in +maintaining dialogue coherence in scenarios involving multiple images. A +primary reason is the lack of a specialized dataset for this critical +application. To bridge these gaps, we present SparklesChat, a multimodal +instruction-following model for open-ended dialogues across multiple images. To +support the training, we introduce SparklesDialogue, the first +machine-generated dialogue dataset tailored for word-level interleaved +multi-image and text interactions. Furthermore, we construct SparklesEval, a +GPT-assisted benchmark for quantitatively assessing a model's conversational +competence across multiple images and dialogue turns. Our experiments validate +the effectiveness of SparklesChat in understanding and reasoning across +multiple images and dialogue turns. Specifically, SparklesChat outperformed +MiniGPT-4 on established vision-and-language benchmarks, including the BISON +binary image selection task and the NLVR2 visual reasoning task. Moreover, +SparklesChat scored 8.56 out of 10 on SparklesEval, substantially exceeding +MiniGPT-4's score of 3.91 and nearing GPT-4's score of 9.26. Qualitative +evaluations further demonstrate SparklesChat's generality in handling +real-world applications. All resources will be available at +https://github.com/HYPJUDY/Sparkles. + +
+
+
+
+
+ + ☆ Domain Adaptive Synapse Detection with Weak Point Annotations + + +
+ The development of learning-based methods has greatly improved the detection of synapses from electron microscopy (EM) images. However, training a model for each dataset is time-consuming and requires extensive annotations. Additionally, it is difficult to apply a learned model to data from different brain regions due to variations in data distributions. In this paper, we present AdaSyn, a two-stage segmentation-based framework for domain adaptive synapse detection with weak point annotations. In the first stage, we address the detection problem by utilizing a segmentation-based pipeline to obtain synaptic instance masks. In the second stage, we improve model generalizability on target data by regenerating square masks to obtain high-quality pseudo labels. Benefiting from our high-accuracy detection results, we introduce the distance-nearest principle to match paired pre-synapses and post-synapses. Our method ranked 1st in the WASPSYN challenge at ISBI 2023.
+
+
+
+
+ + ☆ Improving Lens Flare Removal with General Purpose Pipeline and Multiple + Light Sources Recovery ICCV 2023 + + +
+ When taking images against strong light sources, the resulting images often +contain heterogeneous flare artifacts. These artifacts can importantly affect +image visual quality and downstream computer vision tasks. While collecting +real data pairs of flare-corrupted/flare-free images for training flare removal +models is challenging, current methods utilize the direct-add approach to +synthesize data. However, these methods do not consider automatic exposure and +tone mapping in image signal processing pipeline (ISP), leading to the limited +generalization capability of deep models training using such data. Besides, +existing methods struggle to handle multiple light sources due to the different +sizes, shapes and illuminance of various light sources. In this paper, we +propose a solution to improve the performance of lens flare removal by +revisiting the ISP and remodeling the principle of automatic exposure in the +synthesis pipeline and design a more reliable light sources recovery strategy. +The new pipeline approaches realistic imaging by discriminating the local and +global illumination through convex combination, avoiding global illumination +shifting and local over-saturation. Our strategy for recovering multiple light +sources convexly averages the input and output of the neural network based on +illuminance levels, thereby avoiding the need for a hard threshold in +identifying light sources. We also contribute a new flare removal testing +dataset containing the flare-corrupted images captured by ten types of consumer +electronics. The dataset facilitates the verification of the generalization +capability of flare removal methods. Extensive experiments show that our +solution can effectively improve the performance of lens flare removal and push +the frontier toward more general situations. + +
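+ The convex-combination idea for preserving light sources can be pictured as blending the network output and the input per pixel according to an illuminance estimate; the weighting formula below is a placeholder written for illustration, not the paper's exact recovery strategy.
+ import torch
+
+ def blend_by_illuminance(inp: torch.Tensor, out: torch.Tensor,
+                          low: float = 0.85, high: float = 0.99) -> torch.Tensor:
+     """Convexly average network output and input per pixel.
+     inp, out: (B, 3, H, W) in [0, 1]. Near-saturated (light source) pixels keep
+     the input, flare regions keep the network output, with a smooth transition."""
+     luminance = inp.mean(dim=1, keepdim=True)               # crude illuminance proxy
+     w = ((luminance - low) / (high - low)).clamp(0.0, 1.0)  # 0 = use output, 1 = use input
+     return w * inp + (1.0 - w) * out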
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Adversarial Finetuning with Latent Representation Constraint to Mitigate + Accuracy-Robustness Tradeoff ICCV + + +
+ This paper addresses the tradeoff between standard accuracy on clean examples +and robustness against adversarial examples in deep neural networks (DNNs). +Although adversarial training (AT) improves robustness, it degrades the +standard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we +propose a novel AT method called ARREST, which comprises three components: (i) +adversarial finetuning (AFT), (ii) representation-guided knowledge distillation +(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples +by initializing its parameters with a DNN that is standardly pretrained on +clean examples. RGKD and NR respectively entail a regularization term and an +algorithm to preserve latent representations of clean examples during AFT. RGKD +penalizes the distance between the representations of the standardly pretrained +and AFT DNNs. NR switches input adversarial examples to nonadversarial ones +when the representation changes significantly during AFT. By combining these +components, ARREST achieves both high standard accuracy and robustness. +Experimental results demonstrate that ARREST mitigates the tradeoff more +effectively than previous AT-based methods do. + +
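+ The RGKD component can be read as a representation-distance penalty added to the adversarial fine-tuning loss. The sketch below assumes models that return (logits, features) and uses a squared distance, both of which are illustrative choices rather than the paper's exact formulation.
+ import torch
+ import torch.nn.functional as F
+
+ def arrest_style_loss(aft_model, pretrained_model, x_adv, x_clean, y, lam: float = 1.0):
+     """Adversarial fine-tuning loss plus a representation-guided distillation term.
+     Both models are assumed to return (logits, features); the pretrained model is frozen."""
+     logits_adv, feat_adv = aft_model(x_adv)
+     with torch.no_grad():
+         _, feat_clean = pretrained_model(x_clean)
+     ce = F.cross_entropy(logits_adv, y)
+     rgkd = F.mse_loss(feat_adv, feat_clean)  # keep representations close to the clean-pretrained ones
+     return ce + lam * rgkd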
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Njobvu-AI: An open-source tool for collaborative image labeling and + implementation of computer vision models + + +
+ Practitioners interested in using computer vision models lack user-friendly +and open-source software that combines features to label training data, allow +multiple users, train new algorithms, review output, and implement new models. +Labeling training data, such as images, is a key step to developing accurate +object detection algorithms using computer vision. This step is often not +compatible with many cloud-based services for marking or labeling image and +video data due to limited internet bandwidth in many regions of the world. +Desktop tools are useful for groups working in remote locations, but users +often do not have the capability to combine projects developed locally by +multiple collaborators. Furthermore, many tools offer features for labeling +data or using pre-trained models for classification, but few allow researchers +to combine these steps to create and apply custom models. Free, open-source, +and user-friendly software that offers a full suite of features (e.g., ability +to work locally and online, and train custom models) is desirable to field +researchers and conservationists that may have limited coding skills. We +developed Njobvu-AI, a free, open-source tool that can be run on both desktop +and server hardware using Node.js, allowing users to label data, combine +projects for collaboration and review, train custom algorithms, and implement +new computer vision models. The name Njobvu-AI (pronounced N-joh-voo AI), +incorporating the Chichewa word for elephant, is inspired by a wildlife +monitoring program in Malawi that was a primary impetus for the development of +this tool and references similarities between the powerful memory of elephants +and properties of computer vision models. + +
+
+ comment: 13 pages, 6 figures. For code and documentation, see + https://github.com/sullichrosu/Njobvu-AI/ +
+
+
+
+
+ + ☆ Deformation Robust Text Spotting with Geometric Prior + + +
+ The goal of text spotting is to perform text detection and recognition in an +end-to-end manner. Although the diversity of luminosity and orientation in +scene texts has been widely studied, the font diversity and shape variance of +the same character are ignored in recent works, since most characters in +natural images are rendered in standard fonts. To solve this problem, we +present a Chinese Artistic Dataset, termed as ARText, which contains 33,000 +artistic images with rich shape deformation and font diversity. Based on this +database, we develop a deformation robust text spotting method (DR TextSpotter) +to solve the recognition problem of complex deformation of characters in +different fonts. Specifically, we propose a geometric prior module to highlight +the important features based on the unsupervised landmark detection +sub-network. A graph convolution network is further constructed to fuse the +character features and landmark features, and then performs semantic reasoning +to enhance the discrimination for different characters. The experiments are +conducted on ARText and IC19-ReCTS datasets. Our results demonstrate the +effectiveness of our proposed method. + +
+
+
+
+
+ + ☆ RGB-T Tracking via Multi-Modal Mutual Prompt Learning + + +
+ Object tracking based on the fusion of visible and thermal images, known as RGB-T tracking, has gained increasing attention from researchers in recent years. How to achieve a more comprehensive fusion of information from the two modalities with fewer computational costs has been a problem that researchers have been exploring. Recently, with the rise of prompt learning in computer vision, we can better transfer knowledge from visual large models to downstream tasks. Considering the strong complementarity between visible and thermal modalities, we propose a tracking architecture based on mutual prompt learning between the two modalities. We also design a lightweight prompter that incorporates attention mechanisms in two dimensions to transfer information from one modality to the other with lower computational costs, embedding it into each layer of the backbone. Extensive experiments have demonstrated that our proposed tracking architecture is effective and efficient, achieving state-of-the-art performance while maintaining high running speeds.
+
+ comment: 9 pages, 5 figures, 5 tables +
+
+
+
+
+ + ☆ Separate and Locate: Rethink the Text in Text-based Visual Question + Answering ACM MM 2023 + + +
+ Text-based Visual Question Answering (TextVQA) aims at answering questions +about the text in images. Most works in this field focus on designing network +structures or pre-training tasks. All these methods list the OCR texts in +reading order (from left to right and top to bottom) to form a sequence, which +is treated as a natural language ``sentence''. However, they ignore the fact +that most OCR words in the TextVQA task do not have a semantical contextual +relationship. In addition, these approaches use 1-D position embedding to +construct the spatial relation between OCR tokens sequentially, which is not +reasonable. The 1-D position embedding can only represent the left-right +sequence relationship between words in a sentence, but not the complex spatial +position relationship. To tackle these problems, we propose a novel method +named Separate and Locate (SaL) that explores text contextual cues and designs +spatial position embedding to construct spatial relations between OCR texts. +Specifically, we propose a Text Semantic Separate (TSS) module that helps the +model recognize whether words have semantic contextual relations. Then, we +introduce a Spatial Circle Position (SCP) module that helps the model better +construct and reason the spatial position relationships between OCR texts. Our +SaL model outperforms the baseline model by 4.44% and 3.96% accuracy on TextVQA +and ST-VQA datasets. Compared with the pre-training state-of-the-art method +pre-trained on 64 million pre-training samples, our method, without any +pre-training tasks, still achieves 2.68% and 2.52% accuracy improvement on +TextVQA and ST-VQA. Our code and models will be released at +https://github.com/fangbufang/SaL. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ 3D vision-based structural masonry damage detection + + +
+ The detection of masonry damage is essential for preventing potentially +disastrous outcomes. Manual inspection can, however, take a long time and be +hazardous to human inspectors. Automation of the inspection process using novel +computer vision and machine learning algorithms can be a more efficient and +safe solution to prevent further deterioration of the masonry structures. Most +existing 2D vision-based methods are limited to qualitative damage +classification, 2D localization, and in-plane quantification. In this study, we +present a 3D vision-based methodology for accurate masonry damage detection, +which offers a more robust solution with a greater field of view, depth of +vision, and the ability to detect failures in complex environments. First, +images of the masonry specimens are collected to generate a 3D point cloud. +Second, 3D point clouds processing methods are developed to evaluate the +masonry damage. We demonstrate the effectiveness of our approach through +experiments on structural masonry components. Our experiments showed the +proposed system can effectively classify damage states and localize and +quantify critical damage features. The result showed the proposed method can +improve the level of autonomy during the inspection of masonry structures. + +
+
+ comment: 10 pages, accepted in the Canadian Conference - Pacific Conference on + Earthquake Engineering 2023, Vancouver, British Columbia +
+
+
+
+
+ + ☆ Improving Multiple Sclerosis Lesion Segmentation Across Clinical Sites: + A Federated Learning Approach with Noise-Resilient Training + + +
+ Accurately measuring the evolution of Multiple Sclerosis (MS) with magnetic +resonance imaging (MRI) critically informs understanding of disease progression +and helps to direct therapeutic strategy. Deep learning models have shown +promise for automatically segmenting MS lesions, but the scarcity of accurately +annotated data hinders progress in this area. Obtaining sufficient data from a +single clinical site is challenging and does not address the heterogeneous need +for model robustness. Conversely, the collection of data from multiple sites +introduces data privacy concerns and potential label noise due to varying +annotation standards. To address this dilemma, we explore the use of the +federated learning framework while considering label noise. Our approach +enables collaboration among multiple clinical sites without compromising data +privacy under a federated learning paradigm that incorporates a noise-robust +training strategy based on label correction. Specifically, we introduce a +Decoupled Hard Label Correction (DHLC) strategy that considers the imbalanced +distribution and fuzzy boundaries of MS lesions, enabling the correction of +false annotations based on prediction confidence. We also introduce a Centrally +Enhanced Label Correction (CELC) strategy, which leverages the aggregated +central model as a correction teacher for all sites, enhancing the reliability +of the correction process. Extensive experiments conducted on two multi-site +datasets demonstrate the effectiveness and robustness of our proposed methods, +indicating their potential for clinical applications in multi-site +collaborations. + +
+
+ comment: 11 pages, 4 figures, journal submission +
+
+
+
+
+ + ☆ Typing on Any Surface: A Deep Learning-based Method for Real-Time + Keystroke Detection in Augmented Reality + + +
+ Frustrating text entry interfaces have been a major obstacle to participating
+in social activities in augmented reality (AR). Popular options, such as
+mid-air keyboard interfaces, wireless keyboards, or voice input, either suffer
+from poor ergonomic design, limited accuracy, or are simply embarrassing to use
+in public. This paper proposes and validates a deep-learning-based approach
+that enables AR applications to accurately predict keystrokes from the
+user-perspective RGB video stream that can be captured by any AR headset. This
+enables a user to perform typing activities on any flat surface and eliminates
+the need for a physical or virtual keyboard. A two-stage model, combining an
+off-the-shelf hand landmark extractor and a novel adaptive Convolutional
+Recurrent Neural Network (C-RNN), was trained using our newly built dataset.
+The final model was capable of adaptively processing user-perspective video
+streams at ~32 FPS. This base model achieved an overall accuracy of $91.05\%$
+when typing 40 Words per Minute (wpm), which is how fast an average person
+types with two hands on a physical keyboard. The Normalised Levenshtein
+Distance further confirmed the real-world applicability of our approach. The
+promising results highlight the viability of our approach and the potential
+for our method to be integrated into various applications. We also discuss the
+limitations and future research required to bring such a technique into a
+production system.
+
+
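The abstract describes a two-stage pipeline (hand landmark extraction followed by an adaptive C-RNN). The sketch below illustrates only the second stage with a generic convolutional-recurrent classifier over landmark sequences; the input dimensionality, class count, and layer layout are assumptions, not the paper's model.

```python
import torch
import torch.nn as nn

class KeystrokeCRNN(nn.Module):
    """Sketch of a convolutional-recurrent classifier over per-frame hand
    landmarks (e.g. 21 keypoints x 2 hands x (x, y, z) = 126 values).
    Illustrative; not the paper's adaptive C-RNN."""

    def __init__(self, in_dim: int = 126, hidden: int = 128, num_keys: int = 41):
        super().__init__()
        # 1-D convolutions over time extract short-range motion patterns.
        self.conv = nn.Sequential(
            nn.Conv1d(in_dim, hidden, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.Conv1d(hidden, hidden, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
        )
        # A GRU aggregates longer-range context across frames.
        self.rnn = nn.GRU(hidden, hidden, batch_first=True)
        # Per-frame prediction: one of the keys or a "no keystroke" class.
        self.head = nn.Linear(hidden, num_keys)

    def forward(self, landmarks: torch.Tensor) -> torch.Tensor:
        # landmarks: (batch, frames, in_dim)
        x = self.conv(landmarks.transpose(1, 2)).transpose(1, 2)
        x, _ = self.rnn(x)
        return self.head(x)  # (batch, frames, num_keys) logits

seq = torch.randn(1, 64, 126)   # 64 video frames of hand landmarks
logits = KeystrokeCRNN()(seq)
print(logits.shape)             # torch.Size([1, 64, 41])
```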
+
+
+
+
+ + ☆ Pose-Graph Attentional Graph Neural Network for Lidar Place Recognition + + +
+ This paper proposes a lidar place recognition approach, called P-GAT, to
+increase the receptive field between point clouds captured over time. Instead
+of comparing pairs of point clouds, we compare the similarity between sets of
+point clouds to exploit the maximum spatial and temporal information between
+neighbouring clouds, utilising the concept of pose-graph SLAM. Leveraging
+intra- and inter-attention and a graph neural network, P-GAT relates point
+clouds captured in nearby locations in Euclidean space and their embeddings in
+feature space. Experimental results on large-scale publicly available datasets
+demonstrate the effectiveness of our approach in recognising scenes lacking
+distinct features and when training and testing environments have different
+distributions (domain adaptation). Further, an exhaustive comparison with the
+state-of-the-art shows consistent performance gains. Code will be available
+upon acceptance.
+
+
+
+ comment: 8 pages, 3 figures, 5 tables +
+
+
+
+
+ + ☆ BuilDiff: 3D Building Shape Generation using Single-Image Conditional + Point Cloud Diffusion Models ICCV + + +
+ 3D building generation with low data acquisition costs, such as single
+image-to-3D, is becoming increasingly important. However, most existing single
+image-to-3D building creation works are restricted to images with specific
+viewing angles, hence they are difficult to scale to the general-view images
+that commonly appear in practical cases. To fill this gap, we propose a novel
+3D building shape generation method exploiting point cloud diffusion models
+with image conditioning schemes, which demonstrates flexibility with respect
+to the input images. By coupling two conditional diffusion models and
+introducing a regularization strategy during the denoising process, our method
+is able to synthesize building roofs while maintaining the overall structures.
+We validate our framework on two newly built datasets, and extensive
+experiments show that our method outperforms previous works in terms of
+building generation quality.
+
+
+
+ comment: 10 pages, 6 figures, accepted to ICCVW2023 +
+
+
+
+
+ + ☆ Optimized Deep Feature Selection for Pneumonia Detection: A Novel RegNet + and XOR-Based PSO Approach + + +
+ Pneumonia remains a significant cause of child mortality, particularly in
+developing countries where resources and expertise are limited. The automated
+detection of pneumonia can greatly assist in addressing this challenge. In this
+research, an XOR-based Particle Swarm Optimization (PSO) algorithm is proposed
+to select deep features from the second-to-last layer of a RegNet model, aiming
+to improve the accuracy of the CNN model on pneumonia detection. The proposed
+XOR PSO algorithm offers simplicity by incorporating just one hyperparameter
+for initialization, and each iteration requires minimal computation time.
+Moreover, it achieves a balance between exploration and exploitation, leading
+to convergence on a suitable solution. By extracting 163 features, an accuracy
+of 98% was attained, which is comparable to previous PSO-based methods. The
+source code of the proposed method is available in the GitHub repository.
+
+
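The exact XOR-based update rule is not given in the abstract. The following is a minimal sketch of a binary PSO in which particles move by XOR-ing away disagreements with their personal and global bests, applied to a toy feature-selection fitness; the update probabilities and the synthetic fitness function are assumptions for illustration only.

```python
import numpy as np

rng = np.random.default_rng(0)

def xor_pso(fitness, dim, n_particles=20, iters=50, p_mut=0.05):
    """Sketch of a binary, XOR-driven PSO: each particle flips (XOR) a
    random subset of the bits on which it disagrees with its personal and
    global bests. Illustrative only; not the paper's exact update rule."""
    pos = rng.integers(0, 2, size=(n_particles, dim), dtype=np.int8)
    pbest = pos.copy()
    pbest_fit = np.array([fitness(p) for p in pos])
    gbest = pbest[pbest_fit.argmax()].copy()

    for _ in range(iters):
        for i in range(n_particles):
            # Bits where the particle disagrees with its bests (XOR).
            toward_p = pos[i] ^ pbest[i]
            toward_g = pos[i] ^ gbest
            flip = (toward_p & (rng.random(dim) < 0.5)) | \
                   (toward_g & (rng.random(dim) < 0.5))
            # Occasional random flips keep exploration alive.
            flip |= rng.random(dim) < p_mut
            pos[i] ^= flip.astype(np.int8)
            f = fitness(pos[i])
            if f > pbest_fit[i]:
                pbest[i], pbest_fit[i] = pos[i].copy(), f
        gbest = pbest[pbest_fit.argmax()].copy()
    return gbest, pbest_fit.max()

# Toy fitness: recover a hidden "useful feature" mask (a stand-in for the
# validation accuracy of a classifier trained on the selected features).
hidden = rng.integers(0, 2, size=512)
mask, score = xor_pso(lambda m: (m == hidden).mean(), dim=512)
print(f"selected {mask.sum()} of 512 features, fitness={score:.3f}")
```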
+
+
+
+
+ + ☆ Self-supervised Semantic Segmentation: Consistency over Transformation ICCV 2023 + + +
+ Accurate medical image segmentation is of utmost importance for enabling
+automated clinical decision procedures. However, prevailing supervised deep
+learning approaches for medical image segmentation encounter significant
+challenges due to their heavy dependence on extensive labeled training data. To
+tackle this issue, we propose a novel self-supervised algorithm,
+\textbf{S$^3$-Net}, which integrates a robust framework based on the proposed
+Inception Large Kernel Attention (I-LKA) modules. This architectural
+enhancement makes it possible to comprehensively capture contextual information
+while preserving local intricacies, thereby enabling precise semantic
+segmentation. Furthermore, considering that lesions in medical images often
+exhibit deformations, we leverage deformable convolution as an integral
+component to effectively capture and delineate lesion deformations for superior
+object boundary definition. Additionally, our self-supervised strategy
+emphasizes the acquisition of invariance to affine transformations, which are
+commonly encountered in medical scenarios. This emphasis on robustness with
+respect to geometric distortions significantly enhances the model's ability to
+accurately model and handle such distortions. To enforce spatial consistency
+and promote the grouping of spatially connected image pixels with similar
+feature representations, we introduce a spatial consistency loss term. This
+aids the network in effectively capturing the relationships among neighboring
+pixels and enhancing the overall segmentation quality. The S$^3$-Net approach
+iteratively learns pixel-level feature representations for image content
+clustering in an end-to-end manner. Our experimental results on skin lesion and
+lung organ segmentation tasks show the superior performance of our method
+compared to the SOTA approaches. https://github.com/mindflow-institue/SSCT
+
+
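A spatial consistency term of the kind described can be sketched as follows: neighbouring pixels with similar feature vectors are penalised for receiving different class assignments. The formulation below (affinity via a sigmoid of feature dot products, L1 disagreement of softmax outputs) is an assumption for illustration, not the paper's exact loss.

```python
import torch
import torch.nn.functional as F

def spatial_consistency_loss(features: torch.Tensor,
                             logits: torch.Tensor,
                             temperature: float = 0.1) -> torch.Tensor:
    """Sketch of a spatial consistency term: neighbouring pixels whose
    features are similar are encouraged to receive similar class
    assignments. Illustrative only; not the paper's exact formulation."""
    probs = F.softmax(logits, dim=1)          # (B, K, H, W)
    feats = F.normalize(features, dim=1)      # (B, C, H, W)
    loss = logits.new_zeros(())
    # Compare each pixel with its right and bottom neighbours.
    for dy, dx in ((0, 1), (1, 0)):
        f_a = feats[..., :feats.shape[-2] - dy, :feats.shape[-1] - dx]
        f_b = feats[..., dy:, dx:]
        p_a = probs[..., :probs.shape[-2] - dy, :probs.shape[-1] - dx]
        p_b = probs[..., dy:, dx:]
        # Affinity in (0, 1): high for similar feature vectors.
        affinity = ((f_a * f_b).sum(dim=1) / temperature).sigmoid()
        # Penalise prediction disagreement weighted by affinity.
        disagreement = (p_a - p_b).abs().sum(dim=1)
        loss = loss + (affinity * disagreement).mean()
    return loss

feats = torch.randn(2, 64, 32, 32)
logits = torch.randn(2, 4, 32, 32)
print(spatial_consistency_loss(feats, logits))
```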
+
+ comment: Accepted in ICCV 2023 workshop CVAMD +
+
+
+
+
+ + ☆ Improving vision-inspired keyword spotting using dynamic module skipping + in streaming conformer encoder + + +
+ Using a vision-inspired keyword spotting framework, we propose an +architecture with input-dependent dynamic depth capable of processing streaming +audio. Specifically, we extend a conformer encoder with trainable binary gates +that allow us to dynamically skip network modules according to the input audio. +Our approach improves detection and localization accuracy on continuous speech +using Librispeech top-1000 most frequent words while maintaining a small memory +footprint. The inclusion of gates also reduces the average amount of processing +without affecting the overall performance. These benefits are shown to be even +more pronounced using the Google speech commands dataset placed over background +noise where up to 97% of the processing is skipped on non-speech inputs, +therefore making our method particularly interesting for an always-on keyword +spotter. + +
+
+
+
+
+ + ☆ Distraction-free Embeddings for Robust VQA + + +
+ The generation of effective latent representations and their subsequent +refinement to incorporate precise information is an essential prerequisite for +Vision-Language Understanding (VLU) tasks such as Video Question Answering +(VQA). However, most existing methods for VLU focus on sparsely sampling or +fine-graining the input information (e.g., sampling a sparse set of frames or +text tokens), or adding external knowledge. We present a novel "DRAX: +Distraction Removal and Attended Cross-Alignment" method to rid our cross-modal +representations of distractors in the latent space. We do not exclusively +confine the perception of any input information from various modalities but +instead use an attention-guided distraction removal method to increase focus on +task-relevant information in latent embeddings. DRAX also ensures semantic +alignment of embeddings during cross-modal fusions. We evaluate our approach on +a challenging benchmark (SUTD-TrafficQA dataset), testing the framework's +abilities for feature and event queries, temporal relation understanding, +forecasting, hypothesis, and causal analysis through extensive experiments. + +
+
+
+
+
+ + ☆ Segmentação e contagem de troncos de madeira utilizando deep learning e processamento de imagens (Segmentation and counting of wood logs using deep learning and image processing) + + +
+ Counting objects in images is a pattern recognition problem that focuses on
+identifying an element to determine its incidence, and is referred to in the
+literature as Visual Object Counting (VOC). In this work, we propose a
+methodology to count wood logs. First, wood logs are segmented from the image
+background. This first segmentation step is obtained using the Pix2Pix
+framework that implements Conditional Generative Adversarial Networks (CGANs).
+Second, the clusters are counted using Connected Components. The average
+accuracy of the segmentation exceeds 89%, while the average proportion of wood
+logs identified, relative to the total counted, is over 97%.
+
+
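The counting stage (connected components over a binary segmentation mask) can be sketched as below, assuming OpenCV is available; the morphological cleaning step and the minimum-area threshold are illustrative choices, not taken from the paper.

```python
import cv2
import numpy as np

def count_logs(mask: np.ndarray, min_area: int = 50) -> int:
    """Sketch of the counting stage: given a binary foreground mask (e.g.
    produced by a Pix2Pix segmentation network), count wood logs as
    connected components, discarding tiny specks. Thresholds are
    illustrative."""
    mask = (mask > 0).astype(np.uint8)
    # Light morphological opening removes noise and thin bridges between logs.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (5, 5))
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
    num_labels, _, stats, _ = cv2.connectedComponentsWithStats(mask)
    # Label 0 is the background; keep components above a minimum area.
    areas = stats[1:, cv2.CC_STAT_AREA]
    return int((areas >= min_area).sum())

# Toy example: three filled circles stand in for segmented log faces.
canvas = np.zeros((200, 200), dtype=np.uint8)
for cx, cy in [(50, 50), (120, 60), (80, 150)]:
    cv2.circle(canvas, (cx, cy), 20, 255, thickness=-1)
print(count_logs(canvas))  # 3
```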
+
+ comment: in Portuguese language, International Conference on Production + Engineering - Americas 2022 +
+
+
+
+
+ + ☆ Beyond Self-Attention: Deformable Large Kernel Attention for Medical + Image Segmentation + + +
+ Medical image segmentation has seen significant improvements with transformer
+models, which excel in grasping far-reaching contexts and global contextual
+information. However, the increasing computational demands of these models,
+proportional to the squared token count, limit their depth and resolution
+capabilities. Most current methods process 3D volumetric image data
+slice-by-slice (called pseudo 3D), missing crucial inter-slice information and
+thus reducing the model's overall performance. To address these challenges, we
+introduce the concept of \textbf{Deformable Large Kernel Attention (D-LKA
+Attention)}, a streamlined attention mechanism employing large convolution
+kernels to fully appreciate volumetric context. This mechanism operates within
+a receptive field akin to self-attention while sidestepping the computational
+overhead. Additionally, our proposed attention mechanism benefits from
+deformable convolutions to flexibly warp the sampling grid, enabling the model
+to adapt appropriately to diverse data patterns. We designed both 2D and 3D
+adaptations of the D-LKA Attention, with the latter excelling in cross-depth
+data understanding. Together, these components shape our novel hierarchical
+Vision Transformer architecture, the \textit{D-LKA Net}. Evaluations of our
+model against leading methods on popular medical segmentation datasets
+(Synapse, NIH Pancreas, and Skin lesion) demonstrate its superior performance.
+Our code implementation is publicly available at:
+https://github.com/mindflow-institue/deformableLKA
+
+
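The large-kernel part of an attention mechanism like this typically follows the usual decomposition of a large kernel into a depthwise convolution, a dilated depthwise convolution, and a pointwise convolution. The sketch below shows that decomposition only and omits the deformable sampling grid; layer sizes are illustrative assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class LargeKernelAttention(nn.Module):
    """Sketch of large-kernel attention: a depthwise conv, a dilated
    depthwise conv and a pointwise conv approximate a large receptive
    field cheaply; the result gates the input features. The paper
    additionally deforms the sampling grid (deformable convolutions),
    which this sketch omits."""

    def __init__(self, channels: int):
        super().__init__()
        self.dw = nn.Conv2d(channels, channels, 5, padding=2, groups=channels)
        self.dw_dilated = nn.Conv2d(channels, channels, 7, padding=9,
                                    dilation=3, groups=channels)
        self.pw = nn.Conv2d(channels, channels, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn = self.pw(self.dw_dilated(self.dw(x)))
        return x * attn  # the attention map gates the input

x = torch.randn(1, 64, 56, 56)
print(LargeKernelAttention(64)(x).shape)  # torch.Size([1, 64, 56, 56])
```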
+
+
+
+
+ + ☆ Laplacian-Former: Overcoming the Limitations of Vision Transformers in + Local Texture Detection MICCAI 2023 + + +
+ Vision Transformer (ViT) models have demonstrated a breakthrough in a wide
+range of computer vision tasks. However, compared to the Convolutional Neural
+Network (CNN) models, it has been observed that the ViT models struggle to
+capture high-frequency components of images, which can limit their ability to
+detect local textures and edge information. As abnormalities in human tissue,
+such as tumors and lesions, may greatly vary in structure, texture, and shape,
+high-frequency information such as texture is crucial for effective semantic
+segmentation tasks. To address this limitation in ViT models, we propose a new
+technique, Laplacian-Former, that enhances the self-attention map by adaptively
+re-calibrating the frequency information in a Laplacian pyramid. More
+specifically, our proposed method utilizes a dual attention mechanism combining
+efficient attention and frequency attention: the efficient attention mechanism
+reduces the complexity of self-attention to linear while producing the same
+output, and the frequency attention selectively intensifies the contribution of
+shape and texture features. Furthermore, we introduce a novel efficient
+enhancement multi-scale bridge that effectively transfers spatial information
+from the encoder to the decoder while preserving the fundamental features. We
+demonstrate the efficacy of Laplacian-Former on multi-organ and skin lesion
+segmentation tasks with +1.87\% and +0.76\% Dice score improvements over SOTA
+approaches, respectively. Our implementation is publicly available at
+https://github.com/mindflow-institue/Laplacian-Former
+
+
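The frequency re-calibration idea can be illustrated with a plain Laplacian pyramid: split features into high-frequency bands, re-weight them, and reconstruct. The sketch below uses bilinear down/up-sampling and a single scalar gain as simplifying assumptions; the paper performs this re-calibration inside its attention mechanism.

```python
import torch
import torch.nn.functional as F

def laplacian_bands(x: torch.Tensor, levels: int = 3):
    """Split a feature map into Laplacian pyramid bands using bilinear
    down/up-sampling. Sketch only."""
    bands, current = [], x
    for _ in range(levels):
        down = F.interpolate(current, scale_factor=0.5, mode="bilinear",
                             align_corners=False)
        up = F.interpolate(down, size=current.shape[-2:], mode="bilinear",
                           align_corners=False)
        bands.append(current - up)   # high-frequency residual at this level
        current = down
    bands.append(current)            # low-frequency residue
    return bands

def recalibrate(x: torch.Tensor, hf_gain: float = 1.5) -> torch.Tensor:
    """Amplify the high-frequency bands (texture/edges) and reconstruct."""
    bands = laplacian_bands(x)
    out = bands[-1]
    for band in reversed(bands[:-1]):
        out = F.interpolate(out, size=band.shape[-2:], mode="bilinear",
                            align_corners=False)
        out = out + hf_gain * band
    return out

x = torch.randn(1, 32, 64, 64)
print(recalibrate(x).shape)  # torch.Size([1, 32, 64, 64])
```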
+
+ comment: Accepted in the main conference MICCAI 2023 +
+
+
+
+
+ + ☆ Unsupervised evaluation of GAN sample quality: Introducing the TTJac + Score + + +
+ Evaluation metrics are essential for assessing the performance of generative +models in image synthesis. However, existing metrics often involve high memory +and time consumption as they compute the distance between generated samples and +real data points. In our study, the new evaluation metric called the "TTJac +score" is proposed to measure the fidelity of individual synthesized images in +a data-free manner. The study first establishes a theoretical approach to +directly evaluate the generated sample density. Then, a method incorporating +feature extractors and discrete function approximation through tensor train is +introduced to effectively assess the quality of generated samples. Furthermore, +the study demonstrates that this new metric can be used to improve the +fidelity-variability trade-off when applying the truncation trick. The +experimental results of applying the proposed metric to StyleGAN 2 and StyleGAN +2 ADA models on FFHQ, AFHQ-Wild, LSUN-Cars, and LSUN-Horse datasets are +presented. The code used in this research will be made publicly available +online for the research community to access and utilize. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Open-Vocabulary Semantic Segmentation via Attribute + Decomposition-Aggregation + + +
+ Open-vocabulary semantic segmentation is a challenging task that requires
+segmenting novel object categories at inference time. Recent works explore
+vision-language pre-training to handle this task, but suffer from unrealistic
+assumptions in practical scenarios, i.e., low-quality textual category names.
+For example, this paradigm assumes that new textual categories will be
+accurately and completely provided, and exist in lexicons during pre-training.
+However, exceptions often arise: brief or incomplete names can be ambiguous,
+new words may be absent from the pre-trained lexicons, and some categories are
+difficult for users to describe. To address these issues, this work proposes a
+novel decomposition-aggregation framework, inspired by human cognition in
+understanding new concepts. Specifically, in the decomposition stage, we
+decouple class names into diverse attribute descriptions to enrich semantic
+contexts. Two attribute construction strategies are designed: using large
+language models for common categories, and manual labelling for human-invented
+categories. In the aggregation stage, we group diverse attributes into an
+integrated global description, to form a discriminative classifier that
+distinguishes the target object from others. A hierarchical aggregation is
+further designed to achieve multi-level alignment and deep fusion between
+vision and text. The final result is obtained by computing the embedding
+similarity between aggregated attributes and images. To evaluate the
+effectiveness, we annotate three datasets with attribute descriptions, and
+conduct extensive experiments and ablation studies. The results show the
+superior performance of attribute decomposition-aggregation.
+
+
+
+
+
+
+ + ☆ Few-shot Diagnosis of Chest x-rays Using an Ensemble of Random + Discriminative Subspaces ICLR + + +
+ Due to the scarcity of annotated data in the medical domain, few-shot
+learning may be useful for medical image analysis tasks. We design a few-shot
+learning method using an ensemble of random subspaces for the diagnosis of
+chest x-rays (CXRs). Our design is computationally efficient and almost 1.8
+times faster than a method that uses the popular truncated singular value
+decomposition (t-SVD) for subspace decomposition. The proposed method is
+trained by minimizing a novel loss function that helps create well-separated
+clusters of training data in discriminative subspaces. As a result, minimizing
+the loss maximizes the distance between the subspaces, making them
+discriminative and assisting in better classification. Experiments on
+large-scale publicly available CXR datasets yield promising results. Code for
+the project will be available at
+https://github.com/Few-shot-Learning-on-chest-x-ray/fsl_subspace.
+
+
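A minimal sketch of classification with an ensemble of random subspaces is given below: features are projected onto random orthonormal bases and queries are assigned to the nearest class centroid, averaged over the ensemble. The discriminative training of the subspaces described in the abstract is omitted, and all sizes are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)

def random_subspace_ensemble(support_x, support_y, query_x,
                             n_subspaces=10, sub_dim=32):
    """Sketch: classify query features by nearest class centroid inside
    several random orthonormal subspaces and accumulate the (negative)
    distances across the ensemble. Illustrative; the paper additionally
    trains the subspaces with a discriminative loss."""
    classes = np.unique(support_y)
    scores = np.zeros((len(query_x), len(classes)))
    d = support_x.shape[1]
    for _ in range(n_subspaces):
        # Random orthonormal basis via QR decomposition.
        basis, _ = np.linalg.qr(rng.standard_normal((d, sub_dim)))
        s_proj, q_proj = support_x @ basis, query_x @ basis
        centroids = np.stack([s_proj[support_y == c].mean(axis=0)
                              for c in classes])
        dists = np.linalg.norm(q_proj[:, None, :] - centroids[None], axis=-1)
        scores += -dists
    return classes[scores.argmax(axis=1)]

# Toy few-shot episode: 2 classes, 5 support samples each, 512-d features.
support_x = np.vstack([rng.normal(0, 1, (5, 512)), rng.normal(1, 1, (5, 512))])
support_y = np.array([0] * 5 + [1] * 5)
query_x = rng.normal(1, 1, (3, 512))
print(random_subspace_ensemble(support_x, support_y, query_x))
```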
+
+ comment: ICLR MLGH Workshop 2023 +
+
+
+
+
+ + ☆ SoDaCam: Software-defined Cameras via Single-Photon Imaging ICCV 2023 + + +
+ Reinterpretable cameras are defined by their post-processing capabilities +that exceed traditional imaging. We present "SoDaCam" that provides +reinterpretable cameras at the granularity of photons, from photon-cubes +acquired by single-photon devices. Photon-cubes represent the spatio-temporal +detections of photons as a sequence of binary frames, at frame-rates as high as +100 kHz. We show that simple transformations of the photon-cube, or photon-cube +projections, provide the functionality of numerous imaging systems including: +exposure bracketing, flutter shutter cameras, video compressive systems, event +cameras, and even cameras that move during exposure. Our photon-cube +projections offer the flexibility of being software-defined constructs that are +only limited by what is computable, and shot-noise. We exploit this flexibility +to provide new capabilities for the emulated cameras. As an added benefit, our +projections provide camera-dependent compression of photon-cubes, which we +demonstrate using an implementation of our projections on a novel compute +architecture that is designed for single-photon imaging. + +
+
+ comment: Accepted at ICCV 2023 (oral). Project webpage can be found at + https://wisionlab.com/project/sodacam/ +
+
+
+
+
+ + ☆ STint: Self-supervised Temporal Interpolation for Geospatial Data + + +
+ Supervised and unsupervised techniques have demonstrated the potential for +temporal interpolation of video data. Nevertheless, most prevailing temporal +interpolation techniques hinge on optical flow, which encodes the motion of +pixels between video frames. On the other hand, geospatial data exhibits lower +temporal resolution while encompassing a spectrum of movements and deformations +that challenge several assumptions inherent to optical flow. In this work, we +propose an unsupervised temporal interpolation technique, which does not rely +on ground truth data or require any motion information like optical flow, thus +offering a promising alternative for better generalization across geospatial +domains. Specifically, we introduce a self-supervised technique of dual cycle +consistency. Our proposed technique incorporates multiple cycle consistency +losses, which result from interpolating two frames between consecutive input +frames through a series of stages. This dual cycle consistent constraint causes +the model to produce intermediate frames in a self-supervised manner. To the +best of our knowledge, this is the first attempt at unsupervised temporal +interpolation without the explicit use of optical flow. Our experimental +evaluations across diverse geospatial datasets show that STint significantly +outperforms existing state-of-the-art methods for unsupervised temporal +interpolation. + +
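One way to realise a cycle-consistency objective without ground-truth intermediate frames is sketched below: the network generates a half frame and two quarter frames, and interpolating between the quarter frames must reproduce the half frame. Both the toy network and the single cycle term are assumptions for illustration, not the paper's dual cycle formulation.

```python
import torch
import torch.nn as nn

class MidFrameNet(nn.Module):
    """Toy stand-in for an interpolation network G(a, b) -> frame halfway
    between a and b. Illustrative architecture only."""
    def __init__(self, channels: int = 3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(2 * channels, 32, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(32, channels, 3, padding=1),
        )

    def forward(self, a, b):
        return self.net(torch.cat([a, b], dim=1))

def cycle_consistency_loss(G, x0, x1):
    """Sketch of a cycle term without ground-truth intermediates: the
    midpoint of the two generated quarter frames should agree with the
    generated half frame. Not the paper's exact loss."""
    half = G(x0, x1)            # t = 0.5
    quarter = G(x0, half)       # t = 0.25
    three_q = G(half, x1)       # t = 0.75
    cycle_mid = G(quarter, three_q)
    return nn.functional.l1_loss(cycle_mid, half)

G = MidFrameNet()
x0, x1 = torch.rand(1, 3, 64, 64), torch.rand(1, 3, 64, 64)
loss = cycle_consistency_loss(G, x0, x1)
loss.backward()
print(float(loss))
```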
+
+
+
+
+ + ☆ Bellybutton: Accessible and Customizable Deep-Learning Image + Segmentation + + +
+ The conversion of raw images into quantifiable data can be a major hurdle in
+experimental research, and typically involves identifying region(s) of
+interest, a process known as segmentation. Machine learning tools for image
+segmentation are often specific to a set of tasks, such as tracking cells, or
+require substantial compute or coding knowledge to train and use. Here we
+introduce Bellybutton, an easy-to-use (no coding required) image segmentation
+method using a 15-layer convolutional neural network that can be trained on a
+laptop. The algorithm trains on user-provided segmentations of example images,
+but, as we show, just one or even a portion of one training image can be
+sufficient in some cases. We detail the machine learning method and give three
+use cases where Bellybutton correctly segments images despite substantial
+lighting, shape, size, focus, and/or structure variation across the region(s)
+of interest. Instructions for easy download and use, with further details and
+the datasets used in this paper, are available at
+pypi.org/project/Bellybuttonseg.
+
+
+
+ comment: 6 Pages 3 Figures +
+
+
+
+
+ + ☆ FACET: Fairness in Computer Vision Evaluation Benchmark + + +
+ Computer vision models have known performance disparities across attributes +such as gender and skin tone. This means during tasks such as classification +and detection, model performance differs for certain classes based on the +demographics of the people in the image. These disparities have been shown to +exist, but until now there has not been a unified approach to measure these +differences for common use-cases of computer vision models. We present a new +benchmark named FACET (FAirness in Computer Vision EvaluaTion), a large, +publicly available evaluation set of 32k images for some of the most common +vision tasks - image classification, object detection and segmentation. For +every image in FACET, we hired expert reviewers to manually annotate +person-related attributes such as perceived skin tone and hair type, manually +draw bounding boxes and label fine-grained person-related classes such as disk +jockey or guitarist. In addition, we use FACET to benchmark state-of-the-art +vision models and present a deeper understanding of potential performance +disparities and challenges across sensitive demographic attributes. With the +exhaustive annotations collected, we probe models using single demographics +attributes as well as multiple attributes using an intersectional approach +(e.g. hair color and perceived skin tone). Our results show that +classification, detection, segmentation, and visual grounding models exhibit +performance disparities across demographic attributes and intersections of +attributes. These harms suggest that not all people represented in datasets +receive fair and equitable treatment in these vision tasks. We hope current and +future results using our benchmark will contribute to fairer, more robust +vision models. FACET is available publicly at https://facet.metademolab.com/ + +
+
+
+
+
+ + ☆ Audio-Driven Dubbing for User Generated Contents via Style-Aware + Semi-Parametric Synthesis + + +
+ Existing automated dubbing methods are usually designed for Professionally
+Generated Content (PGC) production, which requires massive training data and
+training time to learn a person-specific audio-video mapping. In this paper, we
+investigate an audio-driven dubbing method that is more feasible for User
+Generated Content (UGC) production. There are two unique challenges in
+designing a method for UGC: 1) the appearances of speakers are diverse and
+arbitrary as the method needs to generalize across users; 2) the available
+video data of one speaker are very limited. In order to tackle the above
+challenges, we first introduce a new Style Translation Network to integrate the
+speaking style of the target and the speaking content of the source via a
+cross-modal AdaIN module. It enables our model to quickly adapt to a new
+speaker. Then, we further develop a semi-parametric video renderer, which takes
+full advantage of the limited training data of the unseen speaker via a
+video-level retrieve-warp-refine pipeline. Finally, we propose a temporal
+regularization for the semi-parametric renderer, generating more continuous
+videos. Extensive experiments show that our method generates videos that
+accurately preserve various speaking styles, yet with a considerably lower
+amount of training data and training time in comparison to existing methods.
+In addition, our method achieves a faster testing speed than most recent
+methods.
+
+
+
+ comment: TCSVT 2022 +
+
+
+
+
+ + ☆ Vision-Based Cranberry Crop Ripening Assessment + + +
+ Agricultural domains are being transformed by recent advances in AI and +computer vision that support quantitative visual evaluation. Using drone +imaging, we develop a framework for characterizing the ripening process of +cranberry crops. Our method consists of drone-based time-series collection over +a cranberry growing season, photometric calibration for albedo recovery from +pixels, and berry segmentation with semi-supervised deep learning networks +using point-click annotations. By extracting time-series berry albedo +measurements, we evaluate four different varieties of cranberries and provide a +quantification of their ripening rates. Such quantification has practical +implications for 1) assessing real-time overheating risks for cranberry bogs; +2) large scale comparisons of progeny in crop breeding; 3) detecting disease by +looking for ripening pattern outliers. This work is the first of its kind in +quantitative evaluation of ripening using computer vision methods and has +impact beyond cranberry crops including wine grapes, olives, blueberries, and +maize. + +
+
+
+
+
+ + ☆ A Sequential Framework for Detection and Classification of Abnormal + Teeth in Panoramic X-rays + + +
+ This paper describes our solution for the Dental Enumeration and Diagnosis on
+Panoramic X-rays Challenge at MICCAI 2023. Our approach consists of a
+multi-step framework tailored to the task of detecting and classifying abnormal
+teeth. The solution includes three sequential stages: dental instance
+detection, healthy instance filtering, and abnormal instance classification. In
+the first stage, we employed a Faster-RCNN model for detecting and identifying
+teeth. In subsequent stages, we designed a model that merged the encoding
+pathway of a pretrained U-net, optimized for dental lesion detection, with the
+VGG16 architecture. The resulting model was first used for filtering out
+healthy teeth. Then, any identified abnormal teeth were categorized,
+potentially falling into one or more of the following conditions: embedded,
+periapical lesion, caries, or deep caries. The model performing dental instance
+detection achieved an AP score of 0.49. The model responsible for identifying
+healthy teeth attained an F1 score of 0.71. Meanwhile, the model trained for
+multi-label dental disease classification achieved an F1 score of 0.76. The
+code is available at
+https://github.com/tudordascalu/2d-teeth-detection-challenge.
+
+
+
+
+
+
+ + ♻ ☆ Motion Matters: Neural Motion Transfer for Better Camera Physiological + Measurement + + +
+ Machine learning models for camera-based physiological measurement can have +weak generalization due to a lack of representative training data. Body motion +is one of the most significant sources of noise when attempting to recover the +subtle cardiac pulse from a video. We explore motion transfer as a form of data +augmentation to introduce motion variation while preserving physiological +changes of interest. We adapt a neural video synthesis approach to augment +videos for the task of remote photoplethysmography (rPPG) and study the effects +of motion augmentation with respect to 1) the magnitude and 2) the type of +motion. After training on motion-augmented versions of publicly available +datasets, we demonstrate a 47% improvement over existing inter-dataset results +using various state-of-the-art methods on the PURE dataset. We also present +inter-dataset results on five benchmark datasets to show improvements of up to +79% using TS-CAN, a neural rPPG estimation method. Our findings illustrate the +usefulness of motion transfer as a data augmentation technique for improving +the generalization of models for camera-based physiological sensing. We release +our code for using motion transfer as a data augmentation technique on three +publicly available datasets, UBFC-rPPG, PURE, and SCAMPS, and models +pre-trained on motion-augmented data here: https://motion-matters.github.io/ + +
+
+ comment: 17 pages, 6 figures, 15 tables +
+
+
+
+
+ + ♻ ☆ StyleGAN as a Utility-Preserving Face De-identification Method + + +
+ Face de-identification methods have been proposed to preserve users' privacy +by obscuring their faces. These methods, however, can degrade the quality of +photos, and they usually do not preserve the utility of faces, i.e., their age, +gender, pose, and facial expression. Recently, GANs, such as StyleGAN, have +been proposed, which generate realistic, high-quality imaginary faces. In this +paper, we investigate the use of StyleGAN in generating de-identified faces +through style mixing. We examined this de-identification method for preserving +utility and privacy by implementing several face detection, verification, and +identification attacks and conducting a user study. The results from our +extensive experiments, human evaluation, and comparison with two +state-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN +performs on par or better than these methods, preserving users' privacy and +images' utility. In particular, the results of the machine learning-based +experiments show that StyleGAN0-4 preserves utility better than CIAGAN and +DeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves +utility at the same level while providing more privacy. In this paper, for the +first time, we also performed a carefully designed user study to examine both +privacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well +as CIAGAN and DeepPrivacy from the human observers' perspectives. Our +statistical tests showed that participants tend to verify and identify +StyleGAN0-5 images more easily than DeepPrivacy images. All the methods but +StyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding +utility, as expected, StyleGAN0-5 performed significantly better in preserving +some attributes. Among all methods, on average, participants believe gender has +been preserved the most while naturalness has been preserved the least. + +
+
+
+
+
+ + ♻ ☆ Humans in 4D: Reconstructing and Tracking Humans with Transformers ICCV 2023 + + +
+ We present an approach to reconstruct humans and track them over time. At the +core of our approach, we propose a fully "transformerized" version of a network +for human mesh recovery. This network, HMR 2.0, advances the state of the art +and shows the capability to analyze unusual poses that have in the past been +difficult to reconstruct from single images. To analyze video, we use 3D +reconstructions from HMR 2.0 as input to a tracking system that operates in 3D. +This enables us to deal with multiple people and maintain identities through +occlusion events. Our complete approach, 4DHumans, achieves state-of-the-art +results for tracking people from monocular video. Furthermore, we demonstrate +the effectiveness of HMR 2.0 on the downstream task of action recognition, +achieving significant improvements over previous pose-based action recognition +approaches. Our code and models are available on the project website: +https://shubham-goel.github.io/4dhumans/. + +
+
+ comment: In ICCV 2023. Project Webpage: + https://shubham-goel.github.io/4dhumans/ +
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link +quality prediction for millimeter-wave (mmWave) communications. Previous +studies have proposed machine learning-based methods to predict received signal +strength for future time periods using time series of depth images to mitigate +the line-of-sight (LOS) path blockage by pedestrians in mmWave communication. +However, these image-based methods have limited applicability due to privacy +concerns as camera images may contain sensitive information. This study +proposes a point cloud-based method for mmWave link quality prediction and +demonstrates its feasibility through experiments. Point clouds represent +three-dimensional (3D) spaces as a set of points and are sparser and less +likely to contain sensitive information than camera images. Additionally, point +clouds provide 3D position and motion information, which is necessary for +understanding the radio propagation environment involving pedestrians. This +study designs the mmWave link quality prediction method and conducts realistic +indoor experiments, where the link quality fluctuates significantly due to +human blockage, using commercially available IEEE 802.11ad-based 60 GHz +wireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light +detection and ranging (LiDAR) for point cloud acquisition. The experimental +results showed that our proposed method can predict future large attenuation of +mmWave received signal strength and throughput induced by the LOS path blockage +by pedestrians with comparable or superior accuracy to image-based prediction +methods. Hence, our point cloud-based method can serve as a viable alternative +to image-based methods. + +
+
+ comment: Submitted to IEEE Transactions on Machine Learning in Communications + and Networking +
+
+
+
+
+ + ♻ ☆ RoboTAP: Tracking Arbitrary Points for Few-Shot Visual Imitation + + +
+ For robots to be useful outside labs and specialized factories we need a way +to teach them new useful behaviors quickly. Current approaches lack either the +generality to onboard new tasks without task-specific engineering, or else lack +the data-efficiency to do so in an amount of time that enables practical use. +In this work we explore dense tracking as a representational vehicle to allow +faster and more general learning from demonstration. Our approach utilizes +Track-Any-Point (TAP) models to isolate the relevant motion in a demonstration, +and parameterize a low-level controller to reproduce this motion across changes +in the scene configuration. We show this results in robust robot policies that +can solve complex object-arrangement tasks such as shape-matching, stacking, +and even full path-following tasks such as applying glue and sticking objects +together, all from demonstrations that can be collected in minutes. + +
+
+ comment: Project website: https://robotap.github.io +
+
+
+
+
+ + ♻ ☆ 6D Object Pose Estimation from Approximate 3D Models for Orbital + Robotics IROS + + +
+ We present a novel technique to estimate the 6D pose of objects from single +images where the 3D geometry of the object is only given approximately and not +as a precise 3D model. To achieve this, we employ a dense 2D-to-3D +correspondence predictor that regresses 3D model coordinates for every pixel. +In addition to the 3D coordinates, our model also estimates the pixel-wise +coordinate error to discard correspondences that are likely wrong. This allows +us to generate multiple 6D pose hypotheses of the object, which we then refine +iteratively using a highly efficient region-based approach. We also introduce a +novel pixel-wise posterior formulation by which we can estimate the probability +for each hypothesis and select the most likely one. As we show in experiments, +our approach is capable of dealing with extreme visual conditions including +overexposure, high contrast, or low signal-to-noise ratio. This makes it a +powerful technique for the particularly challenging task of estimating the pose +of tumbling satellites for in-orbit robotic applications. Our method achieves +state-of-the-art performance on the SPEED+ dataset and has won the SPEC2021 +post-mortem competition. + +
+
+ comment: Proceedings of IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS) +
+
+
+
+
+ + ♻ ☆ DUFormer: Solving Power Line Detection Task in Aerial Images using + Semantic Segmentation + + +
+ Unmanned aerial vehicles (UAVs) are frequently used for inspecting power
+lines and capturing high-resolution aerial images. However, detecting power
+lines in aerial images is difficult, as the foreground data (i.e., power lines)
+is small and the background information is abundant. To tackle this problem, we
+introduce DUFormer, a semantic segmentation algorithm explicitly designed to
+detect power lines in aerial images. We presuppose that it is advantageous to
+train an efficient Transformer model with sufficient feature extraction using a
+convolutional neural network (CNN) with a strong inductive bias. With this goal
+in mind, we introduce a heavy token encoder that performs overlapping feature
+remodeling and tokenization. The encoder comprises a pyramid CNN feature
+extraction module and a power line feature enhancement module. After successful
+local feature extraction for power lines, feature fusion is conducted. Then,
+the Transformer block is used for global modeling. The final segmentation
+result is achieved by amalgamating local and global features in the decoder
+head. Moreover, we demonstrate the importance of the joint multi-weight loss
+function in power line segmentation. Our experimental results show that our
+proposed method outperforms all state-of-the-art methods in power line
+segmentation on the publicly accessible TTPLA dataset.
+
+
+
+
+
+
+ + ♻ ☆ RBSR: Efficient and Flexible Recurrent Network for Burst + Super-Resolution + + +
+ Burst super-resolution (BurstSR) aims at reconstructing a high-resolution +(HR) image from a sequence of low-resolution (LR) and noisy images, which is +conducive to enhancing the imaging effects of smartphones with limited sensors. +The main challenge of BurstSR is to effectively combine the complementary +information from input frames, while existing methods still struggle with it. +In this paper, we suggest fusing cues frame-by-frame with an efficient and +flexible recurrent network. In particular, we emphasize the role of the +base-frame and utilize it as a key prompt to guide the knowledge acquisition +from other frames in every recurrence. Moreover, we introduce an implicit +weighting loss to improve the model's flexibility in facing input frames with +variable numbers. Extensive experiments on both synthetic and real-world +datasets demonstrate that our method achieves better results than +state-of-the-art ones. Codes and pre-trained models are available at +https://github.com/ZcsrenlongZ/RBSR. + +
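The frame-by-frame fusion with the base frame as a guiding prompt can be sketched as a simple recurrent update in which the base-frame features are concatenated at every step. The module below is an illustrative assumption (same-resolution output, plain convolutions), not the RBSR architecture.

```python
import torch
import torch.nn as nn

class RecurrentBurstFusion(nn.Module):
    """Sketch of frame-by-frame burst fusion: a hidden state is updated
    recurrently, with the base frame's features concatenated at every
    step as a guiding prompt. Illustrative only; not the RBSR model."""

    def __init__(self, channels: int = 32):
        super().__init__()
        self.encode = nn.Conv2d(3, channels, 3, padding=1)
        self.update = nn.Sequential(
            nn.Conv2d(3 * channels, channels, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
        )
        self.decode = nn.Conv2d(channels, 3, 3, padding=1)

    def forward(self, burst: torch.Tensor) -> torch.Tensor:
        # burst: (batch, num_frames, 3, H, W); frame 0 is the base frame.
        base = self.encode(burst[:, 0])
        state = torch.zeros_like(base)
        for t in range(burst.shape[1]):
            feat = self.encode(burst[:, t])
            state = self.update(torch.cat([state, feat, base], dim=1))
        return self.decode(state)  # fused (here same-resolution) output

burst = torch.rand(1, 8, 3, 64, 64)
print(RecurrentBurstFusion()(burst).shape)  # torch.Size([1, 3, 64, 64])
```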
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ IML-ViT: Benchmarking Image Manipulation Localization by Vision + Transformer + + +
+ Advanced image tampering techniques are increasingly challenging the
+trustworthiness of multimedia, leading to the development of Image Manipulation
+Localization (IML). But what makes a good IML model? The answer lies in how
+artifacts are captured. Exploiting artifacts requires the model to extract
+non-semantic discrepancies between manipulated and authentic regions,
+necessitating explicit comparisons between the two areas. With the
+self-attention mechanism, naturally, the Transformer should be a better
+candidate to capture artifacts. However, due to limited datasets, there is
+currently no pure ViT-based approach for IML to serve as a benchmark, and CNNs
+dominate the entire task. Nevertheless, CNNs suffer from weak long-range and
+non-semantic modeling. To bridge this gap, based on the fact that artifacts are
+sensitive to image resolution, amplified under multi-scale features, and
+massive at the manipulation border, we formulate the answer to the former
+question as building a ViT with high-resolution capacity, multi-scale feature
+extraction capability, and manipulation edge supervision that could converge
+with a small amount of data. We term this simple but effective ViT paradigm
+IML-ViT, which has significant potential to become a new benchmark for IML.
+Extensive experiments on five benchmark datasets verified that our model
+outperforms state-of-the-art manipulation localization methods. Code and models
+are available at \url{https://github.com/SunnyHaze/IML-ViT}.
+
+
+
+
+
+
+ + ♻ ☆ Transformer-based interpretable multi-modal data fusion for skin lesion + classification + + +
+ A lot of deep learning (DL) research these days is mainly focused on +improving quantitative metrics regardless of other factors. In human-centered +applications, like skin lesion classification in dermatology, DL-driven +clinical decision support systems are still in their infancy due to the limited +transparency of their decision-making process. Moreover, the lack of procedures +that can explain the behavior of trained DL algorithms leads to almost no trust +from clinical physicians. To diagnose skin lesions, dermatologists rely on +visual assessment of the disease and the data gathered from the patient's +anamnesis. Data-driven algorithms dealing with multi-modal data are limited by +the separation of feature-level and decision-level fusion procedures required +by convolutional architectures. To address this issue, we enable single-stage +multi-modal data fusion via the attention mechanism of transformer-based +architectures to aid in diagnosing skin diseases. Our method beats other +state-of-the-art single- and multi-modal DL architectures in image-rich and +patient-data-rich environments. Additionally, the choice of the architecture +enables native interpretability support for the classification task both in the +image and metadata domain with no additional modifications necessary. + +
+
+ comment: Submitted to IEEE JBHI in July 2023 +
+
+
+
+
+ + ♻ ☆ USAGE: A Unified Seed Area Generation Paradigm for Weakly Supervised + Semantic Segmentation ICCV 2023 + + +
+ Seed area generation is usually the starting point of weakly supervised +semantic segmentation (WSSS). Computing the Class Activation Map (CAM) from a +multi-label classification network is the de facto paradigm for seed area +generation, but CAMs generated from Convolutional Neural Networks (CNNs) and +Transformers are prone to be under- and over-activated, respectively, which +makes the strategies to refine CAMs for CNNs usually inappropriate for +Transformers, and vice versa. In this paper, we propose a Unified optimization +paradigm for Seed Area GEneration (USAGE) for both types of networks, in which +the objective function to be optimized consists of two terms: One is a +generation loss, which controls the shape of seed areas by a temperature +parameter following a deterministic principle for different types of networks; +The other is a regularization loss, which ensures the consistency between the +seed areas that are generated by self-adaptive network adjustment from +different views, to overturn false activation in seed areas. Experimental +results show that USAGE consistently improves seed area generation for both +CNNs and Transformers by large margins, e.g., outperforming state-of-the-art +methods by a mIoU of 4.1% on PASCAL VOC. Moreover, based on the USAGE-generated +seed areas on Transformers, we achieve state-of-the-art WSSS results on both +PASCAL VOC and MS COCO. + +
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ♻ ☆ Leveraging Image-based Generative Adversarial Networks for Time Series + Generation + + +
+ Generative models for images have gained significant attention in computer +vision and natural language processing due to their ability to generate +realistic samples from complex data distributions. To leverage the advances of +image-based generative models for the time series domain, we propose a +two-dimensional image representation for time series, the Extended +Intertemporal Return Plot (XIRP). Our approach captures the intertemporal time +series dynamics in a scale-invariant and invertible way, reducing training time +and improving sample quality. We benchmark synthetic XIRPs obtained by an +off-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image +representations and models regarding similarity and predictive ability metrics. +Our novel, validated image representation for time series consistently and +significantly outperforms a state-of-the-art RNN-based generative model +regarding predictive ability. Further, we introduce an improved stochastic +inversion to substantially improve simulation quality regardless of the +representation and provide the prospect of transfer potentials in other +domains. + +
+
+
+
+
+ + ♻ ☆ Towards Realistic Out-of-Distribution Detection: A Novel Evaluation + Framework for Improving Generalization in OOD Detection + + +
+ This paper presents a novel evaluation framework for Out-of-Distribution +(OOD) detection that aims to assess the performance of machine learning models +in more realistic settings. We observed that the real-world requirements for +testing OOD detection methods are not satisfied by the current testing +protocols. They usually encourage methods to have a strong bias towards a low +level of diversity in normal data. To address this limitation, we propose new +OOD test datasets (CIFAR-10-R, CIFAR-100-R, and ImageNet-30-R) that can allow +researchers to benchmark OOD detection performance under realistic distribution +shifts. Additionally, we introduce a Generalizability Score (GS) to measure the +generalization ability of a model during OOD detection. Our experiments +demonstrate that improving the performance on existing benchmark datasets does +not necessarily improve the usability of OOD detection models in real-world +scenarios. While leveraging deep pre-trained features has been identified as a +promising avenue for OOD detection research, our experiments show that +state-of-the-art pre-trained models tested on our proposed datasets suffer a +significant drop in performance. To address this issue, we propose a +post-processing stage for adapting pre-trained features under these +distribution shifts before calculating the OOD scores, which significantly +enhances the performance of state-of-the-art pre-trained models on our +benchmarks. + +
+
+
+
+
+ + ♻ ☆ LAC: Latent Action Composition for Skeleton-based Action Segmentation ICCV 2023 + + +
+ Skeleton-based action segmentation requires recognizing composable actions in +untrimmed videos. Current approaches decouple this problem by first extracting +local visual features from skeleton sequences and then processing them by a +temporal model to classify frame-wise actions. However, their performances +remain limited as the visual features cannot sufficiently express composable +actions. In this context, we propose Latent Action Composition (LAC), a novel +self-supervised framework aiming at learning from synthesized composable +motions for skeleton-based action segmentation. LAC is composed of a novel +generation module towards synthesizing new sequences. Specifically, we design a +linear latent space in the generator to represent primitive motion. New +composed motions can be synthesized by simply performing arithmetic operations +on latent representations of multiple input skeleton sequences. LAC leverages +such synthesized sequences, which have large diversity and complexity, for +learning visual representations of skeletons in both sequence and frame spaces +via contrastive learning. The resulting visual encoder has a high expressive +power and can be effectively transferred onto action segmentation tasks by +end-to-end fine-tuning without the need for additional temporal models. We +conduct a study focusing on transfer-learning and we show that representations +learned from pre-trained LAC outperform the state-of-the-art by a large margin +on TSU, Charades, PKU-MMD datasets. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Unsupervised Hashing with Similarity Distribution Calibration BMVC 2023 + + +
+ Unsupervised hashing methods typically aim to preserve the similarity between +data points in a feature space by mapping them to binary hash codes. However, +these methods often overlook the fact that the similarity between data points +in the continuous feature space may not be preserved in the discrete hash code +space, due to the limited similarity range of hash codes. The similarity range +is bounded by the code length and can lead to a problem known as similarity +collapse. That is, the positive and negative pairs of data points become less +distinguishable from each other in the hash space. To alleviate this problem, +in this paper a novel Similarity Distribution Calibration (SDC) method is +introduced. SDC aligns the hash code similarity distribution towards a +calibration distribution (e.g., beta distribution) with sufficient spread +across the entire similarity range, thus alleviating the similarity collapse +problem. Extensive experiments show that our SDC outperforms significantly the +state-of-the-art alternatives on coarse category-level and instance-level image +retrieval. Code is available at https://github.com/kamwoh/sdc. + +
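The calibration idea can be sketched by matching the sorted pairwise code similarities to the quantiles of a spread-out target distribution. The loss below uses a Beta(2, 2) target and a quantile-matching MSE as assumptions for illustration; it is not the paper's exact objective.

```python
import numpy as np
import torch
from scipy.stats import beta

def sdc_loss(codes: torch.Tensor, a: float = 2.0, b: float = 2.0) -> torch.Tensor:
    """Sketch of similarity distribution calibration: push the empirical
    distribution of pairwise hash-code similarities towards a spread-out
    calibration distribution (here Beta(a, b)) by matching sorted
    similarities to its quantiles. Illustrative only."""
    codes = torch.tanh(codes)                        # relaxed binary codes
    codes = torch.nn.functional.normalize(codes, dim=1)
    sim = codes @ codes.t()                          # cosine similarity
    iu = torch.triu_indices(sim.shape[0], sim.shape[1], offset=1)
    sims = (sim[iu[0], iu[1]] + 1) / 2               # map to [0, 1]
    sims_sorted, _ = torch.sort(sims)
    # Target quantiles of the calibration distribution.
    q = (np.arange(1, len(sims_sorted) + 1) - 0.5) / len(sims_sorted)
    target = torch.as_tensor(beta.ppf(q, a, b), dtype=sims_sorted.dtype)
    return ((sims_sorted - target) ** 2).mean()

codes = torch.randn(128, 64, requires_grad=True)     # 64-bit codes (relaxed)
loss = sdc_loss(codes)
loss.backward()
print(float(loss))
```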
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Enhancing the accuracies by performing pooling decisions adjacent to the + output layer + + +
+ Learning classification tasks of $(2^n \times 2^n)$ inputs typically involves
+$\le n$ $(2 \times 2)$ max-pooling (MP) operators along the entire feedforward
+deep architecture. Here we show, using the CIFAR-10 database, that pooling
+decisions adjacent to the last convolutional layer significantly enhance
+accuracies. In particular, average accuracies of the advanced-VGG with m layers
+(A-VGGm) architectures are 0.936, 0.940, 0.954, 0.955, and 0.955 for m=6, 8,
+14, 13, and 16, respectively. The results indicate that A-VGG8's accuracy is
+superior to VGG16's, and that the accuracies of A-VGG13 and A-VGG16 are equal,
+and comparable to that of Wide-ResNet16. In addition, replacing the three fully
+connected (FC) layers with one FC layer (A-VGG6 and A-VGG14), or with several
+linear-activation FC layers, yielded similar accuracies. These significantly
+enhanced accuracies stem from training the most influential input-output
+routes, in comparison to the inferior routes selected following multiple MP
+decisions along the deep architecture. In addition, accuracies are sensitive to
+the order of the non-commutative MP and average pooling operators adjacent to
+the output layer, varying the number and location of training routes. The
+results call for the reexamination of previously proposed deep architectures
+and their accuracies by utilizing the proposed pooling strategy adjacent to the
+output layer.
+
+
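The architectural idea, keeping full resolution through the convolutional stack and pooling only next to the output layer, can be sketched as follows; the layer widths, the single 8x8 pooling step, and the single FC layer are illustrative assumptions rather than the exact A-VGG6 configuration.

```python
import torch
import torch.nn as nn

def a_vgg_sketch(num_classes: int = 10) -> nn.Sequential:
    """Sketch of deferring pooling: the convolutional layers keep the full
    32x32 resolution and a single aggressive pooling step is applied
    adjacent to the output layer. Sizes are illustrative, not the exact
    A-VGG6 configuration."""
    def block(cin, cout):
        return [nn.Conv2d(cin, cout, 3, padding=1),
                nn.BatchNorm2d(cout), nn.ReLU(inplace=True)]
    return nn.Sequential(
        *block(3, 64), *block(64, 128), *block(128, 256),
        *block(256, 256), *block(256, 512),
        # Pooling decision made only here, next to the output layer.
        nn.MaxPool2d(kernel_size=8),          # 32x32 -> 4x4
        nn.Flatten(),
        nn.Linear(512 * 4 * 4, num_classes),  # single FC layer
    )

x = torch.randn(2, 3, 32, 32)                 # CIFAR-10-sized input
print(a_vgg_sketch()(x).shape)                # torch.Size([2, 10])
```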
+
+ comment: 29 pages, 3 figures, 1 table, and Supplementary Information +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision
+transformer (CNN-Transformer) for medical image segmentation. The proposed
+Hybrid Decoder, based on the MaxViT block, is designed to harness the power of
+both the convolution and self-attention mechanisms at each decoding stage with
+a nominal memory and computational burden. The inclusion of multi-axis
+self-attention, within each decoder stage, significantly enhances the
+discriminating capacity between the object and background regions, thereby
+helping in improving the segmentation efficiency. In the Hybrid Decoder block,
+the fusion process commences by integrating the upsampled lower-level decoder
+features, obtained through transpose convolution, with the skip-connection
+features derived from the hybrid encoder. Subsequently, the fused features
+undergo refinement through the utilization of a multi-axis attention mechanism.
+The proposed decoder block is repeated multiple times to progressively segment
+the nuclei regions. Experimental results on the MoNuSeg18 and MoNuSAC20
+datasets demonstrate the effectiveness of the proposed technique. Our
+MaxViT-UNet outperformed the previous CNN-based (UNet) and Transformer-based
+(Swin-UNet) techniques by a considerable margin on both of the standard
+datasets. The following GitHub repository
+(https://github.com/PRLAB21/MaxViT-UNet) contains the implementation and
+trained weights.
+
+
+
+ comment: 17 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ ICF-SRSR: Invertible scale-Conditional Function for Self-Supervised + Real-world Single Image Super-Resolution + + +
+ Single image super-resolution (SISR) is a challenging ill-posed problem that +aims to up-sample a given low-resolution (LR) image to a high-resolution (HR) +counterpart. Due to the difficulty in obtaining real LR-HR training pairs, +recent approaches are trained on simulated LR images degraded by simplified +down-sampling operators, e.g., bicubic. Such an approach can be problematic in +practice because of the large gap between the synthesized and real-world LR +images. To alleviate the issue, we propose a novel Invertible scale-Conditional +Function (ICF), which can scale an input image and then restore the original +input with different scale conditions. By leveraging the proposed ICF, we +construct a novel self-supervised SISR framework (ICF-SRSR) to handle the +real-world SR task without using any paired/unpaired training data. +Furthermore, our ICF-SRSR can generate realistic and feasible LR-HR pairs, +which can make existing supervised SISR networks more robust. Extensive +experiments demonstrate the effectiveness of the proposed method in handling +SISR in a fully self-supervised manner. Our ICF-SRSR demonstrates superior +performance compared to the existing methods trained on synthetic paired images +in real-world scenarios and exhibits comparable performance compared to +state-of-the-art supervised/unsupervised methods on public benchmark datasets. + +
+
+
+
+
+ + ♻ ☆ RemovalNet: DNN Fingerprint Removal Attacks + + +
+ With the performance of deep neural networks (DNNs) remarkably improving,
+DNNs have been widely used in many areas. Consequently, the DNN model has
+become a valuable asset, and its intellectual property is safeguarded by
+ownership verification techniques (e.g., DNN fingerprinting). However, the
+feasibility of the DNN fingerprint removal attack and its potential influence
+remain open problems. In this paper, we perform the first comprehensive
+investigation of DNN fingerprint removal attacks. Generally, the knowledge
+contained in a DNN model can be categorized into general semantic and
+fingerprint-specific knowledge. Building on this, we propose a min-max bilevel
+optimization-based DNN fingerprint removal attack named RemovalNet, to evade
+model ownership verification. The lower-level optimization is designed to
+remove fingerprint-specific knowledge, while in the upper-level optimization,
+we distill the victim model's general semantic knowledge to maintain the
+surrogate model's performance. We conduct extensive experiments to evaluate the
+fidelity, effectiveness, and efficiency of RemovalNet against four advanced
+defense methods on six metrics. The empirical results demonstrate that (1)
+RemovalNet is effective: after our DNN fingerprint removal attack, the model
+distance between the target and surrogate models is about 100x higher than that
+of the baseline attacks; (2) RemovalNet is efficient: it uses only 0.2%
+(400 samples) of the substitute dataset and 1,000 iterations to conduct the
+attack, and compared with advanced model stealing attacks, it saves up to
+nearly 85% of computational resources; and (3) RemovalNet achieves high
+fidelity: the created surrogate model maintains high accuracy after the DNN
+fingerprint removal process. Our code is available at:
+https://github.com/grasses/RemovalNet.
+
+
+ comment: some mistake +
+
+
+
+
+ + ♻ ☆ FusionBooster: A Unified Image Fusion Boosting Paradigm + + +
+ In recent years, numerous ideas have emerged for designing a mutually +reinforcing mechanism or extra stages for the image fusion task, ignoring the +inevitable gaps between different vision tasks and the computational burden. We +argue that there is a scope to improve the fusion performance with the help of +the FusionBooster, a model specifically designed for the fusion task. In +particular, our booster is based on the divide-and-conquer strategy controlled +by an information probe. The booster is composed of three building blocks: the +probe units, the booster layer, and the assembling module. Given the result +produced by a backbone method, the probe units assess the fused image and +divide the results according to their information content. This is instrumental +in identifying missing information, as a step to its recovery. The recovery of +the degraded components along with the fusion guidance are the role of the +booster layer. Lastly, the assembling module is responsible for piecing these +advanced components together to deliver the output. We use concise +reconstruction loss functions in conjunction with lightweight autoencoder +models to formulate the learning task, with marginal computational complexity +increase. The experimental results obtained in various fusion tasks, as well as +downstream detection tasks, consistently demonstrate that the proposed +FusionBooster significantly improves the performance. Our code will be publicly +available on the project homepage. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Collage Diffusion + + +
+ We seek to give users precise control over diffusion-based image generation +by modeling complex scenes as sequences of layers, which define the desired +spatial arrangement and visual attributes of objects in the scene. Collage +Diffusion harmonizes the input layers to make objects fit together -- the key +challenge involves minimizing changes in the positions and key visual +attributes of the input layers while allowing other attributes to change in the +harmonization process. We ensure that objects are generated in the correct +locations by modifying text-image cross-attention with the layers' alpha masks. +We preserve key visual attributes of input layers by learning specialized text +representations per layer and by extending ControlNet to operate on layers. +Layer input allows users to control the extent of image harmonization on a +per-object basis, and users can even iteratively edit individual objects in +generated images while keeping other objects fixed. By leveraging the rich +information present in layer input, Collage Diffusion generates globally +harmonized images that maintain desired object characteristics better than +prior approaches. + +
+
+
+
+
+ + ♻ ☆ StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent + Disentangled Space + + +
+ One major challenge in machine learning applications is coping with +mismatches between the datasets used in the development and those obtained in +real-world applications. These mismatches may lead to inaccurate predictions +and errors, resulting in poor product quality and unreliable systems. In this +study, we propose StyleDiff to inform developers of the differences between the +two datasets for the steady development of machine learning systems. Using +disentangled image spaces obtained from recently proposed generative models, +StyleDiff compares the two datasets by focusing on attributes in the images and +provides an easy-to-understand analysis of the differences between the +datasets. The proposed StyleDiff performs in $O (d N\log N)$, where $N$ is the +size of the datasets and $d$ is the number of attributes, enabling the +application to large datasets. We demonstrate that StyleDiff accurately detects +differences between datasets and presents them in an understandable format +using, for example, driving scenes datasets. + +
+
+ comment: 25 pages, 17 figures, Image and Vision Computing +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning ICCV 2023 + + +
+ Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful
+alternative to full fine-tuning for adapting pre-trained vision models to
+downstream tasks, tuning only a small number of parameters while freezing the
+vast majority of them to ease the storage burden and optimization difficulty.
+However, existing PEFT methods introduce trainable parameters at the same
+positions across different tasks, relying solely on human heuristics and
+neglecting the domain gaps. To this end, we study where to introduce and how to
+allocate trainable parameters by proposing a novel Sensitivity-aware visual
+Parameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates
+trainable parameters to task-specific important positions given a desired
+tunable parameter budget. Specifically, our SPT first quickly identifies the
+sensitive parameters that require tuning for a given task in a data-dependent
+way. Next, our SPT further boosts the representational capability of the
+weight matrices whose number of sensitive parameters exceeds a pre-defined
+threshold by utilizing existing structured tuning methods, e.g., LoRA [23] or
+Adapter [22], to replace directly tuning the selected sensitive parameters
+(unstructured tuning) under the budget. Extensive experiments on a wide range
+of downstream recognition tasks show that our SPT is complementary to the
+existing PEFT methods and largely boosts their performance, e.g., SPT improves
+Adapter with a supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean
+Top-1 accuracy, reaching SOTA performance on the FGVC and VTAB-1k benchmarks,
+respectively. Source code is at https://github.com/ziplab/SPT
+
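+ A rough sketch of the data-dependent sensitivity step described above,
+assuming sensitivity is scored by the first-order saliency |gradient x weight|
+(the paper's exact criterion may differ); parameters passing the budgeted
+cut-off would then be tuned directly, or via structured modules such as LoRA
+when a whole weight matrix exceeds the threshold.
+
+```python
+import torch
+
+def sensitivity_scores(model, loss_fn, batch):
+    """Score each parameter by |grad * weight|, a common first-order saliency proxy."""
+    model.zero_grad()
+    inputs, targets = batch
+    loss_fn(model(inputs), targets).backward()
+    return {name: (p.grad * p).abs()
+            for name, p in model.named_parameters() if p.grad is not None}
+
+def select_sensitive(scores, budget):
+    """Keep the `budget` highest-scoring parameters, returned as boolean masks."""
+    flat = torch.cat([s.flatten() for s in scores.values()])
+    threshold = torch.topk(flat, budget).values.min()
+    return {name: s >= threshold for name, s in scores.items()}
+
+# Usage sketch with a hypothetical model and data loader:
+# scores = sensitivity_scores(model, torch.nn.functional.cross_entropy, next(iter(loader)))
+# masks = select_sensitive(scores, budget=200_000)
+```
+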
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ Improving Underwater Visual Tracking With a Large Scale Dataset and + Image Enhancement + + +
+ This paper presents a new dataset and general tracker enhancement method for +Underwater Visual Object Tracking (UVOT). Despite its significance, underwater +tracking has remained unexplored due to data inaccessibility. It poses distinct +challenges; the underwater environment exhibits non-uniform lighting +conditions, low visibility, lack of sharpness, low contrast, camouflage, and +reflections from suspended particles. Performance of traditional tracking +methods designed primarily for terrestrial or open-air scenarios drops in such +conditions. We address the problem by proposing a novel underwater image +enhancement algorithm designed specifically to boost tracking quality. The +method has resulted in a significant performance improvement, of up to 5.0% +AUC, of state-of-the-art (SOTA) visual trackers. To develop robust and accurate +UVOT methods, large-scale datasets are required. To this end, we introduce a +large-scale UVOT benchmark dataset consisting of 400 video segments and 275,000 +manually annotated frames enabling underwater training and evaluation of deep +trackers. The videos are labelled with several underwater-specific tracking +attributes including watercolor variation, target distractors, camouflage, +target relative size, and low visibility conditions. The UVOT400 dataset, +tracking results, and the code are publicly available on: +https://github.com/BasitAlawode/UWVOT400. + +
+
+
+
+
+ + ♻ ☆ PV-SSD: A Projection and Voxel-based Double Branch Single-Stage 3D + Object Detector + + +
+ LiDAR-based 3D object detection and classification is crucial for autonomous
+driving. However, inference in real-time from extremely sparse 3D data poses a
+formidable challenge. To address this issue, a common approach is to project
+point clouds onto a bird's-eye or perspective view, effectively converting them
+into an image-like data format. However, this excessive compression of point
+cloud data often leads to the loss of information. This paper proposes a 3D
+object detector based on voxel and projection double branch feature extraction
+(PV-SSD) to address the problem of information loss. We add a voxel feature
+input containing rich local semantic information, which is fully fused with the
+projected features in the feature extraction stage to reduce the local
+information loss caused by projection. Good performance is achieved compared
+to previous work. In addition, this paper makes the following contributions:
+1) a voxel feature extraction method with variable receptive fields is
+proposed; 2) a weight-based feature point sampling method is used to filter out
+the feature points that are more conducive to the detection task; 3) the MSSFA
+module is proposed based on the SSFA module. To verify the effectiveness of our
+method, we designed comparison experiments.
+
+
+
+
+
+ + ♻ ☆ SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT + + +
+ The Segment Anything Model (SAM) has gained significant attention in the +field of image segmentation due to its impressive capabilities and prompt-based +interface. While SAM has already been extensively evaluated in various domains, +its adaptation to retinal OCT scans remains unexplored. To bridge this research +gap, we conduct a comprehensive evaluation of SAM and its adaptations on a +large-scale public dataset of OCTs from RETOUCH challenge. Our evaluation +covers diverse retinal diseases, fluid compartments, and device vendors, +comparing SAM against state-of-the-art retinal fluid segmentation methods. +Through our analysis, we showcase adapted SAM's efficacy as a powerful +segmentation model in retinal OCT scans, although still lagging behind +established methods in some circumstances. The findings highlight SAM's +adaptability and robustness, showcasing its utility as a valuable tool in +retinal OCT image analysis and paving the way for further advancements in this +domain. + +
+
+
+
+
+ + ♻ ☆ ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under + Challenging Conditions + + +
+ Robust 3D object detection in extreme weather and illumination conditions is +a challenging task. While radars and thermal cameras are known for their +resilience to these conditions, few studies have been conducted on +radar-thermal fusion due to the lack of corresponding datasets. To address this +gap, we first present a new multi-modal dataset called ThermRad, which includes +a 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is +unique because it includes data from all four sensors in extreme weather +conditions, providing a valuable resource for future research in this area. To +validate the robustness of 4D radars and thermal cameras for 3D object +detection in challenging weather conditions, we propose a new multi-modal +fusion method called RTDF-RCNN, which leverages the complementary strengths of +4D radars and thermal cameras to boost object detection performance. To further +prove the effectiveness of our proposed framework, we re-implement +state-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for +evaluation. Our method achieves significant enhancements in detecting cars, +pedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%, +respectively, while achieving comparable results to LiDAR-based approaches. Our +contributions in both the ThermRad dataset and the new multi-modal fusion +method provide a new approach to robust 3D object detection in adverse weather +and illumination conditions. The ThermRad dataset will be released. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Learning Deep Intensity Field for Extremely Sparse-View CBCT + Reconstruction MICCAI'23 + + +
+ Sparse-view cone-beam CT (CBCT) reconstruction is an important direction to +reduce radiation dose and benefit clinical applications. Previous voxel-based +generation methods represent the CT as discrete voxels, resulting in high +memory requirements and limited spatial resolution due to the use of 3D +decoders. In this paper, we formulate the CT volume as a continuous intensity +field and develop a novel DIF-Net to perform high-quality CBCT reconstruction +from extremely sparse (fewer than 10) projection views at an ultrafast speed. +The intensity field of a CT can be regarded as a continuous function of 3D +spatial points. Therefore, the reconstruction can be reformulated as regressing +the intensity value of an arbitrary 3D point from given sparse projections. +Specifically, for a point, DIF-Net extracts its view-specific features from +different 2D projection views. These features are subsequently aggregated by a +fusion module for intensity estimation. Notably, thousands of points can be +processed in parallel to improve efficiency during training and testing. In +practice, we collect a knee CBCT dataset to train and evaluate DIF-Net. +Extensive experiments show that our approach can reconstruct CBCT with high +image quality and high spatial resolution from extremely sparse views within +1.6 seconds, significantly outperforming state-of-the-art methods. Our code +will be available at https://github.com/xmed-lab/DIF-Net. + +
+
+ comment: MICCAI'23 +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,
+organs, vessels) and 3D surgical instrument models. Prior to the deep learning
+era, the broad application of statistical shape models (SSMs) in medical image
+analysis was evidence that shapes were commonly used to describe medical data.
+Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in medical
+imaging are predominantly voxel-based. In computer vision, on the contrary,
+shapes (including voxel occupancy grids, meshes, point clouds and implicit
+surface models) are the preferred data representations in 3D, as seen from the
+numerous shape-related publications in premier vision conferences, such as the
+IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as well
+as the increasing popularity of ShapeNet (about 51,300 models) and Princeton
+ModelNet (127,915 models) in computer vision research. MedShapeNet is created
+as an alternative to these commonly used shape benchmarks to facilitate the
+translation of data-driven vision algorithms to medical applications, and it
+extends the opportunities to adapt SOTA vision algorithms to solve critical
+medical problems. Besides, the majority of the medical shapes in MedShapeNet
+are modeled directly on the imaging data of real patients, and therefore it
+complements well the existing shape benchmarks comprising computer-aided design
+(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,
+and provides annotations in the form of paired data. It is therefore also a
+freely available repository of 3D models for extended reality (virtual reality
+- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This
+white paper describes in detail the motivations behind MedShapeNet, the shape
+acquisition procedures, the use cases, as well as the usage of the online shape
+search portal: https://medshapenet.ikim.nrw/
+
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ MacFormer: Map-Agent Coupled Transformer for Real-time and Robust + Trajectory Prediction + + +
+ Predicting the future behavior of agents is a fundamental task in autonomous
+vehicle domains. Accurate prediction relies on comprehending the surrounding
+map, which significantly regularizes agent behaviors. However, existing methods
+have limitations in exploiting the map and exhibit a strong dependence on
+historical trajectories, which yields unsatisfactory prediction performance and
+robustness. Additionally, their heavy network architectures impede real-time
+applications. To tackle these problems, we propose the Map-Agent Coupled
+Transformer (MacFormer) for real-time and robust trajectory prediction. Our
+framework explicitly incorporates map constraints into the network via two
+carefully designed modules named coupled map and reference extractor. A novel
+multi-task optimization strategy (MTOS) is presented to enhance learning of
+topology and rule constraints. We also devise a bilateral query scheme in
+context fusion for a more efficient and lightweight network. We evaluated our
+approach on the Argoverse 1, Argoverse 2, and nuScenes real-world benchmarks,
+where it achieved state-of-the-art performance on all of them with the lowest
+inference latency and smallest model size. Experiments also demonstrate that
+our framework is resilient to imperfect tracklet inputs. Furthermore, we show
+that by combining with our proposed strategies, classical models outperform
+their baselines, further validating the versatility of our framework.
+
+
+ comment: Accepted by IEEE Robotics and Automation Letters. 8 Pages, 9 Figures, + 9 Tables. Video: https://www.youtube.com/watch?v=XY388iI6sPQ +
+
+
+
+
+ + ♻ ☆ Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation + + +
+ Medical image data are often limited due to the expensive acquisition and
+annotation process. Hence, training a deep-learning model with only raw data
+can easily lead to overfitting. One solution to this problem is to augment the
+raw data with various transformations, improving the model's ability to
+generalize to new data. However, manually configuring a generic augmentation
+combination and parameters for different datasets is non-trivial due to
+inconsistent acquisition approaches and data distributions. Therefore,
+automatic data augmentation has been proposed to learn favorable augmentation
+strategies for different datasets, but it incurs large GPU overhead. To this
+end, we present a novel method, called Dynamic Data Augmentation (DDAug), which
+is efficient and has negligible computation cost. Our DDAug develops a
+hierarchical tree structure to represent various augmentations and utilizes an
+efficient Monte-Carlo tree searching algorithm to update, prune, and sample the
+tree. As a result, the augmentation pipeline can be optimized for each dataset
+automatically. Experiments on multiple Prostate MRI datasets show that our
+method outperforms the current state-of-the-art data augmentation strategies.
+
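+ A simplified stand-in for the search component: instead of the paper's full
+Monte-Carlo tree search over a hierarchical augmentation tree, this sketch runs
+a flat UCB1 bandit over a few hypothetical augmentation pipelines, with a
+random number standing in for the validation score that would reward each
+choice.
+
+```python
+import math, random
+
+class UCBAugSearch:
+    """Flat UCB1 bandit over candidate augmentation pipelines (toy stand-in
+    for the tree search; pipeline names are hypothetical)."""
+    def __init__(self, pipelines, c=1.4):
+        self.pipelines = pipelines
+        self.counts = [0] * len(pipelines)
+        self.values = [0.0] * len(pipelines)
+        self.c = c
+
+    def select(self):
+        total = sum(self.counts) + 1
+        def score(i):
+            if self.counts[i] == 0:
+                return float("inf")
+            mean = self.values[i] / self.counts[i]
+            return mean + self.c * math.sqrt(math.log(total) / self.counts[i])
+        return max(range(len(self.pipelines)), key=score)
+
+    def update(self, idx, reward):
+        self.counts[idx] += 1
+        self.values[idx] += reward
+
+search = UCBAugSearch([["flip"], ["flip", "rotate"], ["elastic"], ["rotate", "gamma"]])
+for _ in range(20):
+    arm = search.select()
+    val_dice = random.random()   # stand-in for validation Dice after a short training run
+    search.update(arm, val_dice)
+print(search.pipelines[search.select()])
+```
+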
+
+
+
+
+ + ♻ ☆ LRANet: Towards Accurate and Efficient Scene Text Detection with + Low-Rank Approximation Network + + +
+ Recently, regression-based methods, which predict parameterized text shapes +for text localization, have gained popularity in scene text detection. However, +the existing parameterized text shape methods still have limitations in +modeling arbitrary-shaped texts due to ignoring the utilization of +text-specific shape information. Moreover, the time consumption of the entire +pipeline has been largely overlooked, leading to a suboptimal overall inference +speed. To address these issues, we first propose a novel parameterized text +shape method based on low-rank approximation. Unlike other shape representation +methods that employ data-irrelevant parameterization, our approach utilizes +singular value decomposition and reconstructs the text shape using a few +eigenvectors learned from labeled text contours. By exploring the shape +correlation among different text contours, our method achieves consistency, +compactness, simplicity, and robustness in shape representation. Next, we +propose a dual assignment scheme for speed acceleration. It adopts a sparse +assignment branch to accelerate the inference speed, and meanwhile, provides +ample supervised signals for training through a dense assignment branch. +Building upon these designs, we implement an accurate and efficient +arbitrary-shaped text detector named LRANet. Extensive experiments are +conducted on several challenging benchmarks, demonstrating the superior +accuracy and efficiency of LRANet compared to state-of-the-art methods. Code +will be released soon. + +
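+ A toy sketch of the low-rank shape representation idea, using random vectors
+in place of labeled text contours: contours resampled to a fixed number of
+boundary points are stacked into a matrix, singular value decomposition yields
+a few eigenvectors, and each contour is then represented by (and reconstructed
+from) a handful of coefficients that a detector would regress.
+
+```python
+import numpy as np
+
+# Illustrative only: 200 hypothetical text contours, each resampled to 14
+# boundary points and flattened to a 28-D vector (x1, y1, ..., x14, y14).
+rng = np.random.default_rng(0)
+contours = rng.normal(size=(200, 28))
+
+mean = contours.mean(axis=0)
+U, S, Vt = np.linalg.svd(contours - mean, full_matrices=False)
+
+k = 6                                   # a few eigenvectors, as in the abstract
+basis = Vt[:k]                          # (k, 28) learned shape basis
+
+coeffs = (contours - mean) @ basis.T    # what a network would regress per text
+reconstructed = coeffs @ basis + mean   # low-rank approximation of each contour
+
+err = np.abs(reconstructed - contours).mean()
+print(f"mean reconstruction error with k={k}: {err:.3f}")
+```
+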
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a
+global model without disclosing their data. Previous research often requires
+training the complete set of model parameters. However, the emergence of
+powerful pre-trained models makes it possible to achieve higher performance
+with fewer learnable parameters in FL. In this paper, we propose a federated
+adaptive prompt tuning algorithm, FedAPT, for multi-domain collaborative image
+classification with powerful foundation models, like CLIP. Compared with direct
+federated prompt tuning, our core idea is to adaptively unlock specific domain
+knowledge for each test sample in order to provide them with personalized
+prompts. To implement this idea, we design an adaptive prompt tuning module,
+which consists of a meta prompt, an adaptive network, and some keys. The server
+randomly generates a set of keys and assigns a unique key to each client. Then
+all clients cooperatively train the global adaptive network and meta prompt
+with the local datasets and the frozen keys. Ultimately, the global aggregation
+model can assign a personalized prompt to CLIP based on the domain features of
+each test sample. We perform extensive experiments on two multi-domain image
+classification datasets across two different settings - supervised and
+unsupervised. The results show that FedAPT can achieve better performance with
+less than 10\% of the number of parameters of the fully trained model, and the
+global model can perform well in diverse client domains simultaneously.
+
+
+
+
+
+ + ♻ ☆ Collaborative Chinese Text Recognition with Personalized Federated + Learning + + +
+ In Chinese text recognition, to compensate for the insufficient local data
+and improve the performance of local few-shot character recognition, it is
+often necessary for one organization to collect a large amount of data from
+similar organizations. However, due to the natural presence of private
+information in text data, such as addresses and phone numbers, different
+organizations are unwilling to share private data. Therefore, it becomes
+increasingly important to design a privacy-preserving collaborative training
+framework for the Chinese text recognition task. In this paper, we introduce
+personalized federated learning (pFL) into the Chinese text recognition task
+and propose the pFedCR algorithm, which significantly improves the model
+performance of each client (organization) without sharing private data.
+Specifically, pFedCR comprises two stages: a multi-round global model training
+stage and a local personalization stage. During stage 1, an attention mechanism
+is incorporated into the CRNN model to adapt to various client data
+distributions. Leveraging inherent character data characteristics, a balanced
+dataset is created on the server to mitigate character imbalance. In the
+personalization phase, the global model is fine-tuned for one epoch to create a
+local model. Parameter averaging between the local and global models combines
+personalized and global feature extraction capabilities. Finally, we fine-tune
+only the attention layers to enhance the model's focus on local personalized
+features. The experimental results on three real-world industrial scenario
+datasets show that the pFedCR algorithm can improve the performance of local
+personalized models by about 20\% while also improving their generalization
+performance on other client data domains. Compared to other state-of-the-art
+personalized federated learning methods, pFedCR improves performance by 6\%
+$\sim$ 8\%.
+
+
+
+
+
+ + ♻ ☆ RECLIP: Resource-efficient CLIP by Training with Small Images + + +
+ We present RECLIP (Resource-efficient CLIP), a simple method that minimizes +computational resource footprint for CLIP (Contrastive Language Image +Pretraining). Inspired by the notion of coarse-to-fine in computer vision, we +leverage small images to learn from large-scale language supervision +efficiently, and finetune the model with high-resolution data in the end. Since +the complexity of the vision transformer heavily depends on input image size, +our approach significantly reduces the training resource requirements both in +theory and in practice. Using the same batch size and training epoch, RECLIP +achieves highly competitive zero-shot classification and image-text retrieval +accuracy with 6 to 8x less computational resources and 7 to 9x fewer FLOPs than +the baseline. Compared to the state-of-the-art contrastive learning methods, +RECLIP demonstrates 5 to 59x training resource savings while maintaining highly +competitive zero-shot classification and retrieval performance. Finally, RECLIP +matches the state of the art in transfer learning to open-vocabulary detection +tasks, achieving 32 APr on LVIS. We hope this work will pave the path for the +broader research community to explore language supervised pretraining in +resource-friendly settings. + +
+
+ comment: Published at Transactions on Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Group DETR: Fast DETR Training with Group-Wise One-to-Many Assignment ICCV23 + + +
+ Detection transformer (DETR) relies on one-to-one assignment, assigning one
+ground-truth object to one prediction, for end-to-end detection without NMS
+post-processing. It is known that one-to-many assignment, assigning one
+ground-truth object to multiple predictions, succeeds in detection methods such
+as Faster R-CNN and FCOS. However, the naive one-to-many assignment does not
+work for DETR, and it remains challenging to apply one-to-many assignment to
+DETR training. In this paper, we introduce Group DETR, a simple yet efficient
+DETR training approach that introduces a group-wise way for one-to-many
+assignment. This approach involves using multiple groups of object queries,
+conducting one-to-one assignment within each group, and performing decoder
+self-attention separately. It resembles data augmentation with
+automatically-learned object query augmentation. It is also equivalent to
+simultaneously training parameter-sharing networks of the same architecture,
+introducing more supervision and thus improving DETR training. The inference
+process is the same as for a DETR trained normally and only needs one group of
+queries without any architecture modification. Group DETR is versatile and is
+applicable to various DETR variants. The experiments show that Group DETR
+significantly speeds up the training convergence and improves the performance
+of various DETR-based models. Code will be available at
+\url{https://github.com/Atten4Vis/GroupDETR}.
+
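+ A minimal sketch of the group-wise one-to-one assignment, assuming a
+precomputed matching-cost matrix (in DETR this combines classification and box
+costs; here it is random): queries are split into equal groups and Hungarian
+matching runs independently inside each group, so every ground-truth object
+receives one positive query per group.
+
+```python
+import torch
+from scipy.optimize import linear_sum_assignment
+
+def groupwise_one_to_one(cost, num_groups):
+    """cost: (num_queries, num_gt) matching cost for one image.
+    Queries are split into `num_groups` equal groups; Hungarian matching is
+    run independently in each group, so each ground-truth object ends up
+    matched to `num_groups` queries overall (one per group)."""
+    matches = []
+    group_size = cost.shape[0] // num_groups
+    for g in range(num_groups):
+        block = cost[g * group_size:(g + 1) * group_size]
+        rows, cols = linear_sum_assignment(block.detach().cpu().numpy())
+        matches.append((rows + g * group_size, cols))
+    return matches
+
+cost = torch.rand(300, 7)   # e.g. 3 groups of 100 queries, 7 ground-truth boxes
+for q_idx, gt_idx in groupwise_one_to_one(cost, num_groups=3):
+    print(q_idx[:3], gt_idx[:3])
+```
+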
+
+ comment: ICCV23 camera ready version +
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: IEEE Symposium on Security and Privacy (Oakland) 2024; Extended + version of camera-ready +
+
+
+
+
+ + ♻ ☆ Quaternion-valued Correlation Learning for Few-Shot Semantic + Segmentation + + +
+ Few-shot segmentation (FSS) aims to segment unseen classes given only a few
+annotated samples. Encouraging progress has been made for FSS by leveraging
+semantic features learned from base classes with sufficient training samples to
+represent novel classes. Correlation-based methods, however, lack the ability
+to consider the interaction of the two subspace matching scores due to the
+inherent nature of real-valued 2D convolutions. In this paper, we introduce a
+quaternion perspective on correlation learning and propose a novel
+Quaternion-valued Correlation Learning Network (QCLNet), with the aim to
+alleviate the computational burden of the high-dimensional correlation tensor
+and explore the internal latent interaction between query and support images by
+leveraging operations defined by the established quaternion algebra.
+Specifically, our QCLNet is formulated as a hyper-complex valued network and
+represents correlation tensors in the quaternion domain, using
+quaternion-valued convolution to explore the external relations of the query
+subspace when considering the hidden relationship of the support sub-dimension
+in the quaternion space. Extensive experiments on the PASCAL-5i and COCO-20i
+datasets demonstrate that our method outperforms the existing state-of-the-art
+methods effectively. Our code is available at
+https://github.com/zwzheng98/QCLNet, and our article "Quaternion-valued
+Correlation Learning for Few-Shot Semantic Segmentation" was published in IEEE
+Transactions on Circuits and Systems for Video Technology, vol. 33, no. 5,
+pp. 2102-2115, May 2023, doi: 10.1109/TCSVT.2022.3223150.
+
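+ For readers unfamiliar with the quaternion algebra the network builds on,
+here is a short sketch of the Hamilton product that quaternion-valued
+convolutions use in place of ordinary real-valued multiplication; this is only
+the core algebraic operation, not the full correlation-learning network.
+
+```python
+import torch
+
+def hamilton_product(p, q):
+    """Hamilton product of quaternions stored as (..., 4) tensors (r, i, j, k).
+    Quaternion-valued layers build their weight-input interaction on this
+    product instead of an ordinary real-valued multiply."""
+    r1, i1, j1, k1 = p.unbind(-1)
+    r2, i2, j2, k2 = q.unbind(-1)
+    return torch.stack([
+        r1 * r2 - i1 * i2 - j1 * j2 - k1 * k2,
+        r1 * i2 + i1 * r2 + j1 * k2 - k1 * j2,
+        r1 * j2 - i1 * k2 + j1 * r2 + k1 * i2,
+        r1 * k2 + i1 * j2 - j1 * i2 + k1 * r2,
+    ], dim=-1)
+
+p = torch.tensor([[1.0, 0.0, 0.0, 0.0]])   # identity quaternion
+q = torch.randn(1, 4)
+print(torch.allclose(hamilton_product(p, q), q))   # True
+```
+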
+
+ comment: for associated paper file, see + https://ieeexplore.ieee.org/document/9954424?source=authoralert +
+
+
+
+
+ + ♻ ☆ SAM-PARSER: Fine-tuning SAM Efficiently by Parameter Space + Reconstruction + + +
+ Segment Anything Model (SAM) has received remarkable attention as it offers a
+powerful and versatile solution for object segmentation in images. However,
+fine-tuning SAM for downstream segmentation tasks under different scenarios
+remains a challenge, as the varied characteristics of different scenarios
+naturally require diverse model parameter spaces. Most existing fine-tuning
+methods attempt to bridge the gaps among different scenarios by introducing a
+set of new parameters to modify SAM's original parameter space. Unlike these
+works, in this paper, we propose fine-tuning SAM efficiently by parameter space
+reconstruction (SAM-PARSER), which introduces nearly zero trainable parameters
+during fine-tuning. In SAM-PARSER, we assume that SAM's original parameter
+space is relatively complete, so that its bases are able to reconstruct the
+parameter space of a new scenario. We obtain the bases by matrix decomposition,
+and fine-tune the coefficients to reconstruct the parameter space tailored to
+the new scenario via an optimal linear combination of the bases. Experimental
+results show that SAM-PARSER exhibits superior segmentation performance across
+various scenarios, while reducing the number of trainable parameters by
+$\approx 290$ times compared with current parameter-efficient fine-tuning
+methods.
+
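+ A minimal sketch of the basis-reconstruction idea for a single linear layer,
+assuming the decomposition is an SVD (the paper's exact decomposition and
+choice of layers may differ): the pretrained bases U and V are frozen, and only
+the singular-value coefficients are trained.
+
+```python
+import torch
+import torch.nn as nn
+
+class SVDCoefficientLinear(nn.Module):
+    """Reconstructs a pretrained linear layer's weight from frozen SVD bases,
+    W = U diag(c) V^T, and trains only the coefficient vector c."""
+    def __init__(self, pretrained: nn.Linear):
+        super().__init__()
+        U, S, Vh = torch.linalg.svd(pretrained.weight.detach(), full_matrices=False)
+        self.register_buffer("U", U)              # frozen bases
+        self.register_buffer("Vh", Vh)
+        self.coeff = nn.Parameter(S.clone())      # the only trainable tensor
+        bias = pretrained.bias.detach().clone() if pretrained.bias is not None else None
+        self.register_buffer("bias", bias)
+
+    def forward(self, x):
+        W = self.U @ torch.diag(self.coeff) @ self.Vh
+        return nn.functional.linear(x, W, self.bias)
+
+layer = SVDCoefficientLinear(nn.Linear(64, 32))
+print(sum(p.numel() for p in layer.parameters() if p.requires_grad))   # 32
+```
+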
+
+
+
+
+ + ♻ ☆ Visual correspondence-based explanations improve AI robustness and + human-AI team accuracy NeurIPS 2022 + + +
+ Explaining artificial intelligence (AI) predictions is increasingly important +and even imperative in many high-stakes applications where humans are the +ultimate decision-makers. In this work, we propose two novel architectures of +self-interpretable image classifiers that first explain, and then predict (as +opposed to post-hoc explanations) by harnessing the visual correspondences +between a query image and exemplars. Our models consistently improve (by 1 to 4 +points) on out-of-distribution (OOD) datasets while performing marginally worse +(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest +neighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB, +our correspondence-based explanations are found to be more useful to users than +kNN explanations. Our explanations help users more accurately reject AI's wrong +decisions than all other tested methods. Interestingly, for the first time, we +show that it is possible to achieve complementary human-AI team accuracy (i.e., +that is higher than either AI-alone or human-alone), in ImageNet and CUB image +classification tasks. + +
+
+ comment: NeurIPS 2022 conference paper +
+
+
+
+
+ + ♻ ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ♻ ☆ CircleFormer: Circular Nuclei Detection in Whole Slide Images with + Circle Queries and Attention MICCAI 2023 + + +
+ Both CNN-based and Transformer-based object detection with bounding box +representation have been extensively studied in computer vision and medical +image analysis, but circular object detection in medical images is still +underexplored. Inspired by the recent anchor free CNN-based circular object +detection method (CircleNet) for ball-shape glomeruli detection in renal +pathology, in this paper, we present CircleFormer, a Transformer-based circular +medical object detection with dynamic anchor circles. Specifically, queries +with circle representation in Transformer decoder iteratively refine the +circular object detection results, and a circle cross attention module is +introduced to compute the similarity between circular queries and image +features. A generalized circle IoU (gCIoU) is proposed to serve as a new +regression loss of circular object detection as well. Moreover, our approach is +easy to generalize to the segmentation task by adding a simple segmentation +branch to CircleFormer. We evaluate our method in circular nuclei detection and +segmentation on the public MoNuSeg dataset, and the experimental results show +that our method achieves promising performance compared with the +state-of-the-art approaches. The effectiveness of each component is validated +via ablation studies as well. Our code is released at +https://github.com/zhanghx-iim-ahu/CircleFormer. + +
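+ Since the abstract does not spell out gCIoU, the following sketch shows only
+the plain circle IoU from standard circle-circle intersection geometry, which a
+regression loss of this kind would build on; the generalized variant presumably
+adds an enclosing-circle penalty in the spirit of GIoU.
+
+```python
+import math
+
+def circle_intersection_area(c1, c2):
+    """c = (x, y, r). Area of the lens formed by two overlapping circles."""
+    (x1, y1, r1), (x2, y2, r2) = c1, c2
+    d = math.hypot(x2 - x1, y2 - y1)
+    if d >= r1 + r2:                      # disjoint circles
+        return 0.0
+    if d <= abs(r1 - r2):                 # one circle inside the other
+        return math.pi * min(r1, r2) ** 2
+    a1 = r1 * r1 * math.acos((d * d + r1 * r1 - r2 * r2) / (2 * d * r1))
+    a2 = r2 * r2 * math.acos((d * d + r2 * r2 - r1 * r1) / (2 * d * r2))
+    a3 = 0.5 * math.sqrt((-d + r1 + r2) * (d + r1 - r2) * (d - r1 + r2) * (d + r1 + r2))
+    return a1 + a2 - a3
+
+def circle_iou(c1, c2):
+    inter = circle_intersection_area(c1, c2)
+    union = math.pi * (c1[2] ** 2 + c2[2] ** 2) - inter
+    return inter / union
+
+print(round(circle_iou((0, 0, 1), (0, 0, 1)), 3))   # 1.0
+print(round(circle_iou((0, 0, 1), (1, 0, 1)), 3))   # partially overlapping circles
+```
+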
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ A Region-based Randers Geodesic Approach for Image Segmentation + + +
+ The geodesic model based on the eikonal partial differential equation (PDE)
+has served as a fundamental tool for the applications of image segmentation and
+boundary detection in the past two decades. However, the existing approaches
+commonly only exploit image edge-based features for computing minimal geodesic
+paths, potentially limiting their performance in complicated segmentation
+situations. In this paper, we introduce a new variational image segmentation
+model based on the minimal geodesic path framework and the eikonal PDE, where
+the region-based appearance term that defines the regional homogeneity features
+can be taken into account for estimating the associated minimal geodesic paths.
+This is done by constructing a Randers geodesic metric interpretation of the
+region-based active contour energy functional. As a result, the minimization of
+the active contour energy functional is transformed into finding the solution
+to the Randers eikonal PDE.
+ We also suggest a practical interactive image segmentation strategy, where
+the target boundary can be delineated by the concatenation of several piecewise
+geodesic paths. We invoke the Finsler variant of the fast marching method to
+estimate the geodesic distance map, yielding an efficient implementation of the
+proposed region-based Randers geodesic model for image segmentation.
+Experimental results on both synthetic and real images show that our model
+indeed achieves encouraging segmentation performance.
+
+
+ comment: To Appear in International Journal of Computer Vision +
+
+
+
+
+ + ♻ ☆ MMVP: Motion-Matrix-based Video Prediction ICCV 2023 + + +
+ A central challenge of video prediction is that the system has to reason
+about objects' future motions from image frames while simultaneously
+maintaining the consistency of their appearances across frames. This work
+introduces an end-to-end trainable two-stream video prediction framework,
+Motion-Matrix-based Video Prediction (MMVP), to tackle this challenge. Unlike
+previous methods that usually handle motion prediction and appearance
+maintenance within the same set of modules, MMVP decouples motion and
+appearance information by constructing appearance-agnostic motion matrices. The
+motion matrices represent the temporal similarity of each and every pair of
+feature patches in the input frames, and are the sole input of the motion
+prediction module in MMVP. This design improves video prediction in both
+accuracy and efficiency, and reduces the model size. Results of extensive
+experiments demonstrate that MMVP outperforms state-of-the-art systems on
+public data sets by non-negligible margins (about 1 dB in PSNR on UCF Sports)
+with significantly smaller model sizes (84% of the size or smaller).
+
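+ A minimal sketch of an appearance-agnostic motion matrix, assuming it is the
+pairwise cosine similarity between feature patches of two consecutive frames;
+the paper's exact patching and similarity measure may differ.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def motion_matrix(feat_t, feat_t1):
+    """Motion matrix between two frames' feature maps.
+    feat_*: (B, C, H, W). Returns (B, H*W, H*W), where entry (i, j) is the
+    cosine similarity between patch i of frame t and patch j of frame t+1."""
+    a = F.normalize(feat_t.flatten(2), dim=1)    # (B, C, H*W), unit-norm channels
+    b = F.normalize(feat_t1.flatten(2), dim=1)
+    return torch.einsum("bci,bcj->bij", a, b)
+
+m = motion_matrix(torch.randn(2, 64, 16, 16), torch.randn(2, 64, 16, 16))
+print(m.shape)   # torch.Size([2, 256, 256])
+```
+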
+
+ comment: ICCV 2023 (Oral) +
+
+
+
+
+ + ♻ ☆ Neural Video Compression with Temporal Layer-Adaptive Hierarchical + B-frame Coding + + +
+ Neural video compression (NVC) is a rapidly evolving video coding research +area, with some models achieving superior coding efficiency compared to the +latest video coding standard Versatile Video Coding (VVC). In conventional +video coding standards, the hierarchical B-frame coding, which utilizes a +bidirectional prediction structure for higher compression, had been +well-studied and exploited. In NVC, however, limited research has investigated +the hierarchical B scheme. In this paper, we propose an NVC model exploiting +hierarchical B-frame coding with temporal layer-adaptive optimization. We first +extend an existing unidirectional NVC model to a bidirectional model, which +achieves -21.13% BD-rate gain over the unidirectional baseline model. However, +this model faces challenges when applied to sequences with complex or large +motions, leading to performance degradation. To address this, we introduce +temporal layer-adaptive optimization, incorporating methods such as temporal +layer-adaptive quality scaling (TAQS) and temporal layer-adaptive latent +scaling (TALS). The final model with the proposed methods achieves an +impressive BD-rate gain of -39.86% against the baseline. It also resolves the +challenges in sequences with large or complex motions with up to -49.13% more +BD-rate gains than the simple bidirectional extension. This improvement is +attributed to the allocation of more bits to lower temporal layers, thereby +enhancing overall reconstruction quality with smaller bits. Since our method +has little dependency on a specific NVC model architecture, it can serve as a +general tool for extending unidirectional NVC models to the ones with +hierarchical B-frame coding. + +
+
+
+
+
+ + ♻ ☆ Feature Extractor Stacking for Cross-domain Few-shot Meta-learning + + +
+ Cross-domain few-shot meta-learning (CDFSML) addresses learning problems +where knowledge needs to be transferred from several source domains into an +instance-scarce target domain with an explicitly different distribution. +Recently published CDFSML methods generally construct a universal model that +combines knowledge of multiple source domains into one backbone feature +extractor. This enables efficient inference but necessitates re-computation of +the backbone whenever a new source domain is added. Some of these methods are +also incompatible with heterogeneous source domain backbone architectures. We +propose feature extractor stacking (FES), a new CDFSML method for combining +information from a collection of backbones, which can utilise heterogeneous +pretrained backbones out of the box, and does not maintain a universal model +that needs to be re-computed when its backbone collection is updated. We +present the basic FES algorithm, which is inspired by the classic stacking +approach to meta-learning, and also introduce two variants: convolutional FES +(ConFES) and regularised FES (ReFES). Given a target-domain task, these +algorithms fine-tune each backbone independently, use cross-validation to +extract meta training data from the support set, and learn a simple linear +meta-classifier from this data. We evaluate our FES methods on the well-known +Meta-Dataset benchmark, targeting image classification with convolutional +neural networks, and show that they can achieve state-of-the-art performance. + +
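+ A toy sketch of the stacking step, with random class-probability arrays
+standing in for the outputs of independently fine-tuned backbones: each
+backbone's support-set outputs (which FES obtains via cross-validation) are
+concatenated into meta-features, and a simple linear meta-classifier is fit on
+them and reused on the query set.
+
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+# Toy stand-ins: class probabilities for a 5-way support set of 25 images,
+# produced by 3 fine-tuned backbones (random numbers here; FES would obtain
+# these support-set predictions via cross-validation).
+rng = np.random.default_rng(0)
+n_backbones, n_support, n_classes = 3, 25, 5
+support_probs = rng.dirichlet(np.ones(n_classes), size=(n_backbones, n_support))
+support_labels = np.repeat(np.arange(n_classes), n_support // n_classes)
+
+# Meta-training data: concatenate each backbone's outputs per support image.
+meta_features = np.concatenate(list(support_probs), axis=1)      # (25, 15)
+meta_clf = LogisticRegression(max_iter=1000).fit(meta_features, support_labels)
+
+# At query time, the same concatenation feeds the linear meta-classifier.
+query_probs = rng.dirichlet(np.ones(n_classes), size=(n_backbones, 10))
+query_pred = meta_clf.predict(np.concatenate(list(query_probs), axis=1))
+print(query_pred)
+```
+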
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Foundation Models utilizing extensive image-text
+paired data have demonstrated unprecedented image-text association
+capabilities, achieving remarkable results across various downstream tasks. A
+critical challenge is how to make use of existing large-scale pre-trained VLMs,
+which are trained on common objects, to perform the domain-specific transfer
+for accomplishing domain-related downstream tasks. In this paper, we propose a
+new framework that includes the Domain Foundation Model (DFM), bridging the gap
+between the General Foundation Model (GFM) and domain-specific downstream
+tasks. Moreover, we present an image-text paired dataset in the field of remote
+sensing (RS), RS5M, which has 5 million RS images with English descriptions.
+The dataset is obtained by filtering publicly available image-text paired
+datasets and by captioning label-only RS datasets with a pre-trained VLM. These
+constitute the first large-scale RS image-text paired dataset. Additionally, we
+tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the
+DFM. Experimental results show that our proposed dataset is highly effective
+for various tasks, improving upon the baseline by $8 \% \sim 16 \%$ in
+zero-shot classification tasks, and obtaining good results in both
+Vision-Language Retrieval and Semantic Localization tasks.
+\url{https://github.com/om-ai-lab/RS5M}
+
+
+ comment: RS5M dataset v4 +
+
+
+
+
+ + ♻ ☆ Exploring the Relationship between Samples and Masks for Robust Defect + Localization + + +
+ Defect detection aims to detect and localize regions that fall outside the
+normal distribution. Previous approaches model normality and compare it with
+the input to identify defective regions, potentially limiting their
+generalizability. This paper proposes a one-stage framework that detects
+defective patterns directly without the modeling process. This ability is
+achieved through the joint efforts of three parties: a generative adversarial
+network (GAN), a newly proposed scaled pattern loss, and a dynamic masked
+cycle-consistent auxiliary network. Explicit information that could indicate
+the position of defects is intentionally excluded to avoid learning any direct
+mapping. Experimental results on the texture class of the challenging MVTec AD
+dataset show that the proposed method is 2.9% higher than the SOTA methods in
+F1-Score, while substantially outperforming SOTA methods in generalizability.
+
+
+
+
+
+ + ♻ ☆ A Comprehensive Augmentation Framework for Anomaly Detection + + +
+ Data augmentation methods are commonly integrated into the training of
+anomaly detection models. Previous approaches have primarily focused on
+replicating real-world anomalies or enhancing diversity, without considering
+that the standard of anomaly varies across different classes, potentially
+leading to a biased training distribution. This paper analyzes crucial traits
+of simulated anomalies that contribute to the training of reconstructive
+networks and condenses them into several methods, thus creating a comprehensive
+framework by selectively utilizing appropriate combinations. Furthermore, we
+integrate this framework with a reconstruction-based approach and concurrently
+propose a split training strategy that alleviates the issue of overfitting
+while avoiding introducing interference to the reconstruction process. The
+evaluations conducted on the MVTec anomaly detection dataset demonstrate that
+our method outperforms the previous state-of-the-art approach, particularly in
+terms of object classes. To evaluate generalizability, we generate a simulated
+dataset comprising anomalies with diverse characteristics since the original
+test samples only include specific types of anomalies and may lead to biased
+evaluations. Experimental results demonstrate that our approach exhibits
+promising potential for generalizing effectively to various unforeseen
+anomalies encountered in real-world scenarios.
+
+
+
+
+
+ + ♻ ☆ A Robust and Interpretable Deep Learning Framework for Multi-modal + Registration via Keypoints + + +
+ We present KeyMorph, a deep learning-based image registration framework that +relies on automatically detecting corresponding keypoints. State-of-the-art +deep learning methods for registration often are not robust to large +misalignments, are not interpretable, and do not incorporate the symmetries of +the problem. In addition, most models produce only a single prediction at +test-time. Our core insight which addresses these shortcomings is that +corresponding keypoints between images can be used to obtain the optimal +transformation via a differentiable closed-form expression. We use this +observation to drive the end-to-end learning of keypoints tailored for the +registration task, and without knowledge of ground-truth keypoints. This +framework not only leads to substantially more robust registration but also +yields better interpretability, since the keypoints reveal which parts of the +image are driving the final alignment. Moreover, KeyMorph can be designed to be +equivariant under image translations and/or symmetric with respect to the input +image ordering. Finally, we show how multiple deformation fields can be +computed efficiently and in closed-form at test time corresponding to different +transformation variants. We demonstrate the proposed framework in solving 3D +affine and spline-based registration of multi-modal brain MRI scans. In +particular, we show registration accuracy that surpasses current +state-of-the-art methods, especially in the context of large displacements. Our +code is available at https://github.com/alanqrwang/keymorph. + +
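+ A minimal sketch of the closed-form step, assuming an affine transform
+estimated by least squares from corresponding keypoints; in KeyMorph the
+keypoints themselves come from the learned detector, and the spline-based
+variant replaces the affine solve.
+
+```python
+import numpy as np
+
+def closed_form_affine(moving_kp, fixed_kp):
+    """Least-squares affine transform mapping moving keypoints onto fixed ones.
+    moving_kp, fixed_kp: (N, 3) corresponding keypoints. Returns a 3x4 matrix A
+    such that fixed ≈ [moving, 1] @ A.T; the solve is differentiable, which is
+    what lets keypoint detection be trained end-to-end for registration."""
+    n = moving_kp.shape[0]
+    X = np.hstack([moving_kp, np.ones((n, 1))])        # (N, 4) homogeneous coords
+    A, *_ = np.linalg.lstsq(X, fixed_kp, rcond=None)   # (4, 3)
+    return A.T                                         # (3, 4)
+
+rng = np.random.default_rng(0)
+pts = rng.normal(size=(50, 3))
+true_A = np.hstack([np.eye(3) * 1.1, np.array([[2.0], [0.0], [-1.0]])])
+target = np.hstack([pts, np.ones((50, 1))]) @ true_A.T
+est = closed_form_affine(pts, target)
+print(np.allclose(est, true_A, atol=1e-6))   # True
+```
+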
+
+ comment: Accepted to Medical Image Analysis 2023 +
+
+
+
+
+ + ♻ ☆ Towards Geospatial Foundation Models via Continual Pretraining ICCV 2023 + + +
+ Geospatial technologies are becoming increasingly essential in our world for +a wide range of applications, including agriculture, urban planning, and +disaster response. To help improve the applicability and performance of deep +learning models on these geospatial tasks, various works have begun +investigating foundation models for this domain. Researchers have explored two +prominent approaches for introducing such models in geospatial applications, +but both have drawbacks in terms of limited performance benefit or prohibitive +training cost. Therefore, in this work, we propose a novel paradigm for +building highly effective geospatial foundation models with minimal resource +cost and carbon impact. We first construct a compact yet diverse dataset from +multiple sources to promote feature diversity, which we term GeoPile. Then, we +investigate the potential of continual pretraining from large-scale +ImageNet-22k models and propose a multi-objective continual pretraining +paradigm, which leverages the strong representations of ImageNet while +simultaneously providing the freedom to learn valuable in-domain features. Our +approach outperforms previous state-of-the-art geospatial pretraining methods +in an extensive evaluation on seven downstream datasets covering various tasks +such as change detection, classification, multi-label classification, semantic +segmentation, and super-resolution. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Model sparsification in deep learning promotes simpler, more interpretable +models with fewer parameters. This not only reduces the model's memory +footprint and computational needs but also shortens inference time. This work +focuses on creating sparse models optimized for multiple tasks with fewer +parameters. These parsimonious models also possess the potential to match or +outperform dense models in terms of performance. In this work, we introduce +channel-wise l1/l2 group sparsity in the shared convolutional layers parameters +(or weights) of the multi-task learning model. This approach facilitates the +removal of extraneous groups i.e., channels (due to l1 regularization) and also +imposes a penalty on the weights, further enhancing the learning efficiency for +all tasks (due to l2 regularization). We analyzed the results of group sparsity +in both single-task and multi-task settings on two widely-used Multi-Task +Learning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which +consist of three different computer vision tasks each, multi-task models with +approximately 70% sparsity outperform their dense equivalents. We also +investigate how changing the degree of sparsification influences the model's +performance, the overall sparsity percentage, the patterns of sparsity, and the +inference time. + +
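+ A minimal sketch of the channel-wise l1/l2 penalty on shared convolutional
+weights, with illustrative regularization weights: an l1 sum over the
+per-channel l2 norms (group lasso, which drives whole channels to zero) plus a
+plain l2 term on the same weights.
+
+```python
+import torch
+import torch.nn as nn
+
+def channelwise_group_sparsity(model, l1_weight=1e-4, l2_weight=1e-4):
+    """Group-lasso (l1 over per-channel l2 norms) plus l2 penalty on conv weights."""
+    penalty = torch.zeros(())
+    for module in model.modules():
+        if isinstance(module, nn.Conv2d):
+            w = module.weight                              # (out_ch, in_ch, kH, kW)
+            group_norms = w.flatten(1).norm(p=2, dim=1)    # one norm per output channel
+            penalty = penalty + l1_weight * group_norms.sum() \
+                              + l2_weight * w.pow(2).sum()
+    return penalty
+
+shared_encoder = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 32, 3))
+penalty = channelwise_group_sparsity(shared_encoder)   # added to the multi-task loss
+penalty.backward()
+print(shared_encoder[0].weight.grad.shape)
+```
+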
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Efficient Benchmarking (of Language Models) + + +
+ The increasing versatility of language models (LMs) has given rise to a new
+class of benchmarks that comprehensively assess a broad range of capabilities.
+Such benchmarks are associated with massive computational costs, reaching
+thousands of GPU hours per model. However, the efficiency aspect of these
+evaluation efforts has received little discussion in the literature. In this
+work, we present the problem of Efficient Benchmarking, namely, intelligently
+reducing the computation costs of LM evaluation without compromising
+reliability. Using the HELM benchmark as a test case, we investigate how
+different benchmark design choices affect the computation-reliability tradeoff.
+We propose to evaluate the reliability of such decisions by using a new
+measure, Decision Impact on Reliability (DIoR for short). We find, for example,
+that the current leader on HELM may change by merely removing a low-ranked
+model from the benchmark, and observe that a handful of examples suffice to
+obtain the correct benchmark ranking. Conversely, a slightly different choice
+of HELM scenarios varies ranking widely. Based on our findings, we outline a
+set of concrete recommendations for more efficient benchmark design and
+utilization practices, leading to dramatic cost savings with minimal loss of
+benchmark reliability, often reducing computation by 100x or more.
+
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Co-evolving Vector Quantization for ID-based Recommendation + + +
+ Category information plays a crucial role in enhancing the quality and
+personalization of recommendations. Nevertheless, item category information is
+not consistently available, particularly in the context of ID-based
+recommendations. In this work, we propose an alternative approach to
+automatically learn and generate entity (i.e., user and item) categorical
+information at different levels of granularity, specifically for ID-based
+recommendation. Specifically, we devise a co-evolving vector quantization
+framework, namely COVE, which enables the simultaneous learning and refinement
+of code representations and entity embeddings in an end-to-end manner, starting
+from randomly initialized states. With its high adaptability, COVE can be
+easily integrated into existing recommendation models. We validate the
+effectiveness of COVE on various recommendation tasks including list
+completion, collaborative filtering, and click-through rate prediction, across
+different recommendation models. We will publish the code and data for other
+researchers to reproduce our work.
+
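+ A minimal sketch of the vector-quantization building block such a co-evolving
+framework could rest on, using the standard codebook loss and straight-through
+estimator; COVE's co-evolution of codes and entity embeddings and its
+multi-level granularity are not reproduced here.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class VectorQuantizer(nn.Module):
+    """Plain vector-quantization step with a straight-through estimator."""
+    def __init__(self, num_codes=64, dim=32, beta=0.25):
+        super().__init__()
+        self.codebook = nn.Embedding(num_codes, dim)
+        self.beta = beta
+
+    def forward(self, z):                              # z: (B, dim) entity embeddings
+        dists = torch.cdist(z, self.codebook.weight)   # (B, num_codes)
+        codes = dists.argmin(dim=1)                    # discrete "category" per entity
+        q = self.codebook(codes)
+        loss = F.mse_loss(q, z.detach()) + self.beta * F.mse_loss(z, q.detach())
+        q = z + (q - z).detach()                       # straight-through gradient
+        return q, codes, loss
+
+vq = VectorQuantizer()
+q, codes, loss = vq(torch.randn(8, 32))
+print(q.shape, codes.shape, loss.item() >= 0)
+```
+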
+
+
+
+
+ + ☆ Context Aware Query Rewriting for Text Rankers using LLM + + +
+ Query rewriting refers to an established family of approaches that are
+applied to underspecified and ambiguous queries to overcome the vocabulary
+mismatch problem in document ranking. Queries are typically rewritten during
+query processing time for better query modelling for the downstream ranker.
+With the advent of large-language models (LLMs), there have been initial
+investigations into using generative approaches to generate pseudo documents to
+tackle this inherent vocabulary gap. In this work, we analyze the utility of
+LLMs for improved query rewriting for text ranking tasks. We find that there
+are two inherent limitations of using LLMs as query re-writers -- concept drift
+when using only queries as prompts and large inference costs during query
+processing. We adopt a simple, yet surprisingly effective, approach called
+context aware query rewriting (CAR) to leverage the benefits of LLMs for query
+understanding. Firstly, we rewrite ambiguous training queries by context-aware
+prompting of LLMs, where we use only relevant documents as context. Unlike
+existing approaches, we use LLM-based query rewriting only during the training
+phase. Eventually, a ranker is fine-tuned on the rewritten queries instead of
+the original queries during training. In our extensive experiments, we find
+that fine-tuning a ranker using re-written queries offers a significant
+improvement of up to 33% on the passage ranking task and up to 28% on the
+document ranking task when compared to the baseline performance of using
+original queries.
+
+
+
+
+
+ + ☆ Concentrating on the Impact: Consequence-based Explanations in + Recommender Systems + + +
+ Recommender systems assist users in decision-making, where the presentation +of recommended items and their explanations are critical factors for enhancing +the overall user experience. Although various methods for generating +explanations have been proposed, there is still room for improvement, +particularly for users who lack expertise in a specific item domain. In this +study, we introduce the novel concept of \textit{consequence-based +explanations}, a type of explanation that emphasizes the individual impact of +consuming a recommended item on the user, which makes the effect of following +recommendations clearer. We conducted an online user study to examine our +assumption about the appreciation of consequence-based explanations and their +impacts on different explanation aims in recommender systems. Our findings +highlight the importance of consequence-based explanations, which were +well-received by users and effectively improved user satisfaction in +recommender systems. These results provide valuable insights for designing +engaging explanations that can enhance the overall user experience in +decision-making. + +
+
+ comment: Preprint of the paper to be presented at IntRS'23: Joint Workshop on
+ Interfaces and Human Decision Making for Recommender Systems, September 18,
+ 2023, Singapore. The paper will be published in the workshop proceedings.
+
+
+
+
+
+ + ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims to learn graph-level representations for
+effective class assignment, has achieved remarkable progress, but this progress
+relies heavily on high-quality datasets with balanced class distributions. In
+practice, most real-world graph data naturally follows a long-tailed form,
+where the head classes contain far more samples than the tail classes; it is
+therefore essential to study graph-level classification over long-tailed data,
+a problem that remains largely unexplored. Moreover, most existing long-tailed
+learning methods in vision fail to jointly optimize representation learning and
+classifier training, and they neglect the mining of hard-to-classify classes.
+Directly applying such methods to graphs may lead to sub-optimal performance,
+since a model trained on graphs is more sensitive to the long-tailed
+distribution due to complex topological characteristics. Hence, in this paper,
+we propose a novel long-tailed graph-level classification framework via
+Collaborative Multi-expert Learning (CoMe) to tackle the problem. To balance
+the contributions of head and tail classes, we first develop balanced
+contrastive learning from the view of representation learning, and then design
+individual-expert classifier training based on hard class mining. In addition,
+we apply gated fusion and disentangled knowledge distillation among the
+multiple experts to promote collaboration in the multi-expert framework.
+Comprehensive experiments on seven widely used benchmark datasets demonstrate
+the superiority of our method, CoMe, over state-of-the-art baselines.
+
+
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ☆ Recommender AI Agent: Integrating Large Language Models for Interactive + Recommendations + + +
+ Recommender models excel at providing domain-specific item recommendations by
+leveraging extensive user behavior data. Despite their ability to act as
+lightweight domain experts, they struggle to perform versatile tasks such as
+providing explanations and engaging in conversations. On the other hand, large
+language models (LLMs) represent a significant step towards artificial general
+intelligence, showcasing remarkable capabilities in instruction comprehension,
+commonsense reasoning, and human interaction. However, LLMs lack the knowledge
+of domain-specific item catalogs and behavioral patterns, particularly in areas
+that diverge from general world knowledge, such as online e-commerce.
+Finetuning LLMs for each domain is neither economical nor efficient.
+ In this paper, we bridge the gap between recommender models and LLMs,
+combining their respective strengths to create a versatile and interactive
+recommender system. We introduce an efficient framework called RecAgent, which
+employs LLMs as the brain and recommender models as tools. We first outline a
+minimal set of essential tools required to transform LLMs into RecAgent. We
+then propose an efficient workflow within RecAgent for task execution,
+incorporating key components such as a memory bus, dynamic
+demonstration-augmented task planning, and reflection. RecAgent enables
+traditional recommender systems, such as ID-based matrix factorization models,
+to become interactive systems with a natural language interface through
+the integration of LLMs. Experimental results on several public datasets show
+that RecAgent achieves satisfactory performance as a conversational recommender
+system, outperforming general-purpose LLMs.
+
+
+
+ comment: 16 pages, 15 figures, 4 tables +
+
+
+
+
+ + ☆ AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR + Prediction + + +
+ Click-through rate (CTR) prediction is a crucial issue in recommendation +systems. There has been an emergence of various public CTR datasets. However, +existing datasets primarily suffer from the following limitations. Firstly, +users generally click different types of items from multiple scenarios, and +modeling from multiple scenarios can provide a more comprehensive understanding +of users. Existing datasets only include data for the same type of items from a +single scenario. Secondly, multi-modal features are essential in multi-scenario +prediction as they address the issue of inconsistent ID encoding between +different scenarios. The existing datasets are based on ID features and lack +multi-modal features. Third, a large-scale dataset can provide a more reliable +evaluation of models, fully reflecting the performance differences between +models. The scale of existing datasets is around 100 million, which is +relatively small compared to the real-world CTR prediction. To address these +limitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset +based on industrial data from Alipay. Specifically, AntM$^{2}$C provides the +following advantages: 1) It covers CTR data of 5 different types of items, +providing insights into the preferences of users for different items, including +advertisements, vouchers, mini-programs, contents, and videos. 2) Apart from +ID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text +and image features, which can effectively establish connections between items +with different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200 +features, including 200 million users and 6 million items. It is currently the +largest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several +typical CTR tasks and provide comparisons with baseline methods. The dataset +homepage is available at https://www.atecup.cn/home. + +
+
+
+
+
+ + ♻ ☆ Alleviating Video-Length Effect for Micro-video Recommendation + + +
+ Micro-video platforms such as TikTok are extremely popular nowadays. One
+important feature is that users no longer select videos of interest from a set;
+instead, they either watch the recommended video or skip to the next one. As a
+result, the length of time users spend watching becomes the most important
+signal for identifying preferences. However, our empirical data analysis has
+revealed a video-length effect: longer videos tend to receive higher average
+view times, so adopting such view-time labels for measuring user preferences
+can easily induce a biased model that favors longer videos. In this paper, we
+propose a Video Length Debiasing Recommendation (VLDRec) method to alleviate
+this effect for micro-video recommendation. VLDRec designs a data labeling
+approach and a sample generation module that better capture user preferences in
+a view-time-oriented manner. It further leverages multi-task learning to
+jointly optimize the above samples with the original biased ones. Extensive
+experiments show that VLDRec can improve users' view time by 1.81% and 11.32%
+on two real-world datasets, given a recommendation list of a fixed overall
+video length, compared with the best baseline method. Moreover, VLDRec is also
+more effective in matching users' interests in terms of video content.
+
+
+
+ comment: Accept by TOIS +
+
+
+
+
+ + ♻ ☆ ONCE: Boosting Content-based Recommendation with Both Open- and + Closed-source Large Language Models + + +
+ Personalized content-based recommender systems have become indispensable
+tools for users to navigate through the vast amount of content available on
+platforms like daily news websites and book recommendation services. However,
+existing recommenders face significant challenges in understanding the content
+of items. Large language models (LLMs), which possess deep semantic
+comprehension and extensive knowledge from pretraining, have proven to be
+effective in various natural language processing tasks. In this study, we
+explore the potential of leveraging both open- and closed-source LLMs to
+enhance content-based recommendation. With open-source LLMs, we utilize their
+deep layers as content encoders, enriching the representation of content at the
+embedding level. For closed-source LLMs, we employ prompting techniques to
+enrich the training data at the token level. Through comprehensive experiments,
+we demonstrate the high effectiveness of both types of LLMs and show the
+synergistic relationship between them. Notably, we observed a significant
+relative improvement of up to 19.32% compared to existing state-of-the-art
+recommendation models. These findings highlight the immense potential of both
+open- and closed-source LLMs in enhancing content-based recommendation
+systems. We will make our code and LLM-generated data available for other
+researchers to reproduce our results.
+
+
+
+
+
+
+ + ♻ ☆ Framework to Automatically Determine the Quality of Open Data Catalogs + + +
+ Data catalogs play a crucial role in modern data-driven organizations by +facilitating the discovery, understanding, and utilization of diverse data +assets. However, ensuring their quality and reliability is complex, especially +in open and large-scale data environments. This paper proposes a framework to +automatically determine the quality of open data catalogs, addressing the need +for efficient and reliable quality assessment mechanisms. Our framework can +analyze various core quality dimensions, such as accuracy, completeness, +consistency, scalability, and timeliness, offer several alternatives for the +assessment of compatibility and similarity across such catalogs as well as the +implementation of a set of non-core quality dimensions such as provenance, +readability, and licensing. The goal is to empower data-driven organizations to +make informed decisions based on trustworthy and well-curated data assets. The +source code that illustrates our approach can be downloaded from +https://www.github.com/jorge-martinez-gil/dataq/. + +
+
+ comment: 25 pages +
+
+
+
+
+ + ♻ ☆ Unsupervised Hashing with Similarity Distribution Calibration BMVC 2023 + + +
+ Unsupervised hashing methods typically aim to preserve the similarity between
+data points in a feature space by mapping them to binary hash codes. However,
+these methods often overlook the fact that the similarity between data points
+in the continuous feature space may not be preserved in the discrete hash code
+space, due to the limited similarity range of hash codes. The similarity range
+is bounded by the code length and can lead to a problem known as similarity
+collapse. That is, the positive and negative pairs of data points become less
+distinguishable from each other in the hash space. To alleviate this problem,
+in this paper a novel Similarity Distribution Calibration (SDC) method is
+introduced. SDC aligns the hash code similarity distribution towards a
+calibration distribution (e.g., beta distribution) with sufficient spread
+across the entire similarity range, thus alleviating the similarity collapse
+problem. Extensive experiments show that our SDC significantly outperforms
+state-of-the-art alternatives on coarse category-level and instance-level image
+retrieval. Code is available at https://github.com/kamwoh/sdc.
+
+
+
+ comment: BMVC 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 148 + +
+
+
+ + ☆ A Note on Randomized Kaczmarz Algorithm for Solving Doubly-Noisy Linear + Systems + + +
+ Large-scale linear systems, $Ax=b$, frequently arise in practice and demand +effective iterative solvers. Often, these systems are noisy due to operational +errors or faulty data-collection processes. In the past decade, the randomized +Kaczmarz (RK) algorithm has been studied extensively as an efficient iterative +solver for such systems. However, the convergence study of RK in the noisy +regime is limited and considers measurement noise in the right-hand side +vector, $b$. Unfortunately, in practice, that is not always the case; the +coefficient matrix $A$ can also be noisy. In this paper, we analyze the +convergence of RK for noisy linear systems when the coefficient matrix, $A$, is +corrupted with both additive and multiplicative noise, along with the noisy +vector, $b$. In our analyses, the quantity $\tilde R=\| \tilde A^{\dagger} +\|_2^2 \|\tilde A \|_F^2$ influences the convergence of RK, where $\tilde A$ +represents a noisy version of $A$. We claim that our analysis is robust and +realistically applicable, as we do not require information about the noiseless +coefficient matrix, $A$, and considering different conditions on noise, we can +control the convergence of RK. We substantiate our theoretical findings by +performing comprehensive numerical experiments. + +
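+ For context, the vanilla randomized Kaczmarz update that this analysis builds
+on is shown below as a minimal sketch (standard Strohmer-Vershynin row
+sampling; variable names are ours):
+
+```python
+import numpy as np
+
+def randomized_kaczmarz(A, b, iters=10_000, seed=0):
+    """Solve Ax = b approximately by projecting onto one row's hyperplane
+    per iteration, sampling rows with probability proportional to ||a_i||^2."""
+    rng = np.random.default_rng(seed)
+    m, n = A.shape
+    row_norms = np.linalg.norm(A, axis=1) ** 2
+    probs = row_norms / row_norms.sum()
+    x = np.zeros(n)
+    for _ in range(iters):
+        i = rng.choice(m, p=probs)
+        # Project the current iterate onto the hyperplane <a_i, x> = b_i
+        x += (b[i] - A[i] @ x) / row_norms[i] * A[i]
+    return x
+```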
+
+
+
+
+ + ☆ Learning to Taste: A Multimodal Wine Dataset + + +
+ We present WineSensed, a large multimodal wine dataset for studying the +relations between visual perception, language, and flavor. The dataset +encompasses 897k images of wine labels and 824k reviews of wines curated from +the Vivino platform. It has over 350k unique vintages, annotated with year, +region, rating, alcohol percentage, price, and grape composition. We obtained +fine-grained flavor annotations on a subset by conducting a wine-tasting +experiment with 256 participants who were asked to rank wines based on their +similarity in flavor, resulting in more than 5k pairwise flavor distances. We +propose a low-dimensional concept embedding algorithm that combines human +experience with automatic machine similarity kernels. We demonstrate that this +shared concept embedding space improves upon separate embedding spaces for +coarse flavor classification (alcohol percentage, country, grape, price, +rating) and aligns with the intricate human perception of flavor. + +
+
+
+
+
+ + ☆ Transformers as Support Vector Machines + + +
+ Since its inception in "Attention Is All You Need", transformer architecture +has led to revolutionary advancements in NLP. The attention layer within the +transformer admits a sequence of input tokens $X$ and makes them interact +through pairwise similarities computed as softmax$(XQK^\top X^\top)$, where +$(K,Q)$ are the trainable key-query parameters. In this work, we establish a +formal equivalence between the optimization geometry of self-attention and a +hard-margin SVM problem that separates optimal input tokens from non-optimal +tokens using linear constraints on the outer-products of token pairs. This +formalism allows us to characterize the implicit bias of 1-layer transformers +optimized with gradient descent: (1) Optimizing the attention layer with +vanishing regularization, parameterized by $(K,Q)$, converges in direction to +an SVM solution minimizing the nuclear norm of the combined parameter +$W=KQ^\top$. Instead, directly parameterizing by $W$ minimizes a Frobenius norm +objective. We characterize this convergence, highlighting that it can occur +toward locally-optimal directions rather than global ones. (2) Complementing +this, we prove the local/global directional convergence of gradient descent +under suitable geometric conditions. Importantly, we show that +over-parameterization catalyzes global convergence by ensuring the feasibility +of the SVM problem and by guaranteeing a benign optimization landscape devoid +of stationary points. (3) While our theory applies primarily to linear +prediction heads, we propose a more general SVM equivalence that predicts the +implicit bias with nonlinear heads. Our findings are applicable to arbitrary +datasets and their validity is verified via experiments. We also introduce +several open problems and research directions. We believe these findings +inspire the interpretation of transformers as a hierarchy of SVMs that +separates and selects optimal tokens. + +
+
+
+
+
+ + ☆ PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic + Occupancy Prediction + + +
+ Semantic segmentation in autonomous driving has been undergoing an evolution +from sparse point segmentation to dense voxel segmentation, where the objective +is to predict the semantic occupancy of each voxel in the concerned 3D space. +The dense nature of the prediction space has rendered existing efficient +2D-projection-based methods (e.g., bird's eye view, range view, etc.) +ineffective, as they can only describe a subspace of the 3D scene. To address +this, we propose a cylindrical tri-perspective view to represent point clouds +effectively and comprehensively and a PointOcc model to process them +efficiently. Considering the distance distribution of LiDAR point clouds, we +construct the tri-perspective view in the cylindrical coordinate system for +more fine-grained modeling of nearer areas. We employ spatial group pooling to +maintain structural details during projection and adopt 2D backbones to +efficiently process each TPV plane. Finally, we obtain the features of each +point by aggregating its projected features on each of the processed TPV planes +without the need for any post-processing. Extensive experiments on both 3D +occupancy prediction and LiDAR segmentation benchmarks demonstrate that the +proposed PointOcc achieves state-of-the-art performance with much faster speed. +Specifically, despite only using LiDAR, PointOcc significantly outperforms all +other methods, including multi-modal methods, with a large margin on the +OpenOccupancy benchmark. Code: https://github.com/wzzheng/PointOcc. + +
+
+ comment: Code is available at https://github.com/wzzheng/PointOcc +
+
+
+
+
+ + ☆ Language-Conditioned Path Planning + + +
+ Contact is at the core of robotic manipulation. At times, it is desired (e.g.
+manipulation and grasping), and at times, it is harmful (e.g. when avoiding
+obstacles). However, traditional path planning algorithms focus solely on
+collision-free paths, limiting their applicability in contact-rich tasks. To
+address this limitation, we propose the domain of Language-Conditioned Path
+Planning, where contact-awareness is incorporated into the path planning
+problem. As a first step in this domain, we propose Language-Conditioned
+Collision Functions (LACO), a novel approach that learns a collision function
+using only a single-view image, language prompt, and robot configuration. LACO
+predicts collisions between the robot and the environment, enabling flexible,
+conditional path planning without the need for manual object annotations, point
+cloud data, or ground-truth object meshes. In both simulation and the real
+world, we demonstrate that LACO can facilitate complex, nuanced path plans that
+allow for interaction with objects that are safe to collide with, rather than
+prohibiting any collision.
+
+
+
+ comment: Conference on Robot Learning, 2023 +
+
+
+
+
+ + ☆ GNFactor: Multi-Task Real Robot Learning with Generalizable Neural + Feature Fields + + +
+ It is a long-standing problem in robotics to develop agents capable of +executing diverse manipulation tasks from visual observations in unstructured +real-world environments. To achieve this goal, the robot needs to have a +comprehensive understanding of the 3D structure and semantics of the scene. In +this work, we present $\textbf{GNFactor}$, a visual behavior cloning agent for +multi-task robotic manipulation with $\textbf{G}$eneralizable $\textbf{N}$eural +feature $\textbf{F}$ields. GNFactor jointly optimizes a generalizable neural +field (GNF) as a reconstruction module and a Perceiver Transformer as a +decision-making module, leveraging a shared deep 3D voxel representation. To +incorporate semantics in 3D, the reconstruction module utilizes a +vision-language foundation model ($\textit{e.g.}$, Stable Diffusion) to distill +rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 +real robot tasks and perform detailed ablations on 10 RLBench tasks with a +limited number of demonstrations. We observe a substantial improvement of +GNFactor over current state-of-the-art methods in seen and unseen tasks, +demonstrating the strong generalization ability of GNFactor. Our project +website is https://yanjieze.com/GNFactor/ . + +
+
+ comment: CoRL 2023 Oral. Website: https://yanjieze.com/GNFactor/ +
+
+
+
+
+ + ☆ Federated Learning in UAV-Enhanced Networks: Joint Coverage and + Convergence Time Optimization + + +
+ Federated learning (FL) involves several devices that collaboratively train a +shared model without transferring their local data. FL reduces the +communication overhead, making it a promising learning method in UAV-enhanced +wireless networks with scarce energy resources. Despite the potential, +implementing FL in UAV-enhanced networks is challenging, as conventional UAV +placement methods that maximize coverage increase the FL delay significantly. +Moreover, the uncertainty and lack of a priori information about crucial +variables, such as channel quality, exacerbate the problem. In this paper, we +first analyze the statistical characteristics of a UAV-enhanced wireless sensor +network (WSN) with energy harvesting. We then develop a model and solution +based on the multi-objective multi-armed bandit theory to maximize the network +coverage while minimizing the FL delay. Besides, we propose another solution +that is particularly useful with large action sets and strict energy +constraints at the UAVs. Our proposal uses a scalarized best-arm identification +algorithm to find the optimal arms that maximize the ratio of the expected +reward to the expected energy cost by sequentially eliminating one or more arms +in each round. Then, we derive the upper bound on the error probability of our +multi-objective and cost-aware algorithm. Numerical results show the +effectiveness of our approach. + +
+
+
+
+
+ + ☆ Prediction of Diblock Copolymer Morphology via Machine Learning + + +
+ A machine learning approach is presented to accelerate the computation of +block polymer morphology evolution for large domains over long timescales. The +strategy exploits the separation of characteristic times between coarse-grained +particle evolution on the monomer scale and slow morphological evolution over +mesoscopic scales. In contrast to empirical continuum models, the proposed +approach learns stochastically driven defect annihilation processes directly +from particle-based simulations. A UNet architecture that respects different +boundary conditions is adopted, thereby allowing periodic and fixed substrate +boundary conditions of arbitrary shape. Physical concepts are also introduced +via the loss function and symmetries are incorporated via data augmentation. +The model is validated using three different use cases. Explainable artificial +intelligence methods are applied to visualize the morphology evolution over +time. This approach enables the generation of large system sizes and long +trajectories to investigate defect densities and their evolution under +different types of confinement. As an application, we demonstrate the +importance of accessing late-stage morphologies for understanding particle +diffusion inside a single block. This work has implications for directed +self-assembly and materials design in micro-electronics, battery materials, and +membranes. + +
+
+ comment: 51 pages, 11 figures, and 5 figures in the SI
+
+
+
+
+
+ + ☆ The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 + Language Variants + + +
+ We present Belebele, a multiple-choice machine reading comprehension (MRC) +dataset spanning 122 language variants. Significantly expanding the language +coverage of natural language understanding (NLU) benchmarks, this dataset +enables the evaluation of text models in high-, medium-, and low-resource +languages. Each question is based on a short passage from the Flores-200 +dataset and has four multiple-choice answers. The questions were carefully +curated to discriminate between models with different levels of general +language comprehension. The English dataset on its own proves difficult enough +to challenge state-of-the-art language models. Being fully parallel, this +dataset enables direct comparison of model performance across all languages. We +use this dataset to evaluate the capabilities of multilingual masked language +models (MLMs) and large language models (LLMs). We present extensive results +and find that despite significant cross-lingual transfer in English-centric +LLMs, much smaller MLMs pretrained on balanced multilingual data still +understand far more languages. We also observe that larger vocabulary size and +conscious vocabulary construction correlate with better performance on +low-resource languages. Overall, Belebele opens up new avenues for evaluating +and analyzing the multilingual capabilities of NLP systems. + +
+
+ comment: 27 pages, 13 figures +
+
+
+
+
+ + ☆ Information Theoretically Optimal Sample Complexity of Learning + Dynamical Directed Acyclic Graphs + + +
+ In this article, the optimal sample complexity of learning the underlying
+interactions/dependencies of a Linear Dynamical System (LDS) over a Directed
+Acyclic Graph (DAG) is studied. The sample complexity of learning a DAG's
+structure is well-studied for static systems, where the samples of nodal states
+are independent and identically distributed (i.i.d.). However, such a study is
+less explored for DAGs with dynamical systems, where the nodal states are
+temporally correlated. We refer to such a DAG underlying an LDS as a
+\emph{dynamical} DAG (DDAG). In particular, we consider a DDAG where the nodal
+dynamics are driven by unobserved exogenous noise sources that are wide-sense
+stationary (WSS) in time but are mutually uncorrelated, and have the same power
+spectral density (PSD). Inspired by the static settings, a metric and an
+algorithm based on the PSD matrix of the observed time series are proposed to
+reconstruct the DDAG. The equal noise PSD assumption can be relaxed such that
+identifiability conditions for DDAG reconstruction are not violated. For the
+LDS with WSS (sub) Gaussian exogenous noise sources, it is shown that the
+optimal sample complexity (or length of state trajectory) needed to learn the
+DDAG is $n=\Theta(q\log(p/q))$, where $p$ is the number of nodes and $q$ is the
+maximum number of parents per node. To prove the sample complexity upper bound,
+a concentration bound for the PSD estimation is derived, under two different
+sampling strategies. A matching min-max lower bound using generalized Fano's
+inequality is also provided, thus showing the order optimality of the proposed
+algorithm.
+
+
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Majorization-Minimization for sparse SVMs + + +
+ Several decades ago, Support Vector Machines (SVMs) were introduced for
+performing binary classification tasks, under a supervised framework. Nowadays,
+they often outperform other supervised methods and remain one of the most
+popular approaches in the machine learning arena. In this work, we investigate
+the training of SVMs through the minimization of a smooth, sparsity-promoting
+regularized squared hinge loss. This choice paves the way to the application of
+quick training methods built on majorization-minimization approaches,
+benefiting from the Lipschitz differentiability of the loss function. Moreover,
+the proposed approach allows us to handle sparsity-preserving regularizers that
+promote the selection of the most significant features, thus enhancing
+performance. Numerical tests and comparisons conducted on three different
+datasets demonstrate the good performance of the proposed methodology in terms
+of qualitative metrics (accuracy, precision, recall, and F1 score) as well as
+computational cost.
+
+
+
+
+
+
+ + ☆ Natural Quantum Monte Carlo Computation of Excited States + + +
+ We present a variational Monte Carlo algorithm for estimating the lowest +excited states of a quantum system which is a natural generalization of the +estimation of ground states. The method has no free parameters and requires no +explicit orthogonalization of the different states, instead transforming the +problem of finding excited states of a given system into that of finding the +ground state of an expanded system. Expected values of arbitrary observables +can be calculated, including off-diagonal expectations between different states +such as the transition dipole moment. Although the method is entirely general, +it works particularly well in conjunction with recent work on using neural +networks as variational Ansatze for many-electron systems, and we show that by +combining this method with the FermiNet and Psiformer Ansatze we can accurately +recover vertical excitation energies and oscillator strengths on molecules as +large as benzene. Beyond the examples on molecules presented here, we expect +this technique will be of great interest for applications of variational +quantum Monte Carlo to atomic, nuclear and condensed matter physics. + +
+
+
+
+
+ + ☆ Diffusion Models for Interferometric Satellite Aperture Radar + + +
+ Probabilistic Diffusion Models (PDMs) have recently emerged as a very
+promising class of generative models, achieving high performance in natural
+image generation. However, their performance on non-natural images, such as
+radar-based satellite data, remains largely unknown. Generating large
+amounts of synthetic (and especially labelled) satellite data is crucial to
+implement deep-learning approaches for the processing and analysis of
+(interferometric) satellite aperture radar data. Here, we leverage PDMs to
+generate several radar-based satellite image datasets. We show that PDMs
+succeed in generating images with complex and realistic structures, but that
+sampling time remains an issue. Indeed, accelerated sampling strategies, which
+work well on simple image datasets like MNIST, fail on our radar datasets. We
+provide a simple and versatile open-source codebase,
+https://github.com/thomaskerdreux/PDM_SAR_InSAR_generation, to train, sample,
+and evaluate PDMs using any dataset on a single GPU.
+
+
+
+
+
+
+ + ☆ FedDD: Toward Communication-efficient Federated Learning with + Differential Parameter Dropout + + +
+ Federated Learning (FL) requires frequent exchange of model parameters, which +leads to long communication delay, especially when the network environments of +clients vary greatly. Moreover, the parameter server needs to wait for the +slowest client (i.e., straggler, which may have the largest model size, lowest +computing capability or worst network condition) to upload parameters, which +may significantly degrade the communication efficiency. Commonly-used client +selection methods such as partial client selection would lead to the waste of +computing resources and weaken the generalization of the global model. To +tackle this problem, along a different line, in this paper, we advocate the +approach of model parameter dropout instead of client selection, and +accordingly propose a novel framework of Federated learning scheme with +Differential parameter Dropout (FedDD). FedDD consists of two key modules: +dropout rate allocation and uploaded parameter selection, which will optimize +the model parameter uploading ratios tailored to different clients' +heterogeneous conditions and also select the proper set of important model +parameters for uploading subject to clients' dropout rate constraints. +Specifically, the dropout rate allocation is formulated as a convex +optimization problem, taking system heterogeneity, data heterogeneity, and +model heterogeneity among clients into consideration. The uploaded parameter +selection strategy prioritizes on eliciting important parameters for uploading +to speedup convergence. Furthermore, we theoretically analyze the convergence +of the proposed FedDD scheme. Extensive performance evaluations demonstrate +that the proposed FedDD scheme can achieve outstanding performances in both +communication efficiency and model convergence, and also possesses a strong +generalization capability to data of rare classes. + +
+
+
+
+
+ + ☆ Latent Variable Multi-output Gaussian Processes for Hierarchical + Datasets + + +
+ Multi-output Gaussian processes (MOGPs) have been introduced to deal with +multiple tasks by exploiting the correlations between different outputs. +Generally, MOGPs models assume a flat correlation structure between the +outputs. However, such a formulation does not account for more elaborate +relationships, for instance, if several replicates were observed for each +output (which is a typical setting in biological experiments). This paper +proposes an extension of MOGPs for hierarchical datasets (i.e. datasets for +which the relationships between observations can be represented within a tree +structure). Our model defines a tailored kernel function accounting for +hierarchical structures in the data to capture different levels of correlations +while leveraging the introduction of latent variables to express the underlying +dependencies between outputs through a dedicated kernel. This latter feature is +expected to significantly improve scalability as the number of tasks increases. +An extensive experimental study involving both synthetic and real-world data +from genomics and motion capture is proposed to support our claims. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Irregular Traffic Time Series Forecasting Based on Asynchronous + Spatio-Temporal Graph Convolutional Network + + +
+ Accurate traffic forecasting at intersections governed by intelligent traffic +signals is critical for the advancement of an effective intelligent traffic +signal control system. However, due to the irregular traffic time series +produced by intelligent intersections, the traffic forecasting task becomes +much more intractable and imposes three major new challenges: 1) asynchronous +spatial dependency, 2) irregular temporal dependency among traffic data, and 3) +variable-length sequence to be predicted, which severely impede the performance +of current traffic forecasting methods. To this end, we propose an Asynchronous +Spatio-tEmporal graph convolutional nEtwoRk (ASeer) to predict the traffic +states of the lanes entering intelligent intersections in a future time window. +Specifically, by linking lanes via a traffic diffusion graph, we first propose +an Asynchronous Graph Diffusion Network to model the asynchronous spatial +dependency between the time-misaligned traffic state measurements of lanes. +After that, to capture the temporal dependency within irregular traffic state +sequence, a learnable personalized time encoding is devised to embed the +continuous time for each lane. Then we propose a Transformable Time-aware +Convolution Network that learns meta-filters to derive time-aware convolution +filters with transformable filter sizes for efficient temporal convolution on +the irregular sequence. Furthermore, a Semi-Autoregressive Prediction Network +consisting of a state evolution unit and a semiautoregressive predictor is +designed to effectively and efficiently predict variable-length traffic state +sequences. Extensive experiments on two real-world datasets demonstrate the +effectiveness of ASeer in six metrics. + +
+
+
+
+
+ + ☆ Rank Collapse Causes Over-Smoothing and Over-Correlation in Graph Neural + Networks + + +
+ Our study reveals new theoretical insights into over-smoothing and feature +over-correlation in deep graph neural networks. We show the prevalence of +invariant subspaces, demonstrating a fixed relative behavior that is unaffected +by feature transformations. Our work clarifies recent observations related to +convergence to a constant state and a potential over-separation of node states, +as the amplification of subspaces only depends on the spectrum of the +aggregation function. In linear scenarios, this leads to node representations +being dominated by a low-dimensional subspace with an asymptotic convergence +rate independent of the feature transformations. This causes a rank collapse of +the node representations, resulting in over-smoothing when smooth vectors span +this subspace, and over-correlation even when over-smoothing is avoided. Guided +by our theory, we propose a sum of Kronecker products as a beneficial property +that can provably prevent over-smoothing, over-correlation, and rank collapse. +We empirically extend our insights to the non-linear case, demonstrating the +inability of existing models to capture linearly independent features. + +
+
+
+
+
+ + ☆ Joint Semantic-Native Communication and Inference via Minimal Simplicial + Structures + + +
+ In this work, we study the problem of semantic communication and inference,
+in which a student agent (i.e. mobile device) queries a teacher agent (i.e.
+cloud server) to generate higher-order data semantics living in a simplicial
+complex. Specifically, the teacher first maps its data into a k-order
+simplicial complex and learns its high-order correlations. For effective
+communication and inference, the teacher seeks minimally sufficient and
+invariant semantic structures prior to conveying information. These minimal
+simplicial structures are found by judiciously removing simplices selected via
+the Hodge Laplacians without compromising the inference query accuracy.
+Subsequently, the student locally runs its own set of queries based on a masked
+simplicial convolutional autoencoder (SCAE), leveraging both local and remote
+teacher knowledge. Numerical results corroborate the effectiveness of the
+proposed approach in terms of improving inference query accuracy under
+different channel conditions and simplicial structures. Experiments on a
+coauthorship dataset show that removing simplices by ranking the Laplacian
+values yields an 85% reduction in payload size without sacrificing accuracy.
+Joint semantic communication and inference by masked SCAE improves query
+accuracy by 25% compared to local student-based queries and by 15% compared to
+remote teacher-based queries. Finally, incorporating channel semantics is shown
+to effectively improve inference accuracy, notably at low SNR values.
+
+
+
+
+
+
+ + ☆ StratMed: Relevance Stratification for Low-resource Medication + Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating +demands, AI-based clinical tasks have become paramount. Medication +recommendation, as a sub-domain, aims to amalgamate longitudinal patient +history with medical knowledge, assisting physicians in prescribing safer and +more accurate medication combinations. Existing methods overlook the inherent +long-tail distribution in medical data, lacking balanced representation between +head and tail data, which leads to sub-optimal model performance. To address +this challenge, we introduce StratMed, a model that incorporates an innovative +relevance stratification mechanism. It harmonizes discrepancies in data +long-tail distribution and strikes a balance between the safety and accuracy of +medication combinations. Specifically, we first construct a pre-training method +using deep learning networks to obtain entity representation. After that, we +design a pyramid-like data stratification method to obtain more generalized +entity relationships by reinforcing the features of unpopular entities. Based +on this relationship, we designed two graph structures to express medication +precision and safety at the same level to obtain visit representations. +Finally, the patient's historical clinical information is fitted to generate +medication combinations for the current health condition. Experiments on the +MIMIC-III dataset demonstrate that our method has outperformed current +state-of-the-art methods in four evaluation metrics (including safety and +accuracy). + +
+
+
+
+
+ + ☆ Efficacy of Neural Prediction-Based NAS for Zero-Shot NAS Paradigm + + +
+ In prediction-based Neural Architecture Search (NAS), performance indicators +derived from graph convolutional networks have shown significant success. These +indicators, achieved by representing feed-forward structures as component +graphs through one-hot encoding, face a limitation: their inability to evaluate +architecture performance across varying search spaces. In contrast, handcrafted +performance indicators (zero-shot NAS), which use the same architecture with +random initialization, can generalize across multiple search spaces. Addressing +this limitation, we propose a novel approach for zero-shot NAS using deep +learning. Our method employs Fourier sum of sines encoding for convolutional +kernels, enabling the construction of a computational feed-forward graph with a +structure similar to the architecture under evaluation. These encodings are +learnable and offer a comprehensive view of the architecture's topological +information. An accompanying multi-layer perceptron (MLP) then ranks these +architectures based on their encodings. Experimental results show that our +approach surpasses previous methods using graph convolutional networks in terms +of correlation on the NAS-Bench-201 dataset and exhibits a higher convergence +rate. Moreover, our extracted feature representation trained on each +NAS-Benchmark is transferable to other NAS-Benchmarks, showing promising +generalizability across multiple search spaces. The code is available at: +https://github.com/minh1409/DFT-NPZS-NAS + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Constructing Indoor Region-based Radio Map without Location Labels + + +
+ Radio map construction requires a large amount of radio measurement data with +location labels, which imposes a high deployment cost. This paper develops a +region-based radio map from received signal strength (RSS) measurements without +location labels. The construction is based on a set of blindly collected RSS +measurement data from a device that visits each region in an indoor area +exactly once, where the footprints and timestamps are not recorded. The main +challenge is to cluster the RSS data and match clusters with the physical +regions. Classical clustering algorithms fail to work as the RSS data naturally +appears as non-clustered due to multipaths and noise. In this paper, a signal +subspace model with a sequential prior is constructed for the RSS data, and an +integrated segmentation and clustering algorithm is developed, which is shown +to find the globally optimal solution in a special case. Furthermore, the +clustered data is matched with the physical regions using a graph-based +approach. Based on real measurements from an office space, the proposed scheme +reduces the region localization error by roughly 50% compared to a weighted +centroid localization (WCL) baseline, and it even outperforms some supervised +localization schemes, including k-nearest neighbor (KNN), support vector +machine (SVM), and deep neural network (DNN), which require labeled data for +training. + +
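+ For reference, the weighted centroid localization (WCL) baseline mentioned
+above can be sketched as follows (the particular RSS-to-weight mapping is an
+assumption; variants exist):
+
+```python
+import numpy as np
+
+def weighted_centroid(anchor_positions, rss_dbm):
+    """WCL baseline: anchors with stronger RSS pull the position estimate
+    harder; weights here use linear-scale received power."""
+    w = 10.0 ** (np.asarray(rss_dbm, dtype=float) / 10.0)
+    p = np.asarray(anchor_positions, dtype=float)
+    return (w[:, None] * p).sum(axis=0) / w.sum()
+```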
+
+
+
+
+ + ☆ Training Neural Networks Using Reproducing Kernel Space Interpolation + and Model Reduction + + +
+ We introduce and study the theory of training neural networks using
+interpolation techniques from reproducing kernel Hilbert space theory. We
+generalize the method to Krein spaces, and show that widely-used neural network
+architectures are subsets of reproducing kernel Krein spaces (RKKS). We study
+the concept of "associated Hilbert spaces" of RKKS and develop techniques to
+improve upon the expressivity of various activation functions. Next, using
+concepts from the theory of functions of several complex variables, we prove a
+computationally applicable, multidimensional generalization of the celebrated
+Adamjan-Arov-Krein (AAK) theorem. The theorem yields a novel class of neural
+networks, called Prolongation Neural Networks (PNN). We demonstrate that, by
+applying the multidimensional AAK theorem to obtain a PNN, one can achieve
+performance superior to both our interpolatory methods and current
+state-of-the-art methods in noisy environments. We provide useful illustrations
+of our methods in practice.
+
+
+
+
+
+
+ + ☆ Moreau Envelope ADMM for Decentralized Weakly Convex Optimization + + +
+ This paper proposes a proximal variant of the alternating direction method of
+multipliers (ADMM) for distributed optimization. Although current versions of
+the ADMM algorithm provide promising numerical results in producing solutions
+that are close to optimal for many convex and non-convex optimization problems,
+it remains unclear whether they can converge to a stationary point for weakly
+convex and locally non-smooth functions. Through our analysis using the Moreau
+envelope function, we demonstrate that the proposed method, MADM, can indeed
+converge to a stationary point under mild conditions. Our analysis also
+includes computing bounds on the amount of change in the dual variable update
+step by relating the gradient of the Moreau envelope function to the proximal
+function. Furthermore, the results of our numerical experiments indicate that
+our method is faster and more robust than widely-used approaches.
+
+
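+ For reference, the standard Moreau envelope of a function $f$ with parameter
+$\lambda > 0$, which underpins this kind of convergence analysis, is
+$M_{\lambda f}(x) = \min_{y} \{ f(y) + \tfrac{1}{2\lambda}\|y - x\|^2 \}$, and
+its gradient (where defined) relates to the proximal operator via
+$\nabla M_{\lambda f}(x) = \tfrac{1}{\lambda}(x - \mathrm{prox}_{\lambda f}(x))$.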
+
+
+
+
+ + ☆ US-SFNet: A Spatial-Frequency Domain-based Multi-branch Network for + Cervical Lymph Node Lesions Diagnoses in Ultrasound Images + + +
+ Ultrasound imaging serves as a pivotal tool for diagnosing cervical lymph
+node lesions. However, the diagnoses of these images largely hinge on the
+expertise of medical practitioners, rendering the process susceptible to
+misdiagnoses. Although rapidly developing deep learning has substantially
+improved the diagnoses of diverse ultrasound images, there remains a
+conspicuous research gap concerning cervical lymph nodes. The objective of our
+work is to accurately diagnose cervical lymph node lesions by leveraging a deep
+learning model. To this end, we first collected 3392 images containing normal
+lymph nodes, benign lymph node lesions, malignant primary lymph node lesions,
+and malignant metastatic lymph node lesions. Given that ultrasound images are
+generated by the reflection and scattering of sound waves across varied bodily
+tissues, we proposed the Conv-FFT Block. It integrates convolutional operations
+with the fast Fourier transform to more astutely model the images. Building
+upon this foundation, we designed a novel architecture, named US-SFNet. This
+architecture not only discerns variances in ultrasound images from the spatial
+domain but also adeptly captures microstructural alterations across various
+lesions in the frequency domain. To ascertain the potential of US-SFNet, we
+benchmarked it against 12 popular architectures through five-fold
+cross-validation. The results show that US-SFNet achieves state-of-the-art
+performance, with 92.89% accuracy, 90.46% precision, 89.95% sensitivity, and
+97.49% specificity.
+
+
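+ A rough sketch of the general spatial-plus-frequency idea (our own toy layer,
+not the paper's exact Conv-FFT Block or US-SFNet architecture):
+
+```python
+import torch
+import torch.nn as nn
+
+class ConvFFTBlock(nn.Module):
+    """Toy block: one plain convolutional branch plus one branch that filters
+    the FFT magnitude before transforming back to the spatial domain."""
+    def __init__(self, channels: int):
+        super().__init__()
+        self.spatial = nn.Conv2d(channels, channels, 3, padding=1)
+        self.freq = nn.Conv2d(channels, channels, 1)
+
+    def forward(self, x):
+        spec = torch.fft.rfft2(x, norm="ortho")            # frequency domain
+        mag = self.freq(torch.abs(spec))                    # filter magnitudes
+        freq_feat = torch.fft.irfft2(
+            torch.polar(mag, torch.angle(spec)), s=x.shape[-2:], norm="ortho")
+        return torch.relu(self.spatial(x) + freq_feat)      # fuse both domains
+```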
+
+
+
+
+ + ☆ Robust Networked Federated Learning for Localization + + +
+ This paper addresses the problem of localization, which is inherently +non-convex and non-smooth in a federated setting where the data is distributed +across a multitude of devices. Due to the decentralized nature of federated +environments, distributed learning becomes essential for scalability and +adaptability. Moreover, these environments are often plagued by outlier data, +which presents substantial challenges to conventional methods, particularly in +maintaining estimation accuracy and ensuring algorithm convergence. To mitigate +these challenges, we propose a method that adopts an $L_1$-norm robust +formulation within a distributed sub-gradient framework, explicitly designed to +handle these obstacles. Our approach addresses the problem in its original +form, without resorting to iterative simplifications or approximations, +resulting in enhanced computational efficiency and improved estimation +accuracy. We demonstrate that our method converges to a stationary point, +highlighting its effectiveness and reliability. Through numerical simulations, +we confirm the superior performance of our approach, notably in outlier-rich +environments, which surpasses existing state-of-the-art localization methods. + +
+
+
+
+
+ + ☆ Robust Representation Learning for Unreliable Partial Label Learning + + +
+ Partial Label Learning (PLL) is a type of weakly supervised learning where
+each training instance is assigned a set of candidate labels, but only one
+label is the ground-truth. However, this idealistic assumption may not always
+hold due to potential annotation inaccuracies, meaning the ground-truth may not
+be present in the candidate label set. This is known as Unreliable Partial
+Label Learning (UPLL), which introduces additional complexity due to the
+inherent unreliability and ambiguity of partial labels, often resulting in
+sub-optimal performance with existing methods. To address this challenge, we
+propose the Unreliability-Robust Representation Learning framework (URRL),
+which leverages unreliability-robust contrastive learning to help the model
+fortify against unreliable partial labels effectively. Concurrently, we propose
+a dual strategy that combines KNN-based candidate label set correction and
+consistency-regularization-based label disambiguation to refine label quality
+and enhance the ability of representation learning within the URRL framework.
+Extensive experiments demonstrate that the proposed method outperforms
+state-of-the-art PLL methods on various datasets with diverse degrees of
+unreliability and ambiguity. Furthermore, we provide a theoretical analysis of
+our approach from the perspective of the expectation maximization (EM)
+algorithm. Upon acceptance, we pledge to make the code publicly accessible.
+
+
+
+
+
+
+ + ☆ Everyone Can Attack: Repurpose Lossy Compression as a Natural Backdoor + Attack + + +
+ The vulnerabilities to backdoor attacks have recently threatened the +trustworthiness of machine learning models in practical applications. +Conventional wisdom suggests that not everyone can be an attacker since the +process of designing the trigger generation algorithm often involves +significant effort and extensive experimentation to ensure the attack's +stealthiness and effectiveness. Alternatively, this paper shows that there +exists a more severe backdoor threat: anyone can exploit an easily-accessible +algorithm for silent backdoor attacks. Specifically, this attacker can employ +the widely-used lossy image compression from a plethora of compression tools to +effortlessly inject a trigger pattern into an image without leaving any +noticeable trace; i.e., the generated triggers are natural artifacts. One does +not require extensive knowledge to click on the "convert" or "save as" button +while using tools for lossy image compression. Via this attack, the adversary +does not need to design a trigger generator as seen in prior works and only +requires poisoning the data. Empirically, the proposed attack consistently +achieves 100% attack success rate in several benchmark datasets such as MNIST, +CIFAR-10, GTSRB and CelebA. More significantly, the proposed attack can still +achieve almost 100% attack success rate with very small (approximately 10%) +poisoning rates in the clean label setting. The generated trigger of the +proposed attack using one lossy compression algorithm is also transferable +across other related compression algorithms, exacerbating the severity of this +backdoor threat. This work takes another crucial step toward understanding the +extensive risks of backdoor attacks in practice, urging practitioners to +investigate similar attacks and relevant backdoor mitigation methods. + +
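+ The attack described above needs nothing more exotic than an off-the-shelf
+lossy codec; a hedged sketch of the poisoning step (JPEG chosen as an example
+codec, parameters are ours):
+
+```python
+import io
+from PIL import Image
+
+def jpeg_trigger(img: Image.Image, quality: int = 10) -> Image.Image:
+    """Aggressively JPEG-compress an image; the compression artifacts act as
+    a natural-looking trigger pattern."""
+    buf = io.BytesIO()
+    img.convert("RGB").save(buf, format="JPEG", quality=quality)
+    buf.seek(0)
+    return Image.open(buf).convert("RGB")
+
+# Poisoning sketch: apply jpeg_trigger to a small fraction of training images
+# and, in the dirty-label variant, relabel them with the attacker's target class.
+```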
+
+ comment: 14 pages. This paper shows everyone can mount a powerful and stealthy + backdoor attack with the widely-used lossy image compression +
+
+
+
+
+ + ☆ Everything, Everywhere All in One Evaluation: Using Multiverse Analysis + to Evaluate the Influence of Model Design Decisions on Algorithmic Fairness + + +
+ A vast number of systems across the world use algorithmic decision making +(ADM) to (partially) automate decisions that have previously been made by +humans. When designed well, these systems promise more objective decisions +while saving large amounts of resources and freeing up human time. However, +when ADM systems are not designed well, they can lead to unfair decisions which +discriminate against societal groups. The downstream effects of ADMs critically +depend on the decisions made during the systems' design and implementation, as +biases in data can be mitigated or reinforced along the modeling pipeline. Many +of these design decisions are made implicitly, without knowing exactly how they +will influence the final system. It is therefore important to make explicit the +decisions made during the design of ADM systems and understand how these +decisions affect the fairness of the resulting system. + To study this issue, we draw on insights from the field of psychology and +introduce the method of multiverse analysis for algorithmic fairness. In our +proposed method, we turn implicit design decisions into explicit ones and +demonstrate their fairness implications. By combining decisions, we create a +grid of all possible "universes" of decision combinations. For each of these +universes, we compute metrics of fairness and performance. Using the resulting +dataset, one can see how and which decisions impact fairness. We demonstrate +how multiverse analyses can be used to better understand variability and +robustness of algorithmic fairness using an exemplary case study of predicting +public health coverage of vulnerable populations for potential interventions. +Our results illustrate how decisions during the design of a machine learning +system can have surprising effects on its fairness and how to detect these +effects using multiverse analysis. + +
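+ The mechanics of enumerating "universes" are straightforward; a minimal sketch
+(the listed design decisions are hypothetical placeholders, not the ones from
+the case study):
+
+```python
+from itertools import product
+
+# Hypothetical design decisions of an ADM pipeline.
+decision_grid = {
+    "model": ["logistic_regression", "random_forest"],
+    "drop_sensitive_feature": [True, False],
+    "decision_threshold": [0.4, 0.5, 0.6],
+}
+
+universes = [dict(zip(decision_grid, combo))
+             for combo in product(*decision_grid.values())]
+
+for universe in universes:
+    # Train the pipeline specified by `universe`, then record fairness metrics
+    # (e.g., demographic parity difference) alongside performance metrics
+    # (e.g., accuracy) for later comparison across all universes.
+    pass
+```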
+
+
+
+
+ + ☆ Branches of a Tree: Taking Derivatives of Programs with Discrete and + Branching Randomness in High Energy Physics + + +
+ We propose to apply several gradient estimation techniques to enable the +differentiation of programs with discrete randomness in High Energy Physics. +Such programs are common in High Energy Physics due to the presence of +branching processes and clustering-based analysis. Thus differentiating such +programs can open the way for gradient based optimization in the context of +detector design optimization, simulator tuning, or data analysis and +reconstruction optimization. We discuss several possible gradient estimation +strategies, including the recent Stochastic AD method, and compare them in +simplified detector design experiments. In doing so we develop, to the best of +our knowledge, the first fully differentiable branching program. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Dynamic nsNet2: Efficient Deep Noise Suppression with Early Exiting SP 2023 + + +
+ Although deep learning has made strides in the field of deep noise
+suppression, leveraging deep architectures on resource-constrained devices
+remains challenging. Therefore, we present an early-exiting model based on
+nsNet2 that provides several levels of accuracy and resource savings by halting
+computations at different stages. Moreover, we adapt the original architecture
+by splitting the information flow to take into account the injected dynamism.
+We show the trade-offs between performance and computational complexity based
+on established metrics.
+
+
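+ A generic early-exiting layout (not the exact nsNet2-based architecture) can
+be sketched as follows; every stage owns a lightweight mask head so inference
+may halt after any stage:
+
+```python
+import torch
+import torch.nn as nn
+
+class EarlyExitDenoiser(nn.Module):
+    """Toy early-exiting enhancer: each recurrent stage predicts its own
+    spectral mask, and computation stops once a chosen stage is reached."""
+    def __init__(self, n_bins: int = 257, n_stages: int = 3):
+        super().__init__()
+        self.stages = nn.ModuleList(
+            [nn.GRU(n_bins, n_bins, batch_first=True) for _ in range(n_stages)])
+        self.heads = nn.ModuleList(
+            [nn.Linear(n_bins, n_bins) for _ in range(n_stages)])
+
+    def forward(self, spec_mag, exit_after=None):
+        masks, x = [], spec_mag                   # (batch, frames, freq_bins)
+        for i, (stage, head) in enumerate(zip(self.stages, self.heads)):
+            x, _ = stage(x)
+            masks.append(torch.sigmoid(head(x)))  # per-stage mask estimate
+            if exit_after is not None and i >= exit_after:
+                break                             # skip the deeper stages
+        return masks
+```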
+
+ comment: Accepted at the MLSP 2023 +
+
+
+
+
+ + ☆ Communication-Efficient Decentralized Federated Learning via One-Bit + Compressive Sensing + + +
+ Decentralized federated learning (DFL) has gained popularity due to its
+practicality across various applications. Compared to the centralized version,
+training a shared model among a large number of nodes in DFL is more
+challenging, as there is no central server to coordinate the training process.
+Especially when distributed nodes suffer from limitations in communication or
+computational resources, DFL will experience extremely inefficient and unstable
+training. Motivated by these challenges, in this paper, we develop a novel
+algorithm based on the framework of the inexact alternating direction method
+(iADM). On one hand, our goal is to train a shared model with a sparsity
+constraint. This constraint enables us to leverage one-bit compressive sensing
+(1BCS), allowing transmission of one-bit information among neighbour nodes. On
+the other hand, communication between neighbour nodes occurs only at certain
+steps, reducing the number of communication rounds. Therefore, the algorithm
+exhibits notable communication efficiency. Additionally, as each node selects
+only a subset of neighbours to participate in the training, the algorithm is
+robust against stragglers. Moreover, computationally expensive terms are
+computed only once for several consecutive steps, and subproblems are solved
+inexactly using closed-form solutions, resulting in high computational
+efficiency. Finally, numerical experiments showcase the algorithm's
+effectiveness in both communication and computation.
+
+
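+ Two of the ingredients named above, the sparsity constraint and the one-bit
+messages exchanged between neighbours, can be illustrated in isolation (this is
+not the paper's iADM algorithm, just the building blocks):
+
+```python
+import numpy as np
+
+def one_bit_encode(w: np.ndarray) -> np.ndarray:
+    """Each neighbour receives only the signs of the local parameter vector."""
+    return np.sign(w).astype(np.int8)
+
+def hard_threshold(w: np.ndarray, k: int) -> np.ndarray:
+    """Enforce the sparsity constraint by keeping the k largest-magnitude
+    entries and zeroing the rest."""
+    out = np.zeros_like(w)
+    idx = np.argpartition(np.abs(w), -k)[-k:]
+    out[idx] = w[idx]
+    return out
+```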
+
+
+
+
+ + ☆ What can we learn from quantum convolutional neural networks? + + +
+ We can learn from analyzing quantum convolutional neural networks (QCNNs) +that: 1) working with quantum data can be perceived as embedding physical +system parameters through a hidden feature map; 2) their high performance for +quantum phase recognition can be attributed to generation of a very suitable +basis set during the ground state embedding, where quantum criticality of spin +models leads to basis functions with rapidly changing features; 3) pooling +layers of QCNNs are responsible for picking those basis functions that can +contribute to forming a high-performing decision boundary, and the learning +process corresponds to adapting the measurement such that few-qubit operators +are mapped to full-register observables; 4) generalization of QCNN models +strongly depends on the embedding type, and that rotation-based feature maps +with the Fourier basis require careful feature engineering; 5) accuracy and +generalization of QCNNs with readout based on a limited number of shots favor +the ground state embeddings and associated physics-informed models. We +demonstrate these points in simulation, where our results shed light on +classification for physical processes, relevant for applications in sensing. +Finally, we show that QCNNs with properly chosen ground state embeddings can be +used for fluid dynamics problems, expressing shock wave solutions with good +generalization and proven trainability. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Autoencoder-based Online Data Quality Monitoring for the CMS + Electromagnetic Calorimeter + + +
+ The online Data Quality Monitoring system (DQM) of the CMS electromagnetic
+calorimeter (ECAL) is a crucial operational tool that allows ECAL experts to
+quickly identify, localize, and diagnose a broad range of detector issues that
+would otherwise hinder physics-quality data taking. Although the existing ECAL
+DQM system has been continuously updated to respond to new problems, it remains
+one step behind newer and unforeseen issues. Using unsupervised deep learning,
+a real-time autoencoder-based anomaly detection system is developed that is
+able to detect ECAL anomalies unseen in past data. After accounting for spatial
+variations in the response of the ECAL and the temporal evolution of anomalies,
+the new system is able to efficiently detect anomalies while maintaining an
+estimated false discovery rate between $10^{-2}$ and $10^{-4}$, beating existing
+benchmarks by about two orders of magnitude. The real-world performance of the
+system is validated using anomalies found in 2018 and 2022 LHC collision data.
+Additionally, first results from deploying the autoencoder-based system in the
+CMS online DQM workflow for the ECAL barrel during Run 3 of the LHC are
+presented, showing its promising performance in detecting obscure issues that
+could have been missed in the existing DQM system.
+
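+ The generic recipe behind such systems can be sketched as follows, as a simplification
+that ignores the spatial and temporal corrections described above; the `autoencoder.predict`
+interface and the quantile-based threshold are assumptions: score each input by its
+reconstruction error and flag it when the score exceeds a threshold calibrated on
+known-good data.
+
+ import numpy as np
+
+ def anomaly_scores(autoencoder, x):
+     """Per-sample reconstruction error used as the anomaly score."""
+     recon = autoencoder.predict(x)                 # any model exposing a predict() method
+     return np.mean((x - recon) ** 2, axis=tuple(range(1, x.ndim)))
+
+ def threshold_for_false_positive_rate(scores_on_good_data, fpr=1e-3):
+     """Pick the score threshold so that only a fraction `fpr` of good data is flagged."""
+     return np.quantile(scores_on_good_data, 1.0 - fpr)
+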
+
+ comment: Submitted to the Proceedings of the 21st International Workshop on
+ Advanced Computing and Analysis Techniques in Physics Research (ACAT 2022)
+
+
+
+
+ + ☆ Generate Your Own Scotland: Satellite Image Generation Conditioned on + Maps + + +
+ Despite recent advancements in image generation, diffusion models still
+remain largely underexplored in Earth Observation. In this paper we show that
+state-of-the-art pretrained diffusion models can be conditioned on cartographic
+data to generate realistic satellite images. We provide two large datasets of
+paired OpenStreetMap images and satellite views over the region of Mainland
+Scotland and the Central Belt. We train a ControlNet model and qualitatively
+evaluate the results, demonstrating that both high image quality and
+faithfulness to the conditioning maps are achievable. Finally, we provide some
+insights into the opportunities and challenges of applying these models for
+remote sensing. Our model weights and code for creating the dataset are
+publicly available at https://github.com/miquel-espinosa/map-sat.
+
+
+ comment: 13 pages, 6 figures. preprint +
+
+
+
+
+ + ☆ Towards Long-Tailed Recognition for Graph Classification via + Collaborative Experts + + +
+ Graph classification, which aims to learn graph-level representations for
+effective class assignment, has achieved outstanding results, but these results
+rely heavily on high-quality datasets with balanced class distributions. In
+fact, most real-world graph data naturally follows a long-tailed form, where
+the head classes occupy far more samples than the tail classes; it is therefore
+essential to study graph-level classification over long-tailed data, yet this
+setting remains largely unexplored. Moreover, most existing long-tailed
+learning methods in vision fail to jointly optimize representation learning and
+classifier training, and neglect the mining of hard-to-classify classes.
+Directly applying existing methods to graphs may lead to sub-optimal
+performance, since a model trained on graphs is more sensitive to the
+long-tailed distribution due to complex topological characteristics. Hence, in
+this paper, we propose a novel long-tailed graph-level classification framework
+via Collaborative Multi-expert Learning (CoMe) to tackle the problem. To
+balance the contributions of head and tail classes, we first develop balanced
+contrastive learning from the view of representation learning, and then design
+individual-expert classifier training based on hard class mining. In addition,
+we apply gated fusion and disentangled knowledge distillation among the
+multiple experts to promote collaboration in the multi-expert framework.
+Comprehensive experiments are performed on seven widely used benchmark datasets
+to demonstrate the superiority of our method CoMe over state-of-the-art
+baselines.
+
+
+ comment: Accepted by IEEE Transactions on Big Data (TBD 2024) +
+
+
+
+
+ + ☆ A Causal Discovery Approach To Learn How Urban Form Shapes Sustainable + Mobility Across Continents + + +
+ Global sustainability requires low-carbon urban transport systems, shaped by
+adequate infrastructure, deployment of low-carbon transport modes, and shifts
+in travel behavior. To adequately implement alterations in infrastructure, it
+is essential to grasp the location-specific cause-and-effect mechanisms through
+which the built environment shapes travel. Yet, current research falls short in
+representing causal relationships between the 6D urban form variables and
+travel, generalizing across different regions, and modeling urban form effects
+at high spatial resolution. Here, we address all three gaps by utilizing a
+causal discovery and an explainable machine learning framework to detect urban
+form effects on intra-city travel based on high-resolution mobility data of six
+cities across three continents. We show that distance to the city center,
+demographics, and density all indirectly affect other urban form features. By
+considering the causal relationships, we find that location-specific influences
+align across cities, yet vary in magnitude. In addition, the spread of the city
+and the coverage of jobs across the city are the strongest determinants of
+travel-related emissions, highlighting the benefits of compact development.
+Differences in urban form effects across the cities call for a more holistic
+definition of 6D measures. Our work is a starting point for location-specific
+analysis of urban form effects on mobility behavior using causal discovery
+approaches, which is highly relevant for city planners and municipalities
+across continents.
+
+
+ comment: 22 pages, 13 figures, 4 tables +
+
+
+
+
+ + ☆ Towards Optimal Patch Size in Vision Transformers for Tumor Segmentation + + +
+ Detection of tumors in metastatic colorectal cancer (mCRC) plays an essential
+role in the early diagnosis and treatment of liver cancer. Deep learning models
+backboned by fully convolutional neural networks (FCNNs) have become the
+dominant models for segmenting 3D computerized tomography (CT) scans. However,
+since their convolution layers suffer from limited kernel size, they are not
+able to capture long-range dependencies and global context. To tackle this
+restriction, vision transformers have been introduced to address the locality
+of FCNN receptive fields. Although transformers can capture long-range
+features, their segmentation performance decreases with various tumor sizes due
+to the model's sensitivity to the input patch size. While finding an optimal
+patch size improves the performance of vision transformer-based models on
+segmentation tasks, it is a time-consuming and challenging procedure. This
+paper proposes a technique to select the vision transformer's optimal input
+multi-resolution image patch size based on the average volume of metastasis
+lesions. We further validated our suggested framework using a transfer-learning
+technique, demonstrating that the highest Dice similarity coefficient (DSC)
+performance was obtained by pre-training on training data with a larger tumor
+volume using the suggested ideal patch size and then training with a smaller
+one. We experimentally evaluate this idea through pre-training our model on a
+multi-resolution public dataset. Our model showed consistent and improved
+results when applied to our private multi-resolution mCRC dataset with a
+smaller average tumor volume. This study lays the groundwork for optimizing
+semantic segmentation of small objects using vision transformers. The
+implementation source code is available at:
+https://github.com/Ramtin-Mojtahedi/OVTPS.
+
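+ The underlying idea of tying the patch size to the average lesion size can be illustrated
+with a deliberately simple heuristic; the cube-equivalent-edge formula and the candidate
+sizes below are assumptions for illustration, not the paper's procedure.
+
+ import numpy as np
+
+ def suggest_patch_size(lesion_volumes_mm3, voxel_spacing_mm=(1.0, 1.0, 1.0),
+                        candidates=(8, 16, 32, 64)):
+     """Pick the candidate patch edge length closest to the average lesion size,
+     approximating each lesion as a cube (a deliberate simplification)."""
+     mean_volume = float(np.mean(lesion_volumes_mm3))
+     edge_mm = mean_volume ** (1.0 / 3.0)                 # cube-equivalent edge in mm
+     edge_vox = edge_mm / float(np.mean(voxel_spacing_mm))
+     return min(candidates, key=lambda c: abs(c - edge_vox))
+
+ print(suggest_patch_size([4200.0, 3100.0, 5000.0]))      # -> 16
+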
+
+
+
+
+ + ☆ Towards Spontaneous Style Modeling with Semi-supervised Pre-training for + Conversational Text-to-Speech Synthesis INTERSPEECH 2023 + + +
+ The spontaneous behavior that often occurs in conversations makes speech more
+human-like than reading-style speech. However, synthesizing spontaneous-style
+speech is challenging due to the lack of high-quality spontaneous datasets and
+the high cost of labeling spontaneous behavior. In this paper, we propose a
+semi-supervised pre-training method to increase the amount of spontaneous-style
+speech and spontaneous behavior labels. In the process of semi-supervised
+learning, both text and speech information are considered for detecting
+spontaneous behavior labels in speech. Moreover, a linguistic-aware encoder is
+used to model the relationships among the sentences in the conversation.
+Experimental results indicate that our proposed method achieves superior
+expressive speech synthesis performance with the ability to model spontaneous
+behavior in spontaneous-style speech and predict reasonable spontaneous
+behavior from text.
+
+
+ comment: Accepted by INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Development and validation of an interpretable machine learning-based + calculator for predicting 5-year weight trajectories after bariatric surgery: + a multinational retrospective cohort SOPHIA study + + +
+ Background: Weight loss trajectories after bariatric surgery vary widely
+between individuals, and predicting weight loss before the operation remains
+challenging. We aimed to develop a model using machine learning to provide
+individual preoperative prediction of 5-year weight loss trajectories after
+surgery. Methods: In this multinational retrospective observational study we
+enrolled adult participants (aged $\ge$18 years) from ten prospective cohorts
+(including ABOS [NCT01129297], BAREVAL [NCT02310178], the Swedish Obese
+Subjects study, and a large cohort from the Dutch Obesity Clinic [Nederlandse
+Obesitas Kliniek]) and two randomised trials (SleevePass [NCT00793143] and
+SM-BOSS [NCT00356213]) in Europe, the Americas, and Asia, with a 5-year
+follow-up after Roux-en-Y gastric bypass, sleeve gastrectomy, or gastric band.
+Patients with a previous history of bariatric surgery or large delays between
+scheduled and actual visits were excluded. The training cohort comprised
+patients from two centres in France (ABOS and BAREVAL). The primary outcome was
+BMI at 5 years. A model was developed using the least absolute shrinkage and
+selection operator to select variables and the classification and regression
+trees algorithm to build interpretable regression trees. The performance of the
+model was assessed through the median absolute deviation (MAD) and root mean
+squared error (RMSE) of BMI. Findings: 10 231 patients from 12 centres in ten
+countries were included in the analysis, corresponding to 30 602 patient-years.
+Among participants in all 12 cohorts, 7701 (75.3%) were female and 2530 (24.7%)
+were male. Among 434 baseline attributes available in the training cohort,
+seven variables were selected: height, weight, intervention type, age, diabetes
+status, diabetes duration, and smoking status. At 5 years, across external
+testing cohorts the overall mean MAD BMI was 2.8 kg/m${}^2$ (95% CI 2.6-3.0)
+and the mean RMSE BMI was 4.7 kg/m${}^2$ (4.4-5.0), and the mean difference
+between predicted and observed BMI was -0.3 kg/m${}^2$ (SD 4.7). This model is
+incorporated in an easy-to-use and interpretable web-based prediction tool to
+help inform clinical decisions before surgery. Interpretation: We developed a
+machine learning-based model, which is internationally validated, for
+predicting individual 5-year weight loss trajectories after three common
+bariatric interventions.
+
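+ The two-step modelling recipe described above (LASSO for variable selection, then an
+interpretable regression tree) can be sketched with scikit-learn on placeholder data; the
+synthetic features, hyperparameters, and tree depth are assumptions and not the study's
+actual pipeline.
+
+ import numpy as np
+ from sklearn.linear_model import LassoCV
+ from sklearn.tree import DecisionTreeRegressor, export_text
+
+ # X: baseline attributes, y: BMI at 5 years (placeholder synthetic data)
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(500, 30))
+ y = 30 + 2 * X[:, 0] - 1.5 * X[:, 3] + rng.normal(scale=1.0, size=500)
+
+ # Step 1: LASSO selects a small subset of predictive variables
+ lasso = LassoCV(cv=5).fit(X, y)
+ selected = np.flatnonzero(np.abs(lasso.coef_) > 1e-6)
+
+ # Step 2: a shallow regression tree on the selected variables for interpretability
+ tree = DecisionTreeRegressor(max_depth=3).fit(X[:, selected], y)
+ print(export_text(tree))
+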
+
+ comment: The Lancet Digital Health, 2023 +
+
+
+
+
+ + ☆ CL-MAE: Curriculum-Learned Masked Autoencoders + + +
+ Masked image modeling has been demonstrated as a powerful pretext task for +generating robust representations that can be effectively generalized across +multiple downstream tasks. Typically, this approach involves randomly masking +patches (tokens) in input images, with the masking strategy remaining unchanged +during training. In this paper, we propose a curriculum learning approach that +updates the masking strategy to continually increase the complexity of the +self-supervised reconstruction task. We conjecture that, by gradually +increasing the task complexity, the model can learn more sophisticated and +transferable representations. To facilitate this, we introduce a novel +learnable masking module that possesses the capability to generate masks of +different complexities, and integrate the proposed module into masked +autoencoders (MAE). Our module is jointly trained with the MAE, while adjusting +its behavior during training, transitioning from a partner to the MAE +(optimizing the same reconstruction loss) to an adversary (optimizing the +opposite loss), while passing through a neutral state. The transition between +these behaviors is smooth, being regulated by a factor that is multiplied with +the reconstruction loss of the masking module. The resulting training procedure +generates an easy-to-hard curriculum. We train our Curriculum-Learned Masked +Autoencoder (CL-MAE) on ImageNet and show that it exhibits superior +representation learning capabilities compared to MAE. The empirical results on +five downstream tasks confirm our conjecture, demonstrating that curriculum +learning can be successfully used to self-supervise masked autoencoders. + +
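+ The partner-to-adversary transition can be sketched as a scalar schedule multiplying the
+masking module's reconstruction loss; the linear schedule below is an assumption, chosen
+only to illustrate the sign flip from cooperative (+1) through neutral (0) to
+adversarial (-1).
+
+ def curriculum_factor(epoch, total_epochs):
+     """Sweep a multiplier from +1 (partner) through 0 (neutral) to -1 (adversary)."""
+     return 1.0 - 2.0 * epoch / max(total_epochs - 1, 1)
+
+ def masking_module_loss(reconstruction_loss, epoch, total_epochs):
+     """The mask generator minimizes the scaled reconstruction loss, so it first helps
+     the MAE and later learns masks that make reconstruction harder."""
+     return curriculum_factor(epoch, total_epochs) * reconstruction_loss
+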
+
+
+
+
+ + ☆ Document Layout Analysis on BaDLAD Dataset: A Comprehensive MViTv2 Based + Approach + + +
+ In the rapidly evolving digital era, the analysis of document layouts plays a
+pivotal role in automated information extraction and interpretation. In our
+work, we have trained the MViTv2 transformer architecture with a cascaded Mask
+R-CNN on the BaDLAD dataset to extract text boxes, paragraphs, images, and
+tables from a document. After training on 20365 document images for 36 epochs
+in a 3-phase cycle, we achieved a training loss of 0.2125 and a mask loss of
+0.19. Our work extends beyond training, delving into the exploration of
+potential enhancement avenues. We investigate the impact of rotation and flip
+augmentation, the effectiveness of slicing input images pre-inference, the
+implications of varying the resolution of the transformer backbone, and the
+potential of employing a dual-pass inference to uncover missed text boxes.
+Through these explorations, we observe a spectrum of outcomes, where some
+modifications result in tangible performance improvements, while others offer
+unique insights for future endeavors.
+
+
+
+
+
+ + ☆ MONDEO: Multistage Botnet Detection + + +
+ Mobile devices have become widespread and are now among the most used pieces
+of technology. Due to their characteristics, they have become major targets for
+botnet-related malware. FluBot is one example of botnet malware that infects
+mobile devices. In particular, FluBot is a DNS-based botnet that uses Domain
+Generation Algorithms (DGA) to establish communication with the Command and
+Control Server (C2). MONDEO is a multistage mechanism with a flexible design to
+detect DNS-based botnet malware. MONDEO is lightweight and can be deployed
+without requiring the deployment of software, agents, or configuration in
+mobile devices, allowing easy integration in core networks. MONDEO comprises
+four detection stages: Blacklisting/Whitelisting, Query rate analysis, DGA
+analysis, and Machine learning evaluation. It was created with the goal of
+processing streams of packets to identify attacks with high efficiency across
+the distinct phases. MONDEO was tested against several datasets to measure its
+efficiency and performance, achieving high performance with RandomForest
+classifiers. The implementation is available on GitHub.
+
+
+
+
+
+ + ☆ Forecasting Emergency Department Crowding with Advanced Machine Learning + Models and Multivariable Input + + +
+ Emergency department (ED) crowding is a significant threat to patient safety
+and it has been repeatedly associated with increased mortality. Forecasting
+future service demand has the potential to improve patient outcomes. Despite
+active research on the subject, several gaps remain: 1) proposed forecasting
+models have become outdated due to the rapid influx of advanced machine
+learning (ML) models, 2) the amount of multivariable input data has been
+limited, and 3) discrete performance metrics have been rarely reported. In this
+study, we document the performance of a set of advanced ML models in
+forecasting ED occupancy 24 hours ahead. We use electronic health record data
+from a large, combined ED with an extensive set of explanatory variables,
+including the availability of beds in catchment area hospitals, traffic data
+from local observation stations, weather variables, etc. We show that N-BEATS
+and LightGBM outperform benchmarks by 11% and 9%, respectively, and that DeepAR
+predicts next-day crowding with an AUC of 0.76 (95% CI 0.69-0.84). To the best
+of our knowledge, this is the first study to document the superiority of
+LightGBM and N-BEATS over statistical benchmarks in the context of ED
+forecasting.
+
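+ A bare-bones version of such a forecasting setup might look like the following; the
+synthetic data, feature choices, lags, and hyperparameters are assumptions, not the
+study's configuration.
+
+ import numpy as np
+ import pandas as pd
+ import lightgbm as lgb
+
+ # Hourly data with an occupancy column plus exogenous variables (bed availability,
+ # traffic, weather, ...); here filled with synthetic placeholders.
+ rng = np.random.default_rng(0)
+ n = 24 * 365
+ df = pd.DataFrame({
+     "occupancy": rng.poisson(40, n).astype(float),
+     "beds_available": rng.integers(0, 50, n),
+     "temperature": rng.normal(5, 10, n),
+ })
+
+ # Features: lagged occupancy and a calendar signal; target: occupancy 24 h ahead.
+ for lag in (24, 48, 168):
+     df[f"occ_lag_{lag}"] = df["occupancy"].shift(lag)
+ df["hour"] = np.arange(n) % 24
+ df["target"] = df["occupancy"].shift(-24)
+ df = df.dropna()
+
+ X = df.drop(columns=["target"])
+ split = len(df) - 24 * 30                              # hold out the last 30 days
+ model = lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05)
+ model.fit(X.iloc[:split], df["target"].iloc[:split])
+ preds = model.predict(X.iloc[split:])
+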
+
+
+
+
+ + ☆ Scalable Incomplete Multi-View Clustering with Structure Alignment + + +
+ The success of existing multi-view clustering (MVC) relies on the assumption
+that all views are complete. However, samples are usually only partially
+available due to data corruption or sensor malfunction, which motivates the
+research on incomplete multi-view clustering (IMVC). Although several
+anchor-based IMVC methods have been proposed to process large-scale incomplete
+data, they still suffer from the following drawbacks: i) most existing
+approaches neglect the inter-view discrepancy and enforce cross-view
+representations to be consistent, which corrupts the representation capability
+of the model; ii) due to the sample disparity between different views, the
+learned anchors might be misaligned, which we refer to as the Anchor-Unaligned
+Problem for Incomplete data (AUP-ID). The AUP-ID causes inaccurate graph fusion
+and degrades clustering performance. To tackle these issues, we propose a novel
+incomplete anchor graph learning framework termed Scalable Incomplete
+Multi-View Clustering with Structure Alignment (SIMVC-SA). Specifically, we
+construct view-specific anchor graphs to capture the complementary information
+from different views. In order to solve the AUP-ID, we propose a novel
+structure alignment module to refine the cross-view anchor correspondence.
+Meanwhile, the anchor graph construction and alignment are jointly optimized in
+our unified framework to enhance clustering quality. By constructing anchor
+graphs instead of full graphs, the time and space complexity of the proposed
+SIMVC-SA is proven to be linearly correlated with the number of samples.
+Extensive experiments on seven incomplete benchmark datasets demonstrate the
+effectiveness and efficiency of our proposed method. Our code is publicly
+available at https://github.com/wy1019/SIMVC-SA.
+
+
+
+
+
+ + ☆ On a Connection between Differential Games, Optimal Control, and + Energy-based Models for Multi-Agent Interactions ICML 2023 + + +
+ Game theory offers an interpretable mathematical framework for modeling +multi-agent interactions. However, its applicability in real-world robotics +applications is hindered by several challenges, such as unknown agents' +preferences and goals. To address these challenges, we show a connection +between differential games, optimal control, and energy-based models and +demonstrate how existing approaches can be unified under our proposed +Energy-based Potential Game formulation. Building upon this formulation, this +work introduces a new end-to-end learning application that combines neural +networks for game-parameter inference with a differentiable game-theoretic +optimization layer, acting as an inductive bias. The experiments using +simulated mobile robot pedestrian interactions and real-world automated driving +data provide empirical evidence that the game-theoretic layer improves the +predictive performance of various neural network backbones. + +
+
+ comment: International Conference on Machine Learning, Workshop on New + Frontiers in Learning, Control, and Dynamical Systems (ICML 2023 + Frontiers4LCD) +
+
+
+
+
+ + ☆ Conditioning Score-Based Generative Models by Neuro-Symbolic Constraints + + +
+ Score-based and diffusion models have emerged as effective approaches for
+both conditional and unconditional generation. Still, conditional generation
+relies either on a specific training of a conditional model or on classifier
+guidance, which requires training a noise-dependent classifier, even when a
+classifier for uncorrupted data is given. We propose an approach to sample from
+unconditional score-based generative models enforcing arbitrary logical
+constraints, without any additional training. Firstly, we show how to
+manipulate the learned score in order to sample from an un-normalized
+distribution conditional on a user-defined constraint. Then, we define a
+flexible and numerically stable neuro-symbolic framework for encoding soft
+logical constraints. Combining these two ingredients, we obtain a general, but
+approximate, conditional sampling algorithm. We further develop effective
+heuristics aimed at improving the approximation. Finally, we show the
+effectiveness of our approach for various types of constraints and data:
+tabular data, images, and time series.
+
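+ The first ingredient, reweighting the learned score by a user-defined constraint, can be
+sketched generically; the interfaces below (`score_model`, `constraint_log_prob`) and the
+guidance scale are assumptions rather than the paper's neuro-symbolic encoding.
+
+ import torch
+
+ def guided_score(score_model, constraint_log_prob, x, t, guidance_scale=1.0):
+     """Unconditional score plus the gradient of a soft-constraint log-likelihood,
+     i.e. sampling from p(x) * exp(c(x)) up to normalization."""
+     x = x.detach().requires_grad_(True)
+     with torch.no_grad():
+         s = score_model(x, t)                      # learned grad_x log p_t(x)
+     c = constraint_log_prob(x).sum()
+     grad_c = torch.autograd.grad(c, x)[0]
+     return s + guidance_scale * grad_c
+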
+
+
+
+
+ + ☆ SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded + Objects + + +
+ To enable meaningful robotic manipulation of objects in the real-world, 6D +pose estimation is one of the critical aspects. Most existing approaches have +difficulties to extend predictions to scenarios where novel object instances +are continuously introduced, especially with heavy occlusions. In this work, we +propose a few-shot pose estimation (FSPE) approach called SA6D, which uses a +self-adaptive segmentation module to identify the novel target object and +construct a point cloud model of the target object using only a small number of +cluttered reference images. Unlike existing methods, SA6D does not require +object-centric reference images or any additional object information, making it +a more generalizable and scalable solution across categories. We evaluate SA6D +on real-world tabletop object datasets and demonstrate that SA6D outperforms +existing FSPE methods, particularly in cluttered scenes with occlusions, while +requiring fewer reference images. + +
+
+
+
+
+ + ☆ Curvature-based Pooling within Graph Neural Networks ECML + + +
+ Over-squashing and over-smoothing are two critical issues, that limit the +capabilities of graph neural networks (GNNs). While over-smoothing eliminates +the differences between nodes making them indistinguishable, over-squashing +refers to the inability of GNNs to propagate information over long distances, +as exponentially many node states are squashed into fixed-size representations. +Both phenomena share similar causes, as both are largely induced by the graph +topology. To mitigate these problems in graph classification tasks, we propose +CurvPool, a novel pooling method. CurvPool exploits the notion of curvature of +a graph to adaptively identify structures responsible for both over-smoothing +and over-squashing. By clustering nodes based on the Balanced Forman curvature, +CurvPool constructs a graph with a more suitable structure, allowing deeper +models and the combination of distant information. We compare it to other +state-of-the-art pooling approaches and establish its competitiveness in terms +of classification accuracy, computational complexity, and flexibility. CurvPool +outperforms several comparable methods across all considered tasks. The most +consistent results are achieved by pooling densely connected clusters using the +sum aggregation, as this allows additional information about the size of each +pool. + +
+
+ comment: ECMLPKDD 2023 - Workshop on Mining and Learning with Graphs +
+
+
+
+
+ + ☆ In-class Data Analysis Replications: Teaching Students while Testing + Science + + +
+ Science is facing a reproducibility crisis. Previous work has proposed +incorporating data analysis replications into classrooms as a potential +solution. However, despite the potential benefits, it is unclear whether this +approach is feasible, and if so, what the involved stakeholders-students, +educators, and scientists-should expect from it. Can students perform a data +analysis replication over the course of a class? What are the costs and +benefits for educators? And how can this solution help benchmark and improve +the state of science? + In the present study, we incorporated data analysis replications in the +project component of the Applied Data Analysis course (CS-401) taught at EPFL +(N=354 students). Here we report pre-registered findings based on surveys +administered throughout the course. First, we demonstrate that students can +replicate previously published scientific papers, most of them qualitatively +and some exactly. We find discrepancies between what students expect of data +analysis replications and what they experience by doing them along with changes +in expectations about reproducibility, which together serve as evidence of +attitude shifts to foster students' critical thinking. Second, we provide +information for educators about how much overhead is needed to incorporate +replications into the classroom and identify concerns that replications bring +as compared to more traditional assignments. Third, we identify tangible +benefits of the in-class data analysis replications for scientific communities, +such as a collection of replication reports and insights about replication +barriers in scientific work that should be avoided going forward. + Overall, we demonstrate that incorporating replication tasks into a large +data science class can increase the reproducibility of scientific work as a +by-product of data science instruction, thus benefiting both science and +students. + +
+
+
+
+
+ + ☆ Latent Painter + + +
+ Latent diffusers have revolutionized generative AI and inspired creative art.
+When denoising the latent, the predicted original images at successive steps
+collectively animate the formation. However, the animation is limited by the
+denoising nature of the diffuser, and only renders a sharpening process. This
+work presents Latent Painter, which uses the latent as the canvas, and the
+diffuser predictions as the plan, to generate painting animation. Latent
+Painter can also transition one generated image into another, which can happen
+between images from two different sets of checkpoints.
+
+
+
+
+
+ + ☆ Test-Time Adaptation for Point Cloud Upsampling Using Meta-Learning + + +
+ Affordable 3D scanners often produce sparse and non-uniform point clouds that
+negatively impact downstream applications in robotic systems. While existing
+point cloud upsampling architectures have demonstrated promising results on
+standard benchmarks, they tend to experience significant performance drops when
+the test data have different distributions from the training data. To address
+this issue, this paper proposes a test-time adaptation approach to enhance the
+generality of point cloud upsampling models. The proposed approach leverages
+meta-learning to explicitly learn network parameters for test-time adaptation.
+Our method does not require any prior information about the test data. During
+meta-training, the model parameters are learned from a collection of
+instance-level tasks, each of which consists of a sparse-dense pair of point
+clouds from the training data. During meta-testing, the trained model is
+fine-tuned with a few gradient updates to produce a unique set of network
+parameters for each test instance. The updated model is then used for the final
+prediction. Our framework is generic and can be applied in a plug-and-play
+manner with existing backbone networks in point cloud upsampling. Extensive
+experiments demonstrate that our approach improves the performance of
+state-of-the-art models.
+
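+ The meta-testing step amounts to a short per-instance fine-tuning loop before prediction;
+the sketch below assumes generic `meta_model` and `self_sup_loss` callables and an SGD
+inner loop, which are illustrative choices rather than the paper's exact procedure.
+
+ import copy
+ import torch
+
+ def adapt_and_predict(meta_model, sparse_points, self_sup_loss, steps=5, lr=1e-3):
+     """Clone the meta-trained upsampler, fine-tune it on a self-supervised loss
+     computed from this single test instance, then run the final prediction."""
+     model = copy.deepcopy(meta_model)              # one unique set of weights per instance
+     opt = torch.optim.SGD(model.parameters(), lr=lr)
+     for _ in range(steps):
+         loss = self_sup_loss(model, sparse_points)
+         opt.zero_grad()
+         loss.backward()
+         opt.step()
+     model.eval()
+     with torch.no_grad():
+         return model(sparse_points)
+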
+
+
+
+
+ + ☆ Echocardiographic View Classification with Integrated + Out-of-Distribution Detection for Enhanced Automatic Echocardiographic + Analysis + + +
+ In the rapidly evolving field of automatic echocardiographic analysis and +interpretation, automatic view classification is a critical yet challenging +task, owing to the inherent complexity and variability of echocardiographic +data. This study presents ECHOcardiography VIew Classification with +Out-of-Distribution dEtection (ECHO-VICODE), a novel deep learning-based +framework that effectively addresses this challenge by training to classify 31 +classes, surpassing previous studies and demonstrating its capacity to handle a +wide range of echocardiographic views. Furthermore, ECHO-VICODE incorporates an +integrated out-of-distribution (OOD) detection function, leveraging the +relative Mahalanobis distance to effectively identify 'near-OOD' instances +commonly encountered in echocardiographic data. Through extensive +experimentation, we demonstrated the outstanding performance of ECHO-VICODE in +terms of view classification and OOD detection, significantly reducing the +potential for errors in echocardiographic analyses. This pioneering study +significantly advances the domain of automated echocardiography analysis and +exhibits promising prospects for substantial applications in extensive clinical +research and practice. + +
+
+
+
+
+ + ☆ Point-TTA: Test-Time Adaptation for Point Cloud Registration Using + Multitask Meta-Auxiliary Learning + + +
+ We present Point-TTA, a novel test-time adaptation framework for point cloud +registration (PCR) that improves the generalization and the performance of +registration models. While learning-based approaches have achieved impressive +progress, generalization to unknown testing environments remains a major +challenge due to the variations in 3D scans. Existing methods typically train a +generic model and the same trained model is applied on each instance during +testing. This could be sub-optimal since it is difficult for the same model to +handle all the variations during testing. In this paper, we propose a test-time +adaptation approach for PCR. Our model can adapt to unseen distributions at +test-time without requiring any prior knowledge of the test data. Concretely, +we design three self-supervised auxiliary tasks that are optimized jointly with +the primary PCR task. Given a test instance, we adapt our model using these +auxiliary tasks and the updated model is used to perform the inference. During +training, our model is trained using a meta-auxiliary learning approach, such +that the adapted model via auxiliary tasks improves the accuracy of the primary +task. Experimental results demonstrate the effectiveness of our approach in +improving generalization of point cloud registration and outperforming other +state-of-the-art approaches. + +
+
+
+
+
+ + ☆ A Policy Adaptation Method for Implicit Multitask Reinforcement Learning + Problems + + +
+ In dynamic motion generation tasks, including contact and collisions, small +changes in policy parameters can lead to extremely different returns. For +example, in soccer, the ball can fly in completely different directions with a +similar heading motion by slightly changing the hitting position or the force +applied to the ball or when the friction of the ball varies. However, it is +difficult to imagine that completely different skills are needed for heading a +ball in different directions. In this study, we proposed a multitask +reinforcement learning algorithm for adapting a policy to implicit changes in +goals or environments in a single motion category with different reward +functions or physical parameters of the environment. We evaluated the proposed +method on the ball heading task using a monopod robot model. The results showed +that the proposed method can adapt to implicit changes in the goal positions or +the coefficients of restitution of the ball, whereas the standard domain +randomization approach cannot cope with different task settings. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ Domain-adaptive Message Passing Graph Neural Network + + +
+ Cross-network node classification (CNNC), which aims to classify nodes in a +label-deficient target network by transferring the knowledge from a source +network with abundant labels, draws increasing attention recently. To address +CNNC, we propose a domain-adaptive message passing graph neural network +(DM-GNN), which integrates graph neural network (GNN) with conditional +adversarial domain adaptation. DM-GNN is capable of learning informative +representations for node classification that are also transferrable across +networks. Firstly, a GNN encoder is constructed by dual feature extractors to +separate ego-embedding learning from neighbor-embedding learning so as to +jointly capture commonality and discrimination between connected nodes. +Secondly, a label propagation node classifier is proposed to refine each node's +label prediction by combining its own prediction and its neighbors' prediction. +In addition, a label-aware propagation scheme is devised for the labeled source +network to promote intra-class propagation while avoiding inter-class +propagation, thus yielding label-discriminative source embeddings. Thirdly, +conditional adversarial domain adaptation is performed to take the +neighborhood-refined class-label information into account during adversarial +domain adaptation, so that the class-conditional distributions across networks +can be better matched. Comparisons with eleven state-of-the-art methods +demonstrate the effectiveness of the proposed DM-GNN. + +
+
+
+
+
+ + ☆ Computing excited states of molecules using normalizing flows + + +
+ We present a new nonlinear variational framework for simultaneously computing +ground and excited states of quantum systems. Our approach is based on +approximating wavefunctions in the linear span of basis functions that are +augmented and optimized \emph{via} composition with normalizing flows. The +accuracy and efficiency of our approach are demonstrated in the calculations of +a large number of vibrational states of the triatomic H$_2$S molecule as well +as ground and several excited electronic states of prototypical one-electron +systems including the hydrogen atom, the molecular hydrogen ion, and a carbon +atom in a single-active-electron approximation. The results demonstrate +significant improvements in the accuracy of energy predictions and accelerated +basis-set convergence even when using normalizing flows with a small number of +parameters. The present approach can be also seen as the optimization of a set +of intrinsic coordinates that best capture the underlying physics within the +given basis set. + +
+
+
+
+
+ + ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained language models like ChatGPT have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks. Moreover, in bioinformatics, generating +functional programs poses additional notable challenges due to the amount of +domain knowledge, the need for complicated data operations, and intricate +functional dependencies between the operations. Here, we present BioCoder, a +benchmark developed to evaluate existing pre-trained models in generating +bioinformatics code. In relation to function-code generation, BioCoder covers +potential package dependencies, class declarations, and global variables. It +incorporates 1026 functions and 1243 methods in Python and Java from GitHub and +253 examples from the Rosalind Project. BioCoder incorporates a fuzz-testing +framework for evaluation, and we have applied it to evaluate many models +including InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, +InstructCodeT5+, and ChatGPT. Our detailed analysis of these models emphasizes +the importance of domain knowledge, pragmatic code generation, and contextual +understanding. Our dataset, benchmark, Docker images, and scripts required for +testing are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ☆ Least Squares Maximum and Weighted Generalization-Memorization Machines + + +
+ In this paper, we propose a new way of remembering by introducing a memory
+influence mechanism for the least squares support vector machine (LSSVM).
+Without changing the equation constraints of the original LSSVM, this mechanism
+allows an accurate partitioning of the training set without overfitting. The
+maximum memory impact model (MIMM) and the weighted impact memory model (WIMM)
+are then proposed. It is demonstrated that these models can be reduced to the
+LSSVM. Furthermore, we propose several different memory impact functions for
+the MIMM and WIMM. The experimental results show that our MIMM and WIMM have
+better generalization performance compared to the LSSVM and a significant
+advantage in time cost compared to other memory models.
+
+
+
+
+
+ + ☆ Adversarial Finetuning with Latent Representation Constraint to Mitigate + Accuracy-Robustness Tradeoff ICCV + + +
+ This paper addresses the tradeoff between standard accuracy on clean examples +and robustness against adversarial examples in deep neural networks (DNNs). +Although adversarial training (AT) improves robustness, it degrades the +standard accuracy, thus yielding the tradeoff. To mitigate this tradeoff, we +propose a novel AT method called ARREST, which comprises three components: (i) +adversarial finetuning (AFT), (ii) representation-guided knowledge distillation +(RGKD), and (iii) noisy replay (NR). AFT trains a DNN on adversarial examples +by initializing its parameters with a DNN that is standardly pretrained on +clean examples. RGKD and NR respectively entail a regularization term and an +algorithm to preserve latent representations of clean examples during AFT. RGKD +penalizes the distance between the representations of the standardly pretrained +and AFT DNNs. NR switches input adversarial examples to nonadversarial ones +when the representation changes significantly during AFT. By combining these +components, ARREST achieves both high standard accuracy and robustness. +Experimental results demonstrate that ARREST mitigates the tradeoff more +effectively than previous AT-based methods do. + +
+
+ comment: Accepted by International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Listen to Minority: Encrypted Traffic Classification for Class Imbalance + with Contrastive Pre-Training + + +
+ Mobile Internet has profoundly reshaped modern lifestyles in various aspects. +Encrypted Traffic Classification (ETC) naturally plays a crucial role in +managing mobile Internet, especially with the explosive growth of mobile apps +using encrypted communication. Despite some existing learning-based ETC methods +showing promising results, three-fold limitations still remain in real-world +network environments, 1) label bias caused by traffic class imbalance, 2) +traffic homogeneity caused by component sharing, and 3) training with reliance +on sufficient labeled traffic. None of the existing ETC methods can address all +these limitations. In this paper, we propose a novel Pre-trAining +Semi-Supervised ETC framework, dubbed PASS. Our key insight is to resample the +original train dataset and perform contrastive pre-training without using +individual app labels directly to avoid label bias issues caused by class +imbalance, while obtaining a robust feature representation to differentiate +overlapping homogeneous traffic by pulling positive traffic pairs closer and +pushing negative pairs away. Meanwhile, PASS designs a semi-supervised +optimization strategy based on pseudo-label iteration and dynamic loss +weighting algorithms in order to effectively utilize massive unlabeled traffic +data and alleviate manual train dataset annotation workload. PASS outperforms +state-of-the-art ETC methods and generic sampling approaches on four public +datasets with significant class imbalance and traffic homogeneity, remarkably +pushing the F1 of Cross-Platform215 with 1.31%, ISCX-17 with 9.12%. +Furthermore, we validate the generality of the contrastive pre-training and +pseudo-label iteration components of PASS, which can adaptively benefit ETC +methods with diverse feature extractors. + +
+
+ comment: Accepted by 2023 IEEE SECON, 9 pages, 6 figures +
+
+
+
+
+ + ☆ AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR + Prediction + + +
+ Click-through rate (CTR) prediction is a crucial issue in recommendation +systems. There has been an emergence of various public CTR datasets. However, +existing datasets primarily suffer from the following limitations. Firstly, +users generally click different types of items from multiple scenarios, and +modeling from multiple scenarios can provide a more comprehensive understanding +of users. Existing datasets only include data for the same type of items from a +single scenario. Secondly, multi-modal features are essential in multi-scenario +prediction as they address the issue of inconsistent ID encoding between +different scenarios. The existing datasets are based on ID features and lack +multi-modal features. Third, a large-scale dataset can provide a more reliable +evaluation of models, fully reflecting the performance differences between +models. The scale of existing datasets is around 100 million, which is +relatively small compared to the real-world CTR prediction. To address these +limitations, we propose AntM$^{2}$C, a Multi-Scenario Multi-Modal CTR dataset +based on industrial data from Alipay. Specifically, AntM$^{2}$C provides the +following advantages: 1) It covers CTR data of 5 different types of items, +providing insights into the preferences of users for different items, including +advertisements, vouchers, mini-programs, contents, and videos. 2) Apart from +ID-based features, AntM$^{2}$C also provides 2 multi-modal features, raw text +and image features, which can effectively establish connections between items +with different IDs. 3) AntM$^{2}$C provides 1 billion CTR data with 200 +features, including 200 million users and 6 million items. It is currently the +largest-scale CTR dataset available. Based on AntM$^{2}$C, we construct several +typical CTR tasks and provide comparisons with baseline methods. The dataset +homepage is available at https://www.atecup.cn/home. + +
+
+
+
+
+ + ☆ On the Equivalence between Implicit and Explicit Neural Networks: A + High-dimensional Viewpoint ICML + 2023 + + +
+ Implicit neural networks have demonstrated remarkable success in various +tasks. However, there is a lack of theoretical analysis of the connections and +differences between implicit and explicit networks. In this paper, we study +high-dimensional implicit neural networks and provide the high dimensional +equivalents for the corresponding conjugate kernels and neural tangent kernels. +Built upon this, we establish the equivalence between implicit and explicit +networks in high dimensions. + +
+
+ comment: Accepted by Workshop on High-dimensional Learning Dynamics, ICML + 2023, Honolulu, Hawaii +
+
+
+
+
+ + ☆ DECODE: DilatEd COnvolutional neural network for Detecting + Extreme-mass-ratio inspirals + + +
+ The detection of Extreme Mass Ratio Inspirals (EMRIs) is intricate due to +their complex waveforms, extended duration, and low signal-to-noise ratio +(SNR), making them more challenging to be identified compared to compact binary +coalescences. While matched filtering-based techniques are known for their +computational demands, existing deep learning-based methods primarily handle +time-domain data and are often constrained by data duration and SNR. In +addition, most existing work ignores time-delay interferometry (TDI) and +applies the long-wavelength approximation in detector response calculations, +thus limiting their ability to handle laser frequency noise. In this study, we +introduce DECODE, an end-to-end model focusing on EMRI signal detection by +sequence modeling in the frequency domain. Centered around a dilated causal +convolutional neural network, trained on synthetic data considering TDI-1.5 +detector response, DECODE can efficiently process a year's worth of +multichannel TDI data with an SNR of around 50. We evaluate our model on 1-year +data with accumulated SNR ranging from 50 to 120 and achieve a true positive +rate of 96.3% at a false positive rate of 1%, keeping an inference time of less +than 0.01 seconds. With the visualization of three showcased EMRI signals for +interpretability and generalization, DECODE exhibits strong potential for +future space-based gravitational wave data analyses. + +
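+ The central building block named above, a dilated causal 1-D convolution stack, can be
+sketched in PyTorch; the channel count, depth, and activation are assumptions, and the
+sketch omits DECODE's detector-response preprocessing and classification head.
+
+ import torch, torch.nn as nn
+
+ class CausalConv1d(nn.Module):
+     """1-D convolution that is causal: the output at time t sees inputs only up to t."""
+     def __init__(self, channels, kernel_size=3, dilation=1):
+         super().__init__()
+         self.pad = (kernel_size - 1) * dilation
+         self.conv = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)
+
+     def forward(self, x):                           # x: (batch, channels, time)
+         x = nn.functional.pad(x, (self.pad, 0))     # pad only on the left
+         return torch.relu(self.conv(x))
+
+ class DilatedStack(nn.Module):
+     """Stack with exponentially growing dilation to cover long sequences cheaply."""
+     def __init__(self, channels=32, n_layers=8):
+         super().__init__()
+         self.layers = nn.Sequential(
+             *[CausalConv1d(channels, dilation=2 ** i) for i in range(n_layers)])
+
+     def forward(self, x):
+         return self.layers(x)
+
+ out = DilatedStack()(torch.randn(4, 32, 4096))      # (batch, channels, time)
+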
+
+
+
+
+ + ☆ CktGNN: Circuit Graph Neural Network for Electronic Design Automation ICLR + + +
+ The electronic design automation of analog circuits has been a longstanding +challenge in the integrated circuit field due to the huge design space and +complex design trade-offs among circuit specifications. In the past decades, +intensive research efforts have mostly been paid to automate the transistor +sizing with a given circuit topology. By recognizing the graph nature of +circuits, this paper presents a Circuit Graph Neural Network (CktGNN) that +simultaneously automates the circuit topology generation and device sizing +based on the encoder-dependent optimization subroutines. Particularly, CktGNN +encodes circuit graphs using a two-level GNN framework (of nested GNN) where +circuits are represented as combinations of subgraphs in a known subgraph +basis. In this way, it significantly improves design efficiency by reducing the +number of subgraphs to perform message passing. Nonetheless, another critical +roadblock to advancing learning-assisted circuit design automation is a lack of +public benchmarks to perform canonical assessment and reproducible research. To +tackle the challenge, we introduce Open Circuit Benchmark (OCB), an +open-sourced dataset that contains $10$K distinct operational amplifiers with +carefully-extracted circuit specifications. OCB is also equipped with +communicative circuit generation and evaluation capabilities such that it can +help to generalize CktGNN to design various analog circuits by producing +corresponding datasets. Experiments on OCB show the extraordinary advantages of +CktGNN through representation-based optimization frameworks over other recent +powerful GNN baselines and human experts' manual designs. Our work paves the +way toward a learning-based open-sourced design automation for analog circuits. +Our source code is available at \url{https://github.com/zehao-dong/CktGNN}. + +
+
+ comment: Accepted by ICLR (International Conference on Learning + Representations) 2023 +
+
+
+
+
+ + ☆ Balancing between the Local and Global Structures (LGS) in Graph + Embedding + + +
+ We present a method for balancing between the Local and Global Structures
+(LGS) in graph embedding, via a tunable parameter. Some embedding methods aim
+to capture global structures, while others attempt to preserve local
+neighborhoods. Few methods attempt to do both, and it is not always possible to
+capture both local and global information well in two dimensions, where most
+graph drawings live. The choice of using a local or a global embedding for
+visualization depends not only on the task but also on the structure of the
+underlying data, which may not be known in advance. For a given graph, LGS aims
+to find a good balance between the local and global structure to preserve. We
+evaluate the performance of LGS with synthetic and real-world datasets, and our
+results indicate that it is competitive with the state-of-the-art methods,
+using established quality metrics such as stress and neighborhood preservation.
+We introduce a novel quality metric, cluster distance preservation, to assess
+intermediate structure capture. All source code, datasets, experiments, and
+analyses are available online.
+
+
+ comment: Appears in the Proceedings of the 31st International Symposium on + Graph Drawing and Network Visualization (GD 2023) +
+
+
+
+
+ + ☆ Improving Robustness and Accuracy of Ponzi Scheme Detection on Ethereum + Using Time-Dependent Features + + +
+ The rapid development of blockchain has led to more and more funding pouring +into the cryptocurrency market, which also attracted cybercriminals' interest +in recent years. The Ponzi scheme, an old-fashioned fraud, is now popular on +the blockchain, causing considerable financial losses to many crypto-investors. +A few Ponzi detection methods have been proposed in the literature, most of +which detect a Ponzi scheme based on its smart contract source code or opcode. +The contract-code-based approach, while achieving very high accuracy, is not +robust: first, the source codes of a majority of contracts on Ethereum are not +available, and second, a Ponzi developer can fool a contract-code-based +detection model by obfuscating the opcode or inventing a new profit +distribution logic that cannot be detected (since these models were trained on +existing Ponzi logics only). A transaction-based approach could improve the +robustness of detection because transactions, unlike smart contracts, are +harder to be manipulated. However, the current transaction-based detection +models achieve fairly low accuracy. We address this gap in the literature by +developing new detection models that rely only on the transactions, hence +guaranteeing the robustness, and moreover, achieve considerably higher +Accuracy, Precision, Recall, and F1-score than existing transaction-based +models. This is made possible thanks to the introduction of novel +time-dependent features that capture Ponzi behaviours characteristics derived +from our comprehensive data analyses on Ponzi and non-Ponzi data from the +XBlock-ETH repository + +
+
+ comment: 17 pages, 9 figures, 4 tables +
+
+
+
+
+ + ☆ BenchTemp: A General Benchmark for Evaluating Temporal Graph Neural + Networks + + +
+ To handle graphs in which features or connectivities evolve over time, a
+series of temporal graph neural networks (TGNNs) have been proposed. Despite
+the success of these TGNNs, previous TGNN evaluations reveal several
+limitations regarding four critical issues: 1) inconsistent datasets, 2)
+inconsistent evaluation pipelines, 3) lacking workload diversity, and 4)
+lacking efficient comparison. Overall, an empirical study that puts TGNN models
+on the same ground and compares them comprehensively is still missing. To this
+end, we propose BenchTemp, a general benchmark for evaluating TGNN models on
+various workloads. BenchTemp provides a set of benchmark datasets so that
+different TGNN models can be fairly compared. Further, BenchTemp engineers a
+standard pipeline that unifies the TGNN evaluation. With BenchTemp, we
+extensively compare the representative TGNN models on different tasks (e.g.,
+link prediction and node classification) and settings (transductive and
+inductive), w.r.t. both effectiveness and efficiency metrics. We have made
+BenchTemp publicly available at https://github.com/qianghuangwhu/benchtemp.
+
+
+ comment: 28 pages, 23 figures, 27 tables. Submitted to the Conference on + Neural Information Processing Systems 2023 Track on Datasets and Benchmarks +
+
+
+
+
+ + ☆ Multi-Objective Decision Transformers for Offline Reinforcement Learning + + +
+ Offline Reinforcement Learning (RL) is structured to derive policies from +static trajectory data without requiring real-time environment interactions. +Recent studies have shown the feasibility of framing offline RL as a sequence +modeling task, where the sole aim is to predict actions based on prior context +using the transformer architecture. However, the limitation of this single task +learning approach is its potential to undermine the transformer model's +attention mechanism, which should ideally allocate varying attention weights +across different tokens in the input context for optimal prediction. To address +this, we reformulate offline RL as a multi-objective optimization problem, +where the prediction is extended to states and returns. We also highlight a +potential flaw in the trajectory representation used for sequence modeling, +which could generate inaccuracies when modeling the state and return +distributions. This is due to the non-smoothness of the action distribution +within the trajectory dictated by the behavioral policy. To mitigate this +issue, we introduce action space regions to the trajectory representation. Our +experiments on D4RL benchmark locomotion tasks reveal that our propositions +allow for more effective utilization of the attention mechanism in the +transformer model, resulting in performance that either matches or outperforms +current state-of-the art methods. + +
+
+
+
+
+ + ☆ A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and + Applications + + +
+ Graph Neural Networks (GNNs) have gained significant attention owing to their +ability to handle graph-structured data and the improvement in practical +applications. However, many of these models prioritize high utility +performance, such as accuracy, with a lack of privacy consideration, which is a +major concern in modern society where privacy attacks are rampant. To address +this issue, researchers have started to develop privacy-preserving GNNs. +Despite this progress, there is a lack of a comprehensive overview of the +attacks and the techniques for preserving privacy in the graph domain. In this +survey, we aim to address this gap by summarizing the attacks on graph data +according to the targeted information, categorizing the privacy preservation +techniques in GNNs, and reviewing the datasets and applications that could be +used for analyzing/solving privacy issues in GNNs. We also outline potential +directions for future research in order to build better privacy-preserving +GNNs. + +
+
+
+
+
+ + ☆ SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked + Prefills + + +
+ Large Language Model (LLM) inference consists of two distinct phases - +prefill phase which processes the input prompt and decode phase which generates +output tokens autoregressively. While the prefill phase effectively saturates +GPU compute at small batch sizes, the decode phase results in low compute +utilization as it generates one token at a time per request. The varying +prefill and decode times also lead to imbalance across micro-batches when using +pipeline parallelism, resulting in further inefficiency due to bubbles. + We present SARATHI to address these challenges. SARATHI employs +chunked-prefills, which splits a prefill request into equal sized chunks, and +decode-maximal batching, which constructs a batch using a single prefill chunk +and populates the remaining slots with decodes. During inference, the prefill +chunk saturates GPU compute, while the decode requests 'piggyback' and cost up +to an order of magnitude less compared to a decode-only batch. Chunked-prefills +allows constructing multiple decode-maximal batches from a single prefill +request, maximizing coverage of decodes that can piggyback. Furthermore, the +uniform compute design of these batches ameliorates the imbalance between +micro-batches, significantly reducing pipeline bubbles. + Our techniques yield significant improvements in inference performance across +models and hardware. For the LLaMA-13B model on A6000 GPU, SARATHI improves +decode throughput by up to 10x, and accelerates end-to-end throughput by up to +1.33x. For LLaMa-33B on A100 GPU, we achieve 1.25x higher end-to-end-throughput +and up to 4.25x higher decode throughput. When used with pipeline parallelism +on GPT-3, SARATHI reduces bubbles by 6.29x, resulting in an end-to-end +throughput improvement of 1.91x. + +
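+ The batching policy can be sketched at the scheduling level, as a simplification that
+ignores KV-cache management and pipeline parallelism; the dictionary-based batch format
+and function name below are assumptions, not SARATHI's implementation.
+
+ def build_decode_maximal_batches(prompt_tokens, decode_queue, chunk_size, batch_size):
+     """Split one prefill request into fixed-size chunks and pair each chunk with as
+     many pending decode requests as fit into the batch."""
+     chunks = [prompt_tokens[i:i + chunk_size]
+               for i in range(0, len(prompt_tokens), chunk_size)]
+     batches = []
+     for chunk in chunks:
+         decodes = [decode_queue.pop(0)
+                    for _ in range(min(batch_size - 1, len(decode_queue)))]
+         batches.append({"prefill_chunk": chunk, "piggyback_decodes": decodes})
+     return batches
+
+ batches = build_decode_maximal_batches(list(range(2048)), decode_queue=list(range(40)),
+                                        chunk_size=512, batch_size=8)
+ # 4 prefill chunks, each batched with up to 7 piggybacked decode requests
+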
+
+
+
+
+ + ☆ RepCodec: A Speech Representation Codec for Speech Tokenization + + +
+ With the recent rapid growth of large language models (LLMs), discrete speech
+tokenization has played an important role in injecting speech into LLMs.
+However, this discretization gives rise to a loss of information, consequently
+impairing overall performance. To improve the performance of these discrete
+speech tokens, we present RepCodec, a novel speech representation codec for
+semantic speech tokenization. In contrast to audio codecs, which reconstruct
+the raw audio, RepCodec learns a vector quantization codebook through
+reconstructing speech representations from speech encoders like HuBERT or
+data2vec. Together, the speech encoder, the codec encoder and the vector
+quantization codebook form a pipeline for converting speech waveforms into
+semantic tokens. Extensive experiments illustrate that RepCodec, by virtue of
+its enhanced information retention capacity, significantly outperforms the
+widely used k-means clustering approach in both speech understanding and
+generation. Furthermore, this superiority extends across various speech
+encoders and languages, affirming the robustness of RepCodec. We believe our
+method can facilitate large language modeling research on speech processing.
+
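+ The quantization step at the heart of such a codec can be sketched as a
+standard nearest-codebook lookup with a straight-through gradient. This is a
+generic VQ sketch under assumed shapes and loss weights, not RepCodec's actual
+training code, which additionally reconstructs the encoder representations.
+```python
+import torch
+import torch.nn.functional as F
+
+def vector_quantize(reps, codebook):
+    """Map continuous speech representations to discrete semantic tokens.
+
+    reps:     (batch, time, dim) features from a speech encoder (HuBERT-like)
+    codebook: (num_codes, dim) learned codebook
+    """
+    flat = reps.reshape(-1, reps.shape[-1])           # (B*T, D)
+    ids = torch.cdist(flat, codebook).argmin(dim=-1)  # nearest code per frame
+    quantized = codebook[ids].reshape(reps.shape)
+    # Straight-through estimator so the encoder still receives gradients.
+    quantized_st = reps + (quantized - reps).detach()
+    # Codebook + commitment losses as in standard VQ training (0.25 is an assumption).
+    vq_loss = (F.mse_loss(quantized, reps.detach())
+               + 0.25 * F.mse_loss(reps, quantized.detach()))
+    return ids.reshape(reps.shape[:-1]), quantized_st, vq_loss
+```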
+
+
+
+
+ + ☆ Information Fusion for Assistance Systems in Production Assessment + + +
+ We propose a novel methodology to define assistance systems that rely on
+information fusion to combine different sources of information while providing
+an assessment. The main contribution of this paper is providing a general
+framework for the fusion of n information sources using evidence theory. The
+fusion provides a more robust prediction and an associated uncertainty that can
+be used to assess the likelihood of the prediction. Moreover, we provide a
+methodology for the information fusion of two primary sources: an ensemble
+classifier based on machine data and an expert-centered model. We demonstrate
+the information fusion approach using data from an industrial setup, which
+rounds out the application part of this research. Furthermore, we address the
+problem of data drift by proposing a methodology to update the data-based
+models using an evidence theory approach. We validate the approach using the
+Tennessee Eastman benchmark while performing an ablation study of the model
+update parameters.
+
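+ The two-source fusion step can be illustrated with Dempster's rule of
+combination from evidence theory, shown below for a classifier and an expert
+assessing a machine state. The frames, masses, and variable names are made-up
+examples, not the paper's actual models.
+```python
+from itertools import product
+
+def dempster_combine(m1, m2):
+    """Combine two mass functions (dicts frozenset -> mass) with Dempster's rule."""
+    combined, conflict = {}, 0.0
+    for (a, ma), (b, mb) in product(m1.items(), m2.items()):
+        inter = a & b
+        if inter:
+            combined[inter] = combined.get(inter, 0.0) + ma * mb
+        else:
+            conflict += ma * mb          # mass assigned to contradictory hypotheses
+    if conflict >= 1.0:
+        raise ValueError("totally conflicting sources")
+    return {k: v / (1.0 - conflict) for k, v in combined.items()}
+
+# Hypothetical example: ensemble classifier vs. expert model on {ok, fault}.
+classifier = {frozenset({"ok"}): 0.7, frozenset({"ok", "fault"}): 0.3}
+expert = {frozenset({"ok"}): 0.5, frozenset({"fault"}): 0.2,
+          frozenset({"ok", "fault"}): 0.3}
+print(dempster_combine(classifier, expert))
+```
+ The residual mass left on the full frame {ok, fault} is what provides the
+fused prediction's associated uncertainty.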
+
+ comment: 21 Pages, 10 Figures +
+
+
+
+
+ + ☆ TurboGP: A flexible and advanced python based GP library + + +
+ We introduce TurboGP, a Genetic Programming (GP) library fully written in
+Python and specifically designed for machine learning tasks. TurboGP implements
+modern features not available in other GP implementations, such as island and
+cellular population schemes, different types of genetic operations (migration,
+protected crossovers), and online learning, among other features. TurboGP's
+most distinctive characteristic is its native support for different types of GP
+nodes to allow different abstraction levels; this makes TurboGP particularly
+useful for processing a wide variety of data sources.
+
+
+
+
+
+ + ☆ Multi Agent DeepRL based Joint Power and Subchannel Allocation in IAB + networks CCS + + +
+ Integrated Access and Backhauling (IAB) is a viable approach for meeting the +unprecedented need for higher data rates of future generations, acting as a +cost-effective alternative to dense fiber-wired links. The design of such +networks with constraints usually results in an optimization problem of +non-convex and combinatorial nature. Under those situations, it is challenging +to obtain an optimal strategy for the joint Subchannel Allocation and Power +Allocation (SAPA) problem. In this paper, we develop a multi-agent Deep +Reinforcement Learning (DeepRL) based framework for joint optimization of power +and subchannel allocation in an IAB network to maximize the downlink data rate. +SAPA using DDQN (Double Deep Q-Learning Network) can handle computationally +expensive problems with huge action spaces associated with multiple users and +nodes. Unlike the conventional methods such as game theory, fractional +programming, and convex optimization, which in practice demand more and more +accurate network information, the multi-agent DeepRL approach requires less +environment network information. Simulation results show the proposed scheme's +promising performance when compared with baseline (Deep Q-Learning Network and +Random) schemes. + +
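+ The core update behind such a DDQN agent is the double-Q target, in which the
+online network selects the next action and the target network evaluates it.
+The sketch below is a generic Double DQN fragment; the network interfaces and
+the per-node encoding of joint power/subchannel actions are assumptions.
+```python
+import torch
+
+def ddqn_targets(rewards, next_states, dones, q_online, q_target, gamma=0.99):
+    """Double DQN target: the online net picks the action, the target net scores it."""
+    with torch.no_grad():
+        best = q_online(next_states).argmax(dim=1, keepdim=True)      # action selection
+        next_q = q_target(next_states).gather(1, best).squeeze(1)     # action evaluation
+        return rewards + gamma * (1.0 - dones) * next_q
+
+def td_loss(states, actions, targets, q_online):
+    q_sa = q_online(states).gather(1, actions.unsqueeze(1)).squeeze(1)
+    return torch.nn.functional.smooth_l1_loss(q_sa, targets)
+```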
+
+ comment: 7 pages, 6 figures, Accepted at the European Conference on + Communication Systems (ECCS) 2023 +
+
+
+
+
+ + ☆ Improving vision-inspired keyword spotting using dynamic module skipping + in streaming conformer encoder + + +
+ Using a vision-inspired keyword spotting framework, we propose an +architecture with input-dependent dynamic depth capable of processing streaming +audio. Specifically, we extend a conformer encoder with trainable binary gates +that allow us to dynamically skip network modules according to the input audio. +Our approach improves detection and localization accuracy on continuous speech +using Librispeech top-1000 most frequent words while maintaining a small memory +footprint. The inclusion of gates also reduces the average amount of processing +without affecting the overall performance. These benefits are shown to be even +more pronounced using the Google speech commands dataset placed over background +noise where up to 97% of the processing is skipped on non-speech inputs, +therefore making our method particularly interesting for an always-on keyword +spotter. + +
+
+
+
+
+ + ☆ Predicting Financial Market Trends using Time Series Analysis and + Natural Language Processing + + +
+ Forecasting financial market trends through time series analysis and natural
+language processing poses a complex and demanding undertaking, owing to the
+numerous variables that can influence stock prices. These variables encompass a
+spectrum of economic and political occurrences, as well as prevailing public
+attitudes. Recent research has indicated that the expression of public
+sentiments on social media platforms such as Twitter may have a noteworthy
+impact on the determination of stock prices. The objective of this study was to
+assess the viability of Twitter sentiments as a tool for predicting stock
+prices of major corporations such as Tesla and Apple. Our study has revealed a
+robust association between the emotions conveyed in tweets and fluctuations in
+stock prices. Our findings indicate that positivity, negativity, and
+subjectivity are the primary determinants of fluctuations in stock prices. The
+data were analyzed using the Long Short-Term Memory (LSTM) neural network
+model, which is currently recognized as a leading methodology for predicting
+stock prices by incorporating Twitter sentiments and historical stock price
+data. The models utilized in our study demonstrated a high degree of
+reliability and yielded precise outcomes for the designated corporations. In
+summary, this research emphasizes the significance of incorporating public
+opinions into the prediction of stock prices. The application of Time Series
+Analysis and Natural Language Processing methodologies can yield significant
+scientific findings regarding financial market patterns, thereby facilitating
+informed decision-making among investors. The results of our study indicate
+that the utilization of Twitter sentiments can serve as a potent instrument for
+forecasting stock prices, and ought to be factored in when formulating
+investment strategies.
+
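+ A bare-bones version of the modeling setup described above, an LSTM regressor
+over windows of past prices and daily sentiment features, is sketched below.
+The feature layout, window length, and network sizes are assumptions for
+illustration, not the study's configuration.
+```python
+import torch
+import torch.nn as nn
+
+class SentimentAugmentedLSTM(nn.Module):
+    """Predict the next closing price from a window of past prices and
+    daily tweet-sentiment features (positivity, negativity, subjectivity)."""
+    def __init__(self, n_features=4, hidden=64):
+        super().__init__()
+        self.lstm = nn.LSTM(n_features, hidden, num_layers=2, batch_first=True)
+        self.head = nn.Linear(hidden, 1)
+
+    def forward(self, x):             # x: (batch, window, n_features)
+        out, _ = self.lstm(x)
+        return self.head(out[:, -1])  # regression on the last hidden state
+
+# Hypothetical usage: 30-day windows of [price, positivity, negativity, subjectivity].
+model = SentimentAugmentedLSTM()
+window = torch.randn(8, 30, 4)
+print(model(window).shape)            # torch.Size([8, 1])
+```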
+
+
+
+
+ + ☆ FTA: Stealthy and Robust Backdoor Attack with Flexible Trigger on + Federated Learning + + +
+ Current backdoor attacks against federated learning (FL) strongly rely on
+universal triggers or semantic patterns, which can be easily detected and
+filtered by certain defense mechanisms such as norm clipping and comparing
+parameter divergences among local updates. In this work, we propose a new
+stealthy and robust backdoor attack with flexible triggers against FL defenses.
+To achieve this, we build a generative trigger function that can learn to
+manipulate the benign samples with an imperceptible flexible trigger pattern
+and simultaneously make the trigger pattern include the most significant hidden
+features of the attacker-chosen label. Moreover, our trigger generator can keep
+learning and adapt across different rounds, allowing it to adjust to changes in
+the global model. By filling the distinguishable difference (the mapping
+between the trigger pattern and target label), we make our attack naturally
+stealthy. Extensive experiments on real-world datasets verify the effectiveness
+and stealthiness of our attack compared to prior attacks on a decentralized
+learning framework with eight well-studied defenses.
+
+
+
+
+
+ + ☆ Differentially Private Functional Summaries via the Independent + Component Laplace Process + + +
+ In this work, we propose a new mechanism for releasing differentially private
+functional summaries called the Independent Component Laplace Process, or ICLP,
+mechanism. By treating the functional summaries of interest as truly
+infinite-dimensional objects and perturbing them with the ICLP noise, this new
+mechanism relaxes assumptions on data trajectories and preserves higher utility
+compared to classical finite-dimensional subspace embedding approaches in the
+literature. We establish the feasibility of the proposed mechanism in multiple
+function spaces. Several statistical estimation problems are considered, and we
+demonstrate that, by slightly over-smoothing the summary, the privacy cost does
+not dominate the statistical error and is asymptotically negligible. Numerical
+experiments on synthetic and real datasets demonstrate the efficacy of the
+proposed mechanism.
+
+
+
+
+
+ + ☆ Deep Semi-Supervised Anomaly Detection for Finding Fraud in the Futures + Market + + +
+ Modern financial electronic exchanges are an exciting and fast-paced
+marketplace where billions of dollars change hands every day. They are also
+rife with manipulation and fraud. Detecting such activity is a major
+undertaking, which has historically been a job reserved exclusively for humans.
+Recently, more research and resources have been focused on automating these
+processes via machine learning and artificial intelligence. Fraud detection is
+overwhelmingly associated with the greater field of anomaly detection, which is
+usually performed via unsupervised learning techniques because of the lack of
+labeled data needed for supervised learning. However, a small quantity of
+labeled data does often exist. This research article aims to evaluate the
+efficacy of a deep semi-supervised anomaly detection technique, called Deep
+SAD, for detecting fraud in high-frequency financial data. We use exclusive
+proprietary limit order book data from the TMX exchange in Montréal, with a
+small set of true labeled instances of fraud, to evaluate Deep SAD against its
+unsupervised predecessor. We show that incorporating a small amount of labeled
+data into an unsupervised anomaly detection framework can greatly improve its
+accuracy.
+
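+ A rough sketch of the semi-supervised objective behind Deep SAD: unlabeled
+points are pulled toward a hypersphere center, while points labeled as fraud
+are pushed away via the inverse squared distance. This simplifies the published
+loss (labeled normals are treated like unlabeled points here and weight decay
+is omitted), so treat it as an assumption-laden illustration rather than the
+method itself.
+```python
+import torch
+
+def deep_sad_style_loss(embeddings, labels, center, eta=1.0, eps=1e-6):
+    """embeddings: (N, D) encoder outputs; labels: 0 unlabeled, +1 normal, -1 fraud;
+    center: (D,) hypersphere center fixed before training."""
+    dist2 = torch.sum((embeddings - center) ** 2, dim=1)
+    inliers = labels >= 0                       # unlabeled or labeled-normal
+    loss_in = dist2[inliers]                    # pull toward the center
+    loss_out = eta / (dist2[~inliers] + eps)    # push labeled fraud away
+    return torch.cat([loss_in, loss_out]).mean()
+```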
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ RePo: Resilient Model-Based Reinforcement Learning by Regularizing + Posterior Predictability + + +
+ Visual model-based RL methods typically encode image observations into
+low-dimensional representations in a manner that does not eliminate redundant
+information. This leaves them susceptible to spurious variations -- changes in
+task-irrelevant components such as background distractors or lighting
+conditions. In this paper, we propose a visual model-based RL method that
+learns a latent representation resilient to such spurious variations. Our
+training objective encourages the representation to be maximally predictive of
+dynamics and reward, while constraining the information flow from the
+observation to the latent representation. We demonstrate that this objective
+significantly bolsters the resilience of visual model-based RL methods to
+visual distractors, allowing them to operate in dynamic environments. We then
+show that while the learned encoder is resilient to spurious variations, it is
+not invariant under significant distribution shift. To address this, we propose
+a simple reward-free alignment procedure that enables test-time adaptation of
+the encoder. This allows for quick adaptation to widely differing environments
+without having to relearn the dynamics and policy. Our effort is a step towards
+making model-based RL a practical and useful tool for dynamic, diverse domains.
+We show its effectiveness in simulation benchmarks with significant spurious
+variations as well as a real-world egocentric navigation task with noisy TVs in
+the background. Videos and code at https://zchuning.github.io/repo-website/.
+
+
+
+
+
+ + ☆ On the Implicit Bias of Adam + + +
+ In previous literature, backward error analysis was used to find ordinary +differential equations (ODEs) approximating the gradient descent trajectory. It +was found that finite step sizes implicitly regularize solutions because terms +appearing in the ODEs penalize the two-norm of the loss gradients. We prove +that the existence of similar implicit regularization in RMSProp and Adam +depends on their hyperparameters and the training stage, but with a different +"norm" involved: the corresponding ODE terms either penalize the (perturbed) +one-norm of the loss gradients or, on the contrary, hinder its decrease (the +latter case being typical). We also conduct numerical experiments and discuss +how the proven facts can influence generalization. + +
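+ For reference, the gradient-descent result this builds on can be written as
+the modified (backward-error) flow below, in which a finite step size h
+implicitly penalizes the squared two-norm of the loss gradient; the
+corresponding perturbed one-norm terms for RMSProp and Adam are derived in the
+paper itself, so this display is background context, not a restatement of the
+new result.
+```latex
+\dot{\theta} \;=\; -\,\nabla_{\theta}\!\left( L(\theta)
+    \;+\; \frac{h}{4}\,\bigl\lVert \nabla_{\theta} L(\theta) \bigr\rVert_2^{2} \right)
+```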
+
+
+
+
+ + ☆ YaRN: Efficient Context Window Extension of Large Language Models + + +
+ Rotary Position Embeddings (RoPE) have been shown to effectively encode
+positional information in transformer-based language models. However, these
+models fail to generalize past the sequence length they were trained on. We
+present YaRN (Yet another RoPE extensioN method), a compute-efficient method to
+extend the context window of such models, requiring 10x fewer tokens and 2.5x
+fewer training steps than previous methods. Using YaRN, we show that LLaMA
+models can effectively utilize and extrapolate to context lengths much longer
+than their original pre-training would allow, while also surpassing the
+previous state-of-the-art in context window extension. In addition, we
+demonstrate that YaRN exhibits the capability to extrapolate beyond the limited
+context of a fine-tuning dataset. We publish the checkpoints of Llama 2 7B/13B
+fine-tuned using YaRN with 64k and 128k context windows at
+https://github.com/jquesnelle/yarn
+
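+ For orientation, the snippet below shows plain RoPE angles with naive
+position interpolation (positions divided by a scale factor), which is the
+baseline that YaRN improves upon; YaRN itself interpolates different frequency
+bands differently and rescales attention, so this is only a hedged sketch of
+the surrounding machinery, not of YaRN.
+```python
+import torch
+
+def rope_angles(seq_len, head_dim, base=10000.0, scale=1.0):
+    """Rotary embedding angles; scale > 1 compresses positions into the
+    trained range (naive position interpolation)."""
+    inv_freq = base ** (-torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)
+    positions = torch.arange(seq_len, dtype=torch.float32) / scale
+    return torch.outer(positions, inv_freq)          # (seq_len, head_dim // 2)
+
+def apply_rope(x, angles):
+    # x: (..., seq_len, head_dim); rotate pairs of channels by the angles.
+    x1, x2 = x[..., 0::2], x[..., 1::2]
+    cos, sin = angles.cos(), angles.sin()
+    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
+```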
+
+
+
+
+ + ♻ ☆ Diffusion Policies for Out-of-Distribution Generalization in Offline + Reinforcement Learning + + +
+ Offline Reinforcement Learning (RL) methods leverage previous experiences to +learn better policies than the behavior policy used for data collection. In +contrast to behavior cloning, which assumes the data is collected from expert +demonstrations, offline RL can work with non-expert data and multimodal +behavior policies. However, offline RL algorithms face challenges in handling +distribution shifts and effectively representing policies due to the lack of +online interaction during training. Prior work on offline RL uses conditional +diffusion models to represent multimodal behavior in the dataset. Nevertheless, +these methods are not tailored toward alleviating the out-of-distribution state +generalization. We introduce a novel method, named State Reconstruction for +Diffusion Policies (SRDP), incorporating state reconstruction feature learning +in the recent class of diffusion policies to address the out-of-distribution +generalization problem. State reconstruction loss promotes more descriptive +representation learning of states to alleviate the distribution shift incurred +by the out-of-distribution (OOD) states. We design a novel 2D Multimodal +Contextual Bandit environment to illustrate the OOD generalization of SRDP +compared to prior algorithms. In addition, we assess the performance of our +model on D4RL continuous control benchmarks, namely the navigation of an 8-DoF +ant and forward locomotion of half-cheetah, hopper, and walker2d, achieving +state-of-the-art results. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ StyleGAN as a Utility-Preserving Face De-identification Method + + +
+ Face de-identification methods have been proposed to preserve users' privacy +by obscuring their faces. These methods, however, can degrade the quality of +photos, and they usually do not preserve the utility of faces, i.e., their age, +gender, pose, and facial expression. Recently, GANs, such as StyleGAN, have +been proposed, which generate realistic, high-quality imaginary faces. In this +paper, we investigate the use of StyleGAN in generating de-identified faces +through style mixing. We examined this de-identification method for preserving +utility and privacy by implementing several face detection, verification, and +identification attacks and conducting a user study. The results from our +extensive experiments, human evaluation, and comparison with two +state-of-the-art methods, i.e., CIAGAN and DeepPrivacy, show that StyleGAN +performs on par or better than these methods, preserving users' privacy and +images' utility. In particular, the results of the machine learning-based +experiments show that StyleGAN0-4 preserves utility better than CIAGAN and +DeepPrivacy while preserving privacy at the same level. StyleGAN0-3 preserves +utility at the same level while providing more privacy. In this paper, for the +first time, we also performed a carefully designed user study to examine both +privacy and utility-preserving properties of StyleGAN0-3, 0-4, and 0-5, as well +as CIAGAN and DeepPrivacy from the human observers' perspectives. Our +statistical tests showed that participants tend to verify and identify +StyleGAN0-5 images more easily than DeepPrivacy images. All the methods but +StyleGAN0-5 had significantly lower identification rates than CIAGAN. Regarding +utility, as expected, StyleGAN0-5 performed significantly better in preserving +some attributes. Among all methods, on average, participants believe gender has +been preserved the most while naturalness has been preserved the least. + +
+
+
+
+
+ + ♻ ☆ Seeking Interpretability and Explainability in Binary Activated Neural + Networks + + +
+ We study the use of binary activated neural networks as interpretable and +explainable predictors in the context of regression tasks on tabular data; more +specifically, we provide guarantees on their expressiveness, present an +approach based on the efficient computation of SHAP values for quantifying the +relative importance of the features, hidden neurons and even weights. As the +model's simplicity is instrumental in achieving interpretability, we propose a +greedy algorithm for building compact binary activated networks. This approach +doesn't need to fix an architecture for the network in advance: it is built one +layer at a time, one neuron at a time, leading to predictors that aren't +needlessly complex for a given task. + +
+
+
+
+
+ + ♻ ☆ Dynamical systems' based neural networks + + +
+ Neural networks have gained much interest because of their effectiveness in +many applications. However, their mathematical properties are generally not +well understood. If there is some underlying geometric structure inherent to +the data or to the function to approximate, it is often desirable to take this +into account in the design of the neural network. In this work, we start with a +non-autonomous ODE and build neural networks using a suitable, +structure-preserving, numerical time-discretisation. The structure of the +neural network is then inferred from the properties of the ODE vector field. +Besides injecting more structure into the network architectures, this modelling +procedure allows a better theoretical understanding of their behaviour. We +present two universal approximation results and demonstrate how to impose some +particular properties on the neural networks. A particular focus is on +1-Lipschitz architectures including layers that are not 1-Lipschitz. These +networks are expressive and robust against adversarial attacks, as shown for +the CIFAR-10 and CIFAR-100 datasets. + +
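+ The basic construction can be pictured as a residual network whose layers are
+explicit-Euler steps of a non-autonomous ODE; structure is then imposed by
+constraining the vector field. The sketch below uses an unconstrained tanh
+field purely for illustration and is not the paper's architecture.
+```python
+import torch
+import torch.nn as nn
+
+class EulerODENet(nn.Module):
+    """x_{k+1} = x_k + h * f_k(x_k), with one time-dependent vector field per step."""
+    def __init__(self, dim, n_steps=10, step_size=0.1):
+        super().__init__()
+        self.step_size = step_size
+        self.fields = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_steps))
+
+    def forward(self, x):
+        for field in self.fields:
+            x = x + self.step_size * torch.tanh(field(x))   # explicit Euler step
+        return x
+```
+ Swapping the plain linear map for a field with a constrained weight (for
+example, to control its Lipschitz constant) is, roughly, how the
+structure-preserving discretisations discussed above inject properties such as
+1-Lipschitz behaviour.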
+
+
+
+
+ + ♻ ☆ Hypergraph Structure Inference From Data Under Smoothness Prior + + +
+ Hypergraphs are important for processing data with higher-order relationships
+involving more than two entities. In scenarios where explicit hypergraphs are
+not readily available, it is desirable to infer a meaningful hypergraph
+structure from the node features to capture the intrinsic relations within the
+data. However, existing methods either adopt simple pre-defined rules that fail
+to precisely capture the distribution of the potential hypergraph structure, or
+learn a mapping between hypergraph structures and node features but require a
+large amount of labelled data, i.e., pre-existing hypergraph structures, for
+training. Both restrict their applications in practical scenarios. To fill this
+gap, we propose a novel smoothness prior that enables us to design a method to
+infer the probability for each potential hyperedge without labelled data as
+supervision. The proposed prior indicates that the features of nodes in a
+hyperedge are highly correlated with the features of the hyperedge containing
+them. We use this prior to derive the relation between the hypergraph structure
+and the node features via probabilistic modelling. This allows us to develop an
+unsupervised inference method to estimate the probability for each potential
+hyperedge via solving an optimisation problem that has an analytical solution.
+Experiments on both synthetic and real-world data demonstrate that our method
+can learn meaningful hypergraph structures from data more efficiently than
+existing hypergraph structure inference methods.
+
+
+
+
+
+ + ♻ ☆ Speeding up Fourier Neural Operators via Mixed Precision + + +
+ The Fourier neural operator (FNO) is a powerful technique for learning +surrogate maps for partial differential equation (PDE) solution operators. For +many real-world applications, which often require high-resolution data points, +training time and memory usage are significant bottlenecks. While there are +mixed-precision training techniques for standard neural networks, those work +for real-valued datatypes on finite dimensions and therefore cannot be directly +applied to FNO, which crucially operates in the (complex-valued) Fourier domain +and in function spaces. On the other hand, since the Fourier transform is +already an approximation (due to discretization error), we do not need to +perform the operation at full precision. In this work, we (i) profile memory +and runtime for FNO with full and mixed-precision training, (ii) conduct a +study on the numerical stability of mixed-precision training of FNO, and (iii) +devise a training routine which substantially decreases training time and +memory usage (up to 34%), with little or no reduction in accuracy, on the +Navier-Stokes and Darcy flow equations. Combined with the recently proposed +tensorized FNO (Kossaifi et al., 2023), the resulting model has far better +performance while also being significantly faster than the original FNO. + +
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link +quality prediction for millimeter-wave (mmWave) communications. Previous +studies have proposed machine learning-based methods to predict received signal +strength for future time periods using time series of depth images to mitigate +the line-of-sight (LOS) path blockage by pedestrians in mmWave communication. +However, these image-based methods have limited applicability due to privacy +concerns as camera images may contain sensitive information. This study +proposes a point cloud-based method for mmWave link quality prediction and +demonstrates its feasibility through experiments. Point clouds represent +three-dimensional (3D) spaces as a set of points and are sparser and less +likely to contain sensitive information than camera images. Additionally, point +clouds provide 3D position and motion information, which is necessary for +understanding the radio propagation environment involving pedestrians. This +study designs the mmWave link quality prediction method and conducts realistic +indoor experiments, where the link quality fluctuates significantly due to +human blockage, using commercially available IEEE 802.11ad-based 60 GHz +wireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light +detection and ranging (LiDAR) for point cloud acquisition. The experimental +results showed that our proposed method can predict future large attenuation of +mmWave received signal strength and throughput induced by the LOS path blockage +by pedestrians with comparable or superior accuracy to image-based prediction +methods. Hence, our point cloud-based method can serve as a viable alternative +to image-based methods. + +
+
+ comment: Submitted to IEEE Transactions on Machine Learning in Communications + and Networking +
+
+
+
+
+ + ♻ ☆ Metropolitan Segment Traffic Speeds from Massive Floating Car Data in 10 + Cities + + +
+ Traffic analysis is crucial for urban operations and planning, while the +availability of dense urban traffic data beyond loop detectors is still scarce. +We present a large-scale floating vehicle dataset of per-street segment traffic +information, Metropolitan Segment Traffic Speeds from Massive Floating Car Data +in 10 Cities (MeTS-10), available for 10 global cities with a 15-minute +resolution for collection periods ranging between 108 and 361 days in 2019-2021 +and covering more than 1500 square kilometers per metropolitan area. MeTS-10 +features traffic speed information at all street levels from main arterials to +local streets for Antwerp, Bangkok, Barcelona, Berlin, Chicago, Istanbul, +London, Madrid, Melbourne and Moscow. The dataset leverages the +industrial-scale floating vehicle Traffic4cast data with speeds and vehicle +counts provided in a privacy-preserving spatio-temporal aggregation. We detail +the efficient matching approach mapping the data to the OpenStreetMap road +graph. We evaluate the dataset by comparing it with publicly available +stationary vehicle detector data (for Berlin, London, and Madrid) and the Uber +traffic speed dataset (for Barcelona, Berlin, and London). The comparison +highlights the differences across datasets in spatio-temporal coverage and +variations in the reported traffic caused by the binning method. MeTS-10 +enables novel, city-wide analysis of mobility and traffic patterns for ten +major world cities, overcoming current limitations of spatially sparse vehicle +detector data. The large spatial and temporal coverage offers an opportunity +for joining the MeTS-10 with other datasets, such as traffic surveys in traffic +planning studies or vehicle detector data in traffic control settings. + +
+
+ comment: Accepted by IEEE Transactions on Intelligent Transportation Systems + (T-ITS), DOI: https://doi.org/10.1109/TITS.2023.3291737 +
+
+
+
+
+ + ♻ ☆ Neural Mixed Effects for Nonlinear Personalized Predictions + + +
+ Personalized prediction is a machine learning approach that predicts a +person's future observations based on their past labeled observations and is +typically used for sequential tasks, e.g., to predict daily mood ratings. When +making personalized predictions, a model can combine two types of trends: (a) +trends shared across people, i.e., person-generic trends, such as being happier +on weekends, and (b) unique trends for each person, i.e., person-specific +trends, such as a stressful weekly meeting. Mixed effect models are popular +statistical models to study both trends by combining person-generic and +person-specific parameters. Though linear mixed effect models are gaining +popularity in machine learning by integrating them with neural networks, these +integrations are currently limited to linear person-specific parameters: ruling +out nonlinear person-specific trends. In this paper, we propose Neural Mixed +Effect (NME) models to optimize nonlinear person-specific parameters anywhere +in a neural network in a scalable manner. NME combines the efficiency of neural +network optimization with nonlinear mixed effects modeling. Empirically, we +observe that NME improves performance across six unimodal and multimodal +datasets, including a smartphone dataset to predict daily mood and a +mother-adolescent dataset to predict affective state sequences where half the +mothers experience at least moderate symptoms of depression. Furthermore, we +evaluate NME for two model architectures, including for neural conditional +random fields (CRF) to predict affective state sequences where the CRF learns +nonlinear person-specific temporal transitions between affective states. +Analysis of these person-specific transitions on the mother-adolescent dataset +shows interpretable trends related to the mother's depression symptoms. + +
+
+ comment: camera-ready version +
+
+
+
+
+ + ♻ ☆ Neural ShDF: Reviving an Efficient and Consistent Mesh Segmentation + Method SIGGRAPH 2023 + + +
+ Partitioning a polygonal mesh into meaningful parts can be challenging. Many
+applications require decomposing such structures for further processing in
+computer graphics. In the last decade, several methods were proposed to tackle
+this problem, at the cost of intensive computational times. Recently, machine
+learning has proven to be effective for the segmentation task on 3D structures.
+Nevertheless, these state-of-the-art methods are often hardly generalizable and
+require dividing the learned model into several specific classes of objects to
+avoid overfitting. We present a data-driven approach leveraging deep learning
+to encode a mapping function prior to mesh segmentation for multiple
+applications. Our network reproduces a neighborhood map based on our knowledge
+of the Shape Diameter Function (SDF) method, using similarities among vertex
+neighborhoods. Our approach is resolution-agnostic as we downsample the input
+meshes and query the full-resolution structure solely for neighborhood
+contributions. Using our predicted SDF values, we can inject the resulting
+structure into a graph-cut algorithm to generate an efficient and robust mesh
+segmentation while considerably reducing the required computation times.
+
+
+ comment: 9 pages, 13 figures, and 3 tables. Short paper and poster published + and presented at SIGGRAPH 2023 +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach to labelling a comment as hate speech involves a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment and grounding the interwoven fusion +layers that combine individual comments' text and image embeddings instead of +processing modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude with future work for multimodal solutions to deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ Biclustering Methods via Sparse Penalty + + +
+ In this paper, we first reviewed several biclustering methods that are used
+to identify the most significant clusters in gene expression data. Here we
+mainly focused on the SSVD (sparse SVD) method and tried a new sparse penalty
+named the "Prenet penalty", which has previously been used only in factor
+analysis to gain sparsity. Then, in the simulation study, we tried different
+types of generated datasets (with different sparsity and dimension), first with
+a 1-layer approximation and then with k layers, which shows that the mixed
+Prenet penalty is very effective for non-overlapped data. Finally, we used some
+real gene expression data to show the behavior of our methods.
+
+
+
            comment: This research is still in progress and some issues need to be fixed
          
+
+
+
+
+ + ♻ ☆ Combining Inductive and Deductive Reasoning for Query Answering over + Incomplete Knowledge Graphs + + +
+ Current methods for embedding-based query answering over incomplete Knowledge +Graphs (KGs) only focus on inductive reasoning, i.e., predicting answers by +learning patterns from the data, and lack the complementary ability to do +deductive reasoning, which requires the application of domain knowledge to +infer further information. To address this shortcoming, we investigate the +problem of incorporating ontologies into embedding-based query answering models +by defining the task of embedding-based ontology-mediated query answering. We +propose various integration strategies into prominent representatives of +embedding models that involve (1) different ontology-driven data augmentation +techniques and (2) adaptation of the loss function to enforce the ontology +axioms. We design novel benchmarks for the considered task based on the LUBM +and the NELL KGs and evaluate our methods on them. The achieved improvements in +the setting that requires both inductive and deductive reasoning are from 20% +to 55% in HITS@3. + +
+
+
+
+
+ + ♻ ☆ Learning Delays in Spiking Neural Networks using Dilated Convolutions + with Learnable Spacings + + +
+ Spiking Neural Networks (SNNs) are a promising research direction for
+building power-efficient information processing systems, especially for
+temporal tasks such as speech recognition. In SNNs, delays refer to the time
+needed for one spike to travel from one neuron to another. These delays matter
+because they influence the spike arrival times, and it is well-known that
+spiking neurons respond more strongly to coincident input spikes. More
+formally, it has been shown theoretically that plastic delays greatly increase
+the expressivity in SNNs. Yet, efficient algorithms to learn these delays have
+been lacking. Here, we propose a new discrete-time algorithm that addresses
+this issue in deep feedforward SNNs using backpropagation, in an offline
+manner. To simulate delays between consecutive layers, we use 1D convolutions
+across time. The kernels contain only a few non-zero weights - one per synapse
+- whose positions correspond to the delays. These positions are learned
+together with the weights using the recently proposed Dilated Convolution with
+Learnable Spacings (DCLS). We evaluated our method on three datasets: the
+Spiking Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its
+non-spiking version Google Speech Commands v0.02 (GSC) benchmarks, which
+require detecting temporal patterns. We used feedforward SNNs with two or three
+hidden fully connected layers, and vanilla leaky integrate-and-fire neurons. We
+showed that fixed random delays help and that learning them helps even more.
+Furthermore, our method outperformed the state-of-the-art in the three datasets
+without using recurrent connections and with substantially fewer parameters.
+Our work demonstrates the potential of delay learning in developing accurate
+and precise models for temporal data processing. Our code is based on PyTorch /
+SpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays
+
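+ The delay-as-convolution idea can be sketched with fixed integer delays: each
+synapse's temporal kernel has a single non-zero tap whose position is the
+delay. In the paper these tap positions are continuous and learned with DCLS;
+the fragment below is only the fixed-delay special case, with assumed shapes.
+```python
+import torch
+import torch.nn.functional as F
+
+def delayed_synapses(spikes, weights, delays, max_delay):
+    """spikes: (batch, n_in, time); weights: (n_out, n_in) float;
+    delays: (n_out, n_in) integer time steps (< max_delay);
+    returns causally delayed, weighted inputs of shape (batch, n_out, time)."""
+    n_out, n_in = weights.shape
+    kernels = torch.zeros(n_out, n_in, max_delay)
+    rows = torch.arange(n_out)[:, None]
+    cols = torch.arange(n_in)[None, :]
+    kernels[rows, cols, delays] = weights       # one non-zero tap per synapse
+    padded = F.pad(spikes, (max_delay - 1, 0))  # left-pad so the op stays causal
+    return F.conv1d(padded, kernels.flip(-1))   # output[t] uses spikes[t - delay]
+```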
+
+
+
+
+ + ♻ ☆ Transformers Meet Directed Graphs + + +
+ Transformers were originally proposed as a sequence-to-sequence model for +text but have become vital for a wide range of modalities, including images, +audio, video, and undirected graphs. However, transformers for directed graphs +are a surprisingly underexplored topic, despite their applicability to +ubiquitous domains, including source code and logic circuits. In this work, we +propose two direction- and structure-aware positional encodings for directed +graphs: (1) the eigenvectors of the Magnetic Laplacian - a direction-aware +generalization of the combinatorial Laplacian; (2) directional random walk +encodings. Empirically, we show that the extra directionality information is +useful in various downstream tasks, including correctness testing of sorting +networks and source code understanding. Together with a data-flow-centric graph +construction, our model outperforms the prior state of the art on the Open +Graph Benchmark Code2 relatively by 14.7%. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ♻ ☆ Adaptive Uncertainty-Guided Model Selection for Data-Driven PDE + Discovery + + +
+ We propose a new parameter-adaptive uncertainty-penalized Bayesian +information criterion (UBIC) to prioritize the parsimonious partial +differential equation (PDE) that sufficiently governs noisy spatial-temporal +observed data with few reliable terms. Since the naive use of the BIC for model +selection has been known to yield an undesirable overfitted PDE, the UBIC +penalizes the found PDE not only by its complexity but also the quantified +uncertainty, derived from the model supports' coefficient of variation in a +probabilistic view. We also introduce physics-informed neural network learning +as a simulation-based approach to further validate the selected PDE flexibly +against the other discovered PDE. Numerical results affirm the successful +application of the UBIC in identifying the true governing PDE. Additionally, we +reveal an interesting effect of denoising the observed data on improving the +trade-off between the BIC score and model complexity. Code is available at +https://github.com/Pongpisit-Thanasutives/UBIC. + +
+
+ comment: 17 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ When Deep Learning Meets Polyhedral Theory: A Survey + + +
+ In the past decade, deep learning became the prevalent methodology for
+predictive modeling thanks to the remarkable accuracy of deep neural networks
+in tasks such as computer vision and natural language processing. Meanwhile,
+the structure of neural networks converged back to simpler representations
+based on piecewise constant and piecewise linear functions such as the
+Rectified Linear Unit (ReLU), which became the most commonly used type of
+activation function in neural networks. That made certain types of network
+structure, such as the typical fully-connected feedforward neural network,
+amenable to analysis through polyhedral theory and to the application of
+methodologies such as Linear Programming (LP) and Mixed-Integer Linear
+Programming (MILP) for a variety of purposes. In this paper, we survey the main
+topics emerging from this fast-paced area of work, which bring a fresh
+perspective to understanding neural networks in more detail as well as to
+applying linear optimization techniques to train, verify, and reduce the size
+of such networks.
+
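+ As a concrete instance of the polyhedral/MILP connection, a single ReLU unit
+y = max(0, w^T x + b) with known bounds L <= w^T x + b <= U is commonly encoded
+with one binary indicator z via a big-M formulation; this is the standard
+textbook encoding rather than anything specific to the survey.
+```latex
+y \ge w^{\top}x + b, \qquad
+y \le w^{\top}x + b - L\,(1 - z), \qquad
+y \le U z, \qquad
+y \ge 0, \qquad z \in \{0, 1\}
+```
+ Setting z = 1 forces y = w^T x + b (the active case), while z = 0 forces
+y = 0; stacking these constraints layer by layer yields the MILP models used
+for verification and network compression.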
+
+
+
+
+ + ♻ ☆ Transformer-based interpretable multi-modal data fusion for skin lesion + classification + + +
+ A lot of deep learning (DL) research these days is mainly focused on +improving quantitative metrics regardless of other factors. In human-centered +applications, like skin lesion classification in dermatology, DL-driven +clinical decision support systems are still in their infancy due to the limited +transparency of their decision-making process. Moreover, the lack of procedures +that can explain the behavior of trained DL algorithms leads to almost no trust +from clinical physicians. To diagnose skin lesions, dermatologists rely on +visual assessment of the disease and the data gathered from the patient's +anamnesis. Data-driven algorithms dealing with multi-modal data are limited by +the separation of feature-level and decision-level fusion procedures required +by convolutional architectures. To address this issue, we enable single-stage +multi-modal data fusion via the attention mechanism of transformer-based +architectures to aid in diagnosing skin diseases. Our method beats other +state-of-the-art single- and multi-modal DL architectures in image-rich and +patient-data-rich environments. Additionally, the choice of the architecture +enables native interpretability support for the classification task both in the +image and metadata domain with no additional modifications necessary. + +
+
+ comment: Submitted to IEEE JBHI in July 2023 +
+
+
+
+
+ + ♻ ☆ Invertible normalizing flow neural networks by JKO scheme + + +
+ Normalizing flow is a class of deep generative models for efficient sampling +and density estimation. In practice, the flow often appears as a chain of +invertible neural network blocks; to facilitate training, existing works have +regularized flow trajectories and designed special network architectures. The +current paper develops a neural ODE flow network inspired by the +Jordan-Kinderleherer-Otto (JKO) scheme, which allows efficient block-wise +training of the residual blocks without sampling SDE trajectories or inner +loops of score matching or variational learning. As the JKO scheme unfolds the +dynamic of gradient flow, the proposed model naturally stacks residual network +blocks one by one, reducing the memory load and difficulty in performing +end-to-end deep flow network training. We also develop adaptive time +reparameterization of the flow network with a progressive refinement of the +trajectory in probability space, which improves the model training efficiency +and accuracy in practice. Using numerical experiments with synthetic and real +data, we show that the proposed JKO-iFlow model achieves similar or better +performance in generating new samples compared with the existing flow and +diffusion models at a significantly reduced computational and memory cost. + +
+
+
+
+
+ + ♻ ☆ Data-driven Predictive Latency for 5G: A Theoretical and Experimental + Analysis Using Network Measurements + + +
+ The advent of novel 5G services and applications with binding latency
+requirements and guaranteed Quality of Service (QoS) hastened the need to
+incorporate autonomous and proactive decision-making in network management
+procedures. The objective of our study is to provide a thorough analysis of
+predictive latency within 5G networks by utilizing real-world network data that
+is accessible to mobile network operators (MNOs). In particular, (i) we present
+an analytical formulation of the user-plane latency as a Hypoexponential
+distribution, which is validated by means of a comparative analysis with
+empirical measurements, and (ii) we present experimental results on
+probabilistic regression, anomaly detection, and predictive forecasting,
+leveraging emerging domains in Machine Learning (ML), such as Bayesian
+Learning (BL) and Machine Learning on Graphs (GML). We test our predictive
+framework using data gathered from scenarios of vehicular mobility, dense-urban
+traffic, and social gathering events. Our results provide valuable insights
+into the efficacy of predictive algorithms in practical applications.
+
+
+
+
+
+ + ♻ ☆ Generative Sliced MMD Flows with Riesz Kernels + + +
+ Maximum mean discrepancy (MMD) flows suffer from high computational costs in +large scale computations. In this paper, we show that MMD flows with Riesz +kernels $K(x,y) = - \Vert x-y\Vert^r$, $r \in (0,2)$ have exceptional +properties which allow their efficient computation. We prove that the MMD of +Riesz kernels coincides with the MMD of their sliced version. As a consequence, +the computation of gradients of MMDs can be performed in the one-dimensional +setting. Here, for $r=1$, a simple sorting algorithm can be applied to reduce +the complexity from $O(MN+N^2)$ to $O((M+N)\log(M+N))$ for two measures with +$M$ and $N$ support points. As another interesting follow-up result, the MMD of +compactly supported measures can be estimated from above and below by the +Wasserstein-1 distance. For the implementations we approximate the gradient of +the sliced MMD by using only a finite number $P$ of slices. We show that the +resulting error has complexity $O(\sqrt{d/P})$, where $d$ is the data +dimension. These results enable us to train generative models by approximating +MMD gradient flows by neural networks even for image applications. We +demonstrate the efficiency of our model by image generation on MNIST, +FashionMNIST and CIFAR10. + +
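+ The key computational point, that Riesz-kernel MMDs can be evaluated
+slice-wise in one dimension with sorting-based O((M+N) log(M+N)) routines, can
+be sketched as follows for r = 1 and (N, d)-shaped sample arrays. This is an
+illustrative estimator written from the description above, not the authors'
+code, and the slice count and Monte-Carlo averaging are assumptions.
+```python
+import numpy as np
+
+def pairwise_abs_sum(x):
+    """sum_{i,j} |x_i - x_j| in O(n log n) via sorting."""
+    x = np.sort(x)
+    idx = np.arange(1, len(x) + 1)
+    return 2.0 * np.sum((2 * idx - len(x) - 1) * x)
+
+def cross_abs_sum(x, y):
+    """sum_{i,j} |x_i - y_j| in O((m+n) log n) via sorting and prefix sums."""
+    y = np.sort(y)
+    csum = np.concatenate([[0.0], np.cumsum(y)])
+    k = np.searchsorted(y, x)                       # y values below each x_i
+    below = x * k - csum[k]
+    above = (csum[-1] - csum[k]) - x * (len(y) - k)
+    return np.sum(below + above)
+
+def sliced_mmd_riesz(x, y, num_slices=64, seed=0):
+    """Monte-Carlo sliced MMD^2 for K(a, b) = -||a - b|| (energy-distance form)."""
+    rng = np.random.default_rng(seed)
+    m, n, d = len(x), len(y), x.shape[1]
+    vals = []
+    for _ in range(num_slices):
+        theta = rng.normal(size=d)
+        theta /= np.linalg.norm(theta)              # random slicing direction
+        xp, yp = x @ theta, y @ theta
+        vals.append(2.0 * cross_abs_sum(xp, yp) / (m * n)
+                    - pairwise_abs_sum(xp) / (m * m)
+                    - pairwise_abs_sum(yp) / (n * n))
+    return float(np.mean(vals))
+```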
+
+
+
+
+ + ♻ ☆ Leveraging Image-based Generative Adversarial Networks for Time Series + Generation + + +
+ Generative models for images have gained significant attention in computer +vision and natural language processing due to their ability to generate +realistic samples from complex data distributions. To leverage the advances of +image-based generative models for the time series domain, we propose a +two-dimensional image representation for time series, the Extended +Intertemporal Return Plot (XIRP). Our approach captures the intertemporal time +series dynamics in a scale-invariant and invertible way, reducing training time +and improving sample quality. We benchmark synthetic XIRPs obtained by an +off-the-shelf Wasserstein GAN with gradient penalty (WGAN-GP) to other image +representations and models regarding similarity and predictive ability metrics. +Our novel, validated image representation for time series consistently and +significantly outperforms a state-of-the-art RNN-based generative model +regarding predictive ability. Further, we introduce an improved stochastic +inversion to substantially improve simulation quality regardless of the +representation and provide the prospect of transfer potentials in other +domains. + +
+
+
+
+
+ + ♻ ☆ 0/1 Deep Neural Networks via Block Coordinate Descent + + +
+ The step function is one of the simplest and most natural activation
+functions for deep neural networks (DNNs). As it counts 1 for positive
+variables and 0 for others, its intrinsic characteristics (e.g., discontinuity
+and no viable information of subgradients) have impeded its development for
+several decades. Even if there is an impressive body of work on designing DNNs
+with continuous activation functions that can be deemed as surrogates of the
+step function, the step function still possesses some advantageous properties,
+such as complete robustness to outliers and being capable of attaining the best
+learning-theoretic guarantee of predictive accuracy. Hence, in this paper, we
+aim to train DNNs with the step function used as an activation function (dubbed
+as 0/1 DNNs). We first reformulate 0/1 DNNs as an unconstrained optimization
+problem and then solve it by a block coordinate descent (BCD) method. Moreover,
+we acquire closed-form solutions for sub-problems of BCD as well as its
+convergence properties. Furthermore, we also integrate
+$\ell_{2,0}$-regularization into 0/1 DNN to accelerate the training process and
+compress the network scale. As a result, the proposed algorithm achieves
+desirable performance on classifying the MNIST, FashionMNIST, Cifar10, and
+Cifar100 datasets.
+
+
+
+
+
+ + ♻ ☆ Principled Pruning of Bayesian Neural Networks through Variational Free + Energy Minimization + + +
+ Bayesian model reduction provides an efficient approach for comparing the +performance of all nested sub-models of a model, without re-evaluating any of +these sub-models. Until now, Bayesian model reduction has been applied mainly +in the computational neuroscience community on simple models. In this paper, we +formulate and apply Bayesian model reduction to perform principled pruning of +Bayesian neural networks, based on variational free energy minimization. Direct +application of Bayesian model reduction, however, gives rise to approximation +errors. Therefore, a novel iterative pruning algorithm is presented to +alleviate the problems arising with naive Bayesian model reduction, as +supported experimentally on the publicly available UCI datasets for different +inference algorithms. This novel parameter pruning scheme solves the +shortcomings of current state-of-the-art pruning methods that are used by the +signal processing community. The proposed approach has a clear stopping +criterion and minimizes the same objective that is used during training. Next +to these benefits, our experiments indicate better model performance in +comparison to state-of-the-art pruning schemes. + +
+
+
+
+
+ + ♻ ☆ The Role of Diverse Replay for Generalisation in Reinforcement Learning + + +
+ In reinforcement learning (RL), key components of many algorithms are the
+exploration strategy and replay buffer. These strategies regulate what
+environment data is collected and trained on and have been extensively studied
+in the RL literature. In this paper, we investigate the impact of these
+components in the context of generalisation in multi-task RL. We investigate
+the hypothesis that collecting and training on more diverse data from the
+training environments will improve zero-shot generalisation to new tasks. We
+motivate mathematically and show empirically that generalisation to tasks that
+are "reachable" during training is improved by increasing the diversity of
+transitions in the replay buffer. Furthermore, we show empirically that this
+same strategy also shows improvement for generalisation to similar but
+"unreachable" tasks which could be due to improved generalisation of the
+learned latent representations.
+
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Yet Another ICU Benchmark: A Flexible Multi-Center Framework for + Clinical ML + + +
+ Medical applications of machine learning (ML) have experienced a surge in +popularity in recent years. The intensive care unit (ICU) is a natural habitat +for ML given the abundance of available data from electronic health records. +Models have been proposed to address numerous ICU prediction tasks like the +early detection of complications. While authors frequently report +state-of-the-art performance, it is challenging to verify claims of +superiority. Datasets and code are not always published, and cohort +definitions, preprocessing pipelines, and training setups are difficult to +reproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular +framework that allows researchers to define reproducible and comparable +clinical ML experiments; we offer an end-to-end solution from cohort definition +to model evaluation. The framework natively supports most open-access ICU +datasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future +ICU datasets. Combined with a transparent preprocessing pipeline and extensible +training code for multiple ML and deep learning models, YAIB enables unified +model development. Our benchmark comes with five predefined established +prediction tasks (mortality, acute kidney injury, sepsis, kidney function, and +length of stay) developed in collaboration with clinicians. Adding further +tasks is straightforward by design. Using YAIB, we demonstrate that the choice +of dataset, cohort definition, and preprocessing have a major impact on the +prediction performance - often more so than model class - indicating an urgent +need for YAIB as a holistic benchmarking tool. We provide our work to the +clinical ML community to accelerate method development and enable real-world +clinical implementations. Software Repository: +https://github.com/rvandewater/YAIB. + +
+
+ comment: Main benchmark: https://github.com/rvandewater/YAIB, Cohort + generation: https://github.com/rvandewater/YAIB-cohorts, Models: + https://github.com/rvandewater/YAIB-models +
+
+
+
+
+ + ♻ ☆ MaxViT-UNet: Multi-Axis Attention for Medical Image Segmentation + + +
+ In this work, we present MaxViT-UNet, an Encoder-Decoder based hybrid vision
+transformer (CNN-Transformer) for medical image segmentation. The proposed
+Hybrid Decoder, based on MaxViT-block, is designed to harness the power of both
+the convolution and self-attention mechanisms at each decoding stage with a
+nominal memory and computational burden. The inclusion of multi-axis
+self-attention, within each decoder stage, significantly enhances the
+discriminating capacity between the object and background regions, thereby
+helping in improving the segmentation efficiency. In the Hybrid Decoder block,
+the fusion process commences by integrating the upsampled lower-level decoder
+features, obtained through transpose convolution, with the skip-connection
+features derived from the hybrid encoder. Subsequently, the fused features
+undergo refinement through the utilization of a multi-axis attention mechanism.
+The proposed decoder block is repeated multiple times to progressively segment
+the nuclei regions. Experimental results on the MoNuSeg18 and MoNuSAC20
+datasets demonstrate the effectiveness of the proposed technique. Our
+MaxViT-UNet outperformed the previous CNN-based (UNet) and Transformer-based
+(Swin-UNet) techniques by a considerable margin on both of the standard
+datasets. The following GitHub repository
+(https://github.com/PRLAB21/MaxViT-UNet) contains the implementation and
+trained weights.
+
+
+ comment: 17 pages, 6 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Collage Diffusion + + +
+ We seek to give users precise control over diffusion-based image generation +by modeling complex scenes as sequences of layers, which define the desired +spatial arrangement and visual attributes of objects in the scene. Collage +Diffusion harmonizes the input layers to make objects fit together -- the key +challenge involves minimizing changes in the positions and key visual +attributes of the input layers while allowing other attributes to change in the +harmonization process. We ensure that objects are generated in the correct +locations by modifying text-image cross-attention with the layers' alpha masks. +We preserve key visual attributes of input layers by learning specialized text +representations per layer and by extending ControlNet to operate on layers. +Layer input allows users to control the extent of image harmonization on a +per-object basis, and users can even iteratively edit individual objects in +generated images while keeping other objects fixed. By leveraging the rich +information present in layer input, Collage Diffusion generates globally +harmonized images that maintain desired object characteristics better than +prior approaches. + +
+
+
+
+
+ + ♻ ☆ Knowledge Enhanced Graph Neural Networks for Graph Completion + + +
+ Graph data is omnipresent and has a wide variety of applications, such as in
+natural science, social networks, or the semantic web. However, while being
+rich in information, graphs are often noisy and incomplete. As a result, graph
+completion tasks, such as node classification or link prediction, have gained
+attention. On one hand, neural methods, such as graph neural networks, have
+proven to be robust tools for learning rich representations of noisy graphs. On
+the other hand, symbolic methods enable exact reasoning on graphs. We propose
+Knowledge Enhanced Graph Neural Networks (KeGNN), a neuro-symbolic framework
+for graph completion that combines both paradigms as it allows for the
+integration of prior knowledge into a graph neural network model. Essentially,
+KeGNN consists of a graph neural network as a base upon which knowledge
+enhancement layers are stacked with the goal of refining predictions with
+respect to prior knowledge. We instantiate KeGNN in conjunction with two
+state-of-the-art graph neural networks, Graph Convolutional Networks and Graph
+Attention Networks, and evaluate KeGNN on multiple benchmark datasets for node
+classification.
+
+
+
+
+
+ + ♻ ☆ StyleDiff: Attribute Comparison Between Unlabeled Datasets in Latent + Disentangled Space + + +
+ One major challenge in machine learning applications is coping with +mismatches between the datasets used in the development and those obtained in +real-world applications. These mismatches may lead to inaccurate predictions +and errors, resulting in poor product quality and unreliable systems. In this +study, we propose StyleDiff to inform developers of the differences between the +two datasets for the steady development of machine learning systems. Using +disentangled image spaces obtained from recently proposed generative models, +StyleDiff compares the two datasets by focusing on attributes in the images and +provides an easy-to-understand analysis of the differences between the +datasets. The proposed StyleDiff performs in $O (d N\log N)$, where $N$ is the +size of the datasets and $d$ is the number of attributes, enabling the +application to large datasets. We demonstrate that StyleDiff accurately detects +differences between datasets and presents them in an understandable format +using, for example, driving scenes datasets. + +
+
+ comment: 25 pages, 17 figures, Image and Vision Computing +
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Visual Parameter-Efficient Fine-Tuning ICCV 2023 + + +
+ Visual Parameter-Efficient Fine-Tuning (PEFT) has become a powerful
+alternative to full fine-tuning for adapting pre-trained vision models to
+downstream tasks: it tunes only a small number of parameters while freezing
+the vast majority of them, easing the storage burden and optimization
+difficulty. However, existing PEFT methods introduce trainable parameters at
+the same positions across different tasks, relying solely on human heuristics
+and neglecting domain gaps. To this end, we study where to introduce and how
+to allocate trainable parameters by proposing a novel Sensitivity-aware visual
+Parameter-efficient fine-Tuning (SPT) scheme, which adaptively allocates
+trainable parameters to task-specific important positions given a desired
+tunable parameter budget. Specifically, our SPT first quickly identifies the
+sensitive parameters that require tuning for a given task in a data-dependent
+way. Next, our SPT further boosts the representational capability for the
+weight matrices whose number of sensitive parameters exceeds a pre-defined
+threshold by utilizing existing structured tuning methods, e.g., LoRA [23] or
+Adapter [22], to replace directly tuning the selected sensitive parameters
+(unstructured tuning) under the budget. Extensive experiments on a wide range
+of downstream recognition tasks show that our SPT is complementary to the
+existing PEFT methods and largely boosts their performance, e.g., SPT improves
+Adapter with supervised pre-trained ViT-B/16 backbone by 4.2% and 1.4% mean
+Top-1 accuracy, reaching SOTA performance on FGVC and VTAB-1k benchmarks,
+respectively. Source code is at https://github.com/ziplab/SPT
+
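+ A rough sketch of the sensitivity-then-budget idea, using a first-order
+|gradient * weight| score and a simple count threshold (both are assumptions;
+SPT's exact criterion and allocation rule may differ):
+
+import torch
+
+def sensitivity_scores(model, loss_fn, batch):
+    """Data-dependent per-parameter sensitivity, approximated here by the
+    first-order term |gradient * weight| on one batch."""
+    inputs, targets = batch
+    loss = loss_fn(model(inputs), targets)
+    named = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
+    grads = torch.autograd.grad(loss, [p for _, p in named])
+    return {n: (g * p).abs() for (n, p), g in zip(named, grads)}
+
+def allocate_budget(scores, budget, structured_threshold=1000):
+    """Keep the `budget` most sensitive parameters; weight matrices holding more
+    than `structured_threshold` selected entries are flagged for structured
+    tuning (e.g. LoRA / Adapter) instead of unstructured tuning."""
+    flat = torch.cat([s.flatten() for s in scores.values()])
+    cutoff = torch.topk(flat, budget).values.min()   # assumes budget <= total #params
+    return {name: ("structured" if int((s >= cutoff).sum()) > structured_threshold
+                   else "unstructured")
+            for name, s in scores.items()}
+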
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Embeddings in the Biomedical Domain: Are They Useful? A + Look at Link Prediction, Rule Learning, and Downstream Polypharmacy Tasks + + +
+ Knowledge graphs are powerful tools for representing and organising complex +biomedical data. Several knowledge graph embedding algorithms have been +proposed to learn from and complete knowledge graphs. However, a recent study +demonstrates the limited efficacy of these embedding algorithms when applied to +biomedical knowledge graphs, raising the question of whether knowledge graph +embeddings have limitations in biomedical settings. This study aims to apply +state-of-the-art knowledge graph embedding models in the context of a recent +biomedical knowledge graph, BioKG, and evaluate their performance and potential +downstream uses. We achieve a three-fold improvement in terms of performance +based on the HITS@10 score over previous work on the same biomedical knowledge +graph. Additionally, we provide interpretable predictions through a rule-based +method. We demonstrate that knowledge graph embedding models are applicable in +practice by evaluating the best-performing model on four tasks that represent +real-life polypharmacy situations. Results suggest that knowledge learnt from +large biomedical knowledge graphs can be transferred to such downstream use +cases. Our code is available at https://github.com/aryopg/biokge. + +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,
+organs, vessels) and 3D surgical instrument models. The broad application of
+statistical shape models (SSMs) in medical image analysis prior to the deep
+learning era is evidence that shapes were commonly used to describe medical
+data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in
+medical imaging are predominantly voxel-based. In computer vision, by contrast,
+shapes (including voxel occupancy grids, meshes, point clouds and implicit
+surface models) are preferred data representations in 3D, as seen from the
+numerous shape-related publications in premier vision conferences, such as the
+IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as well
+as the increasing popularity of ShapeNet (about 51,300 models) and Princeton
+ModelNet (127,915 models) in computer vision research. MedShapeNet is created
+as an alternative to these commonly used shape benchmarks to facilitate the
+translation of data-driven vision algorithms to medical applications, and it
+extends the opportunities to adapt SOTA vision algorithms to solve critical
+medical problems. Moreover, the majority of the medical shapes in MedShapeNet
+are modeled directly on the imaging data of real patients, so it complements
+existing shape benchmarks, which consist of computer-aided design (CAD) models.
+MedShapeNet currently includes more than 100,000 medical shapes, and provides
+annotations in the form of paired data. It is therefore also a freely available
+repository of 3D models for extended reality (virtual reality - VR, augmented
+reality - AR, mixed reality - MR) and medical 3D printing. This white paper
+describes in detail the motivations behind MedShapeNet, the shape acquisition
+procedures, the use cases, as well as the usage of the online shape search
+portal: https://medshapenet.ikim.nrw/
+
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Dynamic Data Augmentation via MCTS for Prostate MRI Segmentation + + +
+ Medical image data are often limited due to the expensive acquisition and
+annotation process. Hence, training a deep-learning model with only raw data
+can easily lead to overfitting. One solution to this problem is to augment the
+raw data with various transformations, improving the model's ability to
+generalize to new data. However, manually configuring a generic augmentation
+combination and parameters for different datasets is non-trivial due to
+inconsistent acquisition approaches and data distributions. Automatic data
+augmentation has therefore been proposed to learn favorable augmentation
+strategies for different datasets, but it incurs a large GPU overhead. To this
+end, we present a novel method, called Dynamic Data Augmentation (DDAug), which
+is efficient and has negligible computation cost. Our DDAug develops a
+hierarchical tree structure to represent various augmentations and utilizes an
+efficient Monte-Carlo tree search algorithm to update, prune, and sample the
+tree. As a result, the augmentation pipeline can be optimized for each dataset
+automatically. Experiments on multiple Prostate MRI datasets show that our
+method outperforms the current state-of-the-art data augmentation strategies.
+
+
+
+
+
+ + ♻ ☆ Improving the Validity of Decision Trees as Explanations + + +
+ In classification and forecasting with tabular data, one often utilizes +tree-based models. Those can be competitive with deep neural networks on +tabular data [cf. Grinsztajn et al., NeurIPS 2022, arXiv:2207.08815] and, under +some conditions, explainable. The explainability depends on the depth of the +tree and the accuracy in each leaf of the tree. Decision trees containing +leaves with unbalanced accuracy can provide misleading explanations. +Low-accuracy leaves give less valid explanations, which could be interpreted as +unfairness among explanations. Here, we train a shallow tree with the objective +of minimizing the maximum misclassification error across each leaf node. Then, +we extend each leaf with a separate tree-based model. The shallow tree provides +a global explanation, while the overall statistical performance of the shallow +tree with extended leaves improves upon decision trees of unlimited depth +trained using classical methods (e.g., CART) and is comparable to +state-of-the-art methods (e.g., well-tuned XGBoost). + +
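+ A small scikit-learn sketch of the two-stage construction (a plain
+depth-limited CART tree and gradient boosting in the leaves are used here as
+stand-ins; the paper trains the shallow tree to minimize the maximum per-leaf
+misclassification error):
+
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+
+class ShallowTreeWithLeafModels:
+    """Shallow global tree for the explanation, a separate model per leaf for accuracy."""
+
+    def __init__(self, depth=3):
+        self.tree = DecisionTreeClassifier(max_depth=depth)
+        self.leaf_models = {}
+
+    def fit(self, X, y):
+        self.tree.fit(X, y)
+        leaves = self.tree.apply(X)              # leaf id of every training sample
+        for leaf in np.unique(leaves):
+            idx = leaves == leaf
+            if len(np.unique(y[idx])) == 1:      # pure leaf: keep the constant prediction
+                self.leaf_models[leaf] = None
+            else:
+                self.leaf_models[leaf] = GradientBoostingClassifier().fit(X[idx], y[idx])
+        return self
+
+    def predict(self, X):
+        leaves = self.tree.apply(X)
+        preds = self.tree.predict(X)
+        for leaf, model in self.leaf_models.items():
+            idx = leaves == leaf
+            if model is not None and idx.any():
+                preds[idx] = model.predict(X[idx])
+        return preds
+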
+
+
+
+
+ + ♻ ☆ Quantization-based Optimization with Perspective of Quantum Mechanics + + +
+ Statistical and stochastic analysis based on thermodynamics has been the main
+analysis framework for stochastic global optimization. With the recent
+appearance of quantum annealing and quantum tunneling algorithms for global
+optimization, a new research framework for global optimization algorithms is
+required. In this paper, we analyse quantization-based optimization through the
+Schr\"odinger equation to reveal which property of quantum mechanics enables
+global optimization. We show that the tunneling effect derived from the
+Schr\"odinger equation in quantization-based optimization enables escape from a
+local minimum. Additionally, we confirm that this tunneling effect is the same
+property found in quantum mechanics-based global optimization. Experiments with
+standard multi-modal benchmark functions show that the proposed analysis is
+valid.
+
+
+ comment: Preprint for ICTC conference (First Revision) +
+
+
+
+
+ + ♻ ☆ Online Distributed Learning with Quantized Finite-Time Coordination + + +
+ In this paper we consider online distributed learning problems. Online +distributed learning refers to the process of training learning models on +distributed data sources. In our setting a set of agents need to cooperatively +train a learning model from streaming data. Differently from federated +learning, the proposed approach does not rely on a central server but only on +peer-to-peer communications among the agents. This approach is often used in +scenarios where data cannot be moved to a centralized location due to privacy, +security, or cost reasons. In order to overcome the absence of a central +server, we propose a distributed algorithm that relies on a quantized, +finite-time coordination protocol to aggregate the locally trained models. +Furthermore, our algorithm allows for the use of stochastic gradients during +local training. Stochastic gradients are computed using a randomly sampled +subset of the local training data, which makes the proposed algorithm more +efficient and scalable than traditional gradient descent. In our paper, we +analyze the performance of the proposed algorithm in terms of the mean distance +from the online solution. Finally, we present numerical results for a logistic +regression task. + +
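+ A simplified stand-in for the approach, with a uniform quantizer and a fixed
+number of gossip rounds in place of the finite-time coordination protocol (the
+logistic-regression local update is likewise only an example):
+
+import numpy as np
+
+def quantize(x, step=0.01):
+    """Uniform quantizer applied to every transmitted message."""
+    return step * np.round(x / step)
+
+def quantized_coordination(models, adjacency, rounds=10, step=0.01):
+    """Peer-to-peer averaging of the agents' model vectors using only quantized
+    messages; a simplified stand-in for the finite-time coordination protocol."""
+    x = np.array(models, dtype=float)              # (n_agents, dim)
+    for _ in range(rounds):
+        sent = quantize(x, step)                   # what each agent broadcasts
+        new_x = np.empty_like(x)
+        for i in range(len(x)):
+            neigh = np.flatnonzero(adjacency[i])
+            new_x[i] = (sent[i] + sent[neigh].sum(axis=0)) / (1 + len(neigh))
+        x = new_x
+    return x
+
+def sgd_step(w, batch_x, batch_y, lr=0.1):
+    """One stochastic-gradient step of logistic regression on a streaming mini-batch."""
+    p = 1.0 / (1.0 + np.exp(-batch_x @ w))
+    return w - lr * batch_x.T @ (p - batch_y) / len(batch_y)
+
+# Per time step: every agent runs sgd_step on its newest data, then all agents
+# run quantized_coordination to agree on (approximately) the network average.
+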
+
+ comment: To be presented at IEEE CDC'23 +
+
+
+
+
+ + ♻ ☆ Sequential Informed Federated Unlearning: Efficient and Provable Client + Unlearning in Federated Optimization + + +
+ The aim of Machine Unlearning (MU) is to provide theoretical guarantees on
+the removal of the contribution of a given data point from a training
+procedure. Federated Unlearning (FU) consists of extending MU to unlearn a
+given client's contribution from a federated training routine. Current FU
+approaches are generally not scalable, and do not come with sound theoretical
+quantification of the effectiveness of unlearning. In this work we present
+Informed Federated Unlearning (IFU), a novel efficient and quantifiable FU
+approach. Upon an unlearning request from a given client, IFU identifies the
+optimal FL iteration from which FL has to be reinitialized, with unlearning
+guarantees obtained through a randomized perturbation mechanism. The theory of
+IFU is also extended to account for sequential unlearning requests.
+Experimental results on different tasks and datasets show that IFU leads to
+more efficient unlearning procedures compared to basic re-training and
+state-of-the-art FU approaches.
+
+
+
+
+
+ + ♻ ☆ Federated Adaptive Prompt Tuning for Multi-domain Collaborative Learning + + +
+ Federated learning (FL) enables multiple clients to collaboratively train a
+global model without disclosing their data. Previous research often requires
+training the complete set of model parameters. However, the emergence of
+powerful pre-trained models makes it possible to achieve higher performance
+with fewer learnable parameters in FL. In this paper, we propose a federated
+adaptive prompt tuning algorithm, FedAPT, for multi-domain collaborative image
+classification with powerful foundation models, like CLIP. Compared with direct
+federated prompt tuning, our core idea is to adaptively unlock specific domain
+knowledge for each test sample in order to provide it with a personalized
+prompt. To implement this idea, we design an adaptive prompt tuning module,
+which consists of a meta prompt, an adaptive network, and some keys. The server
+randomly generates a set of keys and assigns a unique key to each client. Then
+all clients cooperatively train the global adaptive network and meta prompt
+with the local datasets and the frozen keys. Ultimately, the global aggregation
+model can assign a personalized prompt to CLIP based on the domain features of
+each test sample. We perform extensive experiments on two multi-domain image
+classification datasets across two different settings - supervised and
+unsupervised. The results show that FedAPT can achieve better performance with
+less than 10\% of the number of parameters of the fully trained model, and the
+global model can perform well in diverse client domains simultaneously.
+
+
+
+
+
+ + ♻ ☆ RBA-GCN: Relational Bilevel Aggregation Graph Convolutional Network for + Emotion Recognition + + +
+ Emotion recognition in conversation (ERC) has received increasing attention
+from researchers due to its wide range of applications. As conversation has a
+natural graph structure, numerous approaches used to model ERC based on graph
+convolutional networks (GCNs) have yielded significant results. However, the
+aggregation approach of traditional GCNs suffers from the node information
+redundancy problem, leading to node discriminant information loss.
+Additionally, single-layer GCNs lack the capacity to capture long-range
+contextual information from the graph. Furthermore, the majority of approaches
+are based on textual modality or stitching together different modalities,
+resulting in a weak ability to capture interactions between modalities. To
+address these problems, we present the relational bilevel aggregation graph
+convolutional network (RBA-GCN), which consists of three modules: the graph
+generation module (GGM), similarity-based cluster building module (SCBM) and
+bilevel aggregation module (BiAM). First, GGM constructs a novel graph to
+reduce the redundancy of target node information. Then, SCBM calculates the
+node similarity in the target node and its structural neighborhood, where noisy
+information with low similarity is filtered out to preserve the discriminant
+information of the node. Meanwhile, BiAM is a novel aggregation method that can
+preserve the information of nodes during the aggregation process. This module
+can construct the interaction between different modalities and capture
+long-range contextual information based on similarity clusters. On both the
+IEMOCAP and MELD datasets, the weighted average F1 score of RBA-GCN has a
+2.17$\sim$5.21\% improvement over that of the most advanced method. Our code is
+available at https://github.com/luftmenscher/RBA-GCN and our article with the
+same name has been published in IEEE/ACM Transactions on Audio, Speech, and
+Language Processing, vol. 31, 2023
+
+
+
+
+
+ + ♻ ☆ Why Does Little Robustness Help? Understanding and Improving Adversarial + Transferability from Surrogate Training + + +
+ Adversarial examples (AEs) for DNNs have been shown to be transferable: AEs +that successfully fool white-box surrogate models can also deceive other +black-box models with different architectures. Although a bunch of empirical +studies have provided guidance on generating highly transferable AEs, many of +these findings lack explanations and even lead to inconsistent advice. In this +paper, we take a further step towards understanding adversarial +transferability, with a particular focus on surrogate aspects. Starting from +the intriguing little robustness phenomenon, where models adversarially trained +with mildly perturbed adversarial samples can serve as better surrogates, we +attribute it to a trade-off between two predominant factors: model smoothness +and gradient similarity. Our investigations focus on their joint effects, +rather than their separate correlations with transferability. Through a series +of theoretical and empirical analyses, we conjecture that the data distribution +shift in adversarial training explains the degradation of gradient similarity. +Building on these insights, we explore the impacts of data augmentation and +gradient regularization on transferability and identify that the trade-off +generally exists in the various training mechanisms, thus building a +comprehensive blueprint for the regulation mechanism behind transferability. +Finally, we provide a general route for constructing better surrogates to boost +transferability which optimizes both model smoothness and gradient similarity +simultaneously, e.g., the combination of input gradient regularization and +sharpness-aware minimization (SAM), validated by extensive experiments. In +summary, we call for attention to the united impacts of these two factors for +launching effective transfer attacks, rather than optimizing one while ignoring +the other, and emphasize the crucial role of manipulating surrogate models. + +
+
+ comment: IEEE Symposium on Security and Privacy (Oakland) 2024; Extended + version of camera-ready +
+
+
+
+
+ + ♻ ☆ Simulation-Based Optimization of User Interfaces for Quality-Assuring + Machine Learning Model Predictions + + +
+ Quality-sensitive applications of machine learning (ML) require quality +assurance (QA) by humans before the predictions of an ML model can be deployed. +QA for ML (QA4ML) interfaces require users to view a large amount of data and +perform many interactions to correct errors made by the ML model. An optimized +user interface (UI) can significantly reduce interaction costs. While UI +optimization can be informed by user studies evaluating design options, this +approach is not scalable because there are typically numerous small variations +that can affect the efficiency of a QA4ML interface. Hence, we propose using +simulation to evaluate and aid the optimization of QA4ML interfaces. In +particular, we focus on simulating the combined effects of human intelligence +in initiating appropriate interaction commands and machine intelligence in +providing algorithmic assistance for accelerating QA4ML processes. As QA4ML is +usually labor-intensive, we use the simulated task completion time as the +metric for UI optimization under different interface and algorithm setups. We +demonstrate the usage of this UI design method in several QA4ML applications. + +
+
+ comment: Published in ACM Transactions on Interactive Intelligent Systems +
+
+
+
+
+ + ♻ ☆ Extending regionalization algorithms to explore spatial process + heterogeneity + + +
+ In spatial regression models, spatial heterogeneity may be considered with
+either continuous or discrete specifications. The latter is related to the
+delineation of spatially connected regions with homogeneous relationships
+between variables (spatial regimes). Although various regionalization
+algorithms have been proposed and studied in the field of spatial analytics,
+methods to optimize spatial regimes have been largely unexplored. In this
+paper, we propose two new algorithms for spatial regime delineation, two-stage
+K-Models and Regional-K-Models. We also extend the classic Automatic Zoning
+Procedure to the spatial regression context. The proposed algorithms are
+applied to a series of synthetic datasets and two real-world datasets. Results
+indicate that all three algorithms achieve superior or comparable performance
+to existing approaches, while the two-stage K-Models algorithm largely
+outperforms existing approaches on model fitting, region reconstruction, and
+coefficient estimation. Our work enriches the spatial analytics toolbox to
+explore spatially heterogeneous processes.
+
+
+ comment: 25 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Symmetry-Preserving Program Representations for Learning Code Semantics + + +
+ Large Language Models (LLMs) have shown promise in automated program +reasoning, a crucial aspect of many security tasks. However, existing LLM +architectures for code are often borrowed from other domains like natural +language processing, raising concerns about their generalization and robustness +to unseen code. A key generalization challenge is to incorporate the knowledge +of code semantics, including control and data flow, into the LLM architectures. + Drawing inspiration from examples of convolution layers exploiting +translation symmetry, we explore how code symmetries can enhance LLM +architectures for program analysis and modeling. We present a rigorous +group-theoretic framework that formally defines code symmetries as +semantics-preserving transformations and provides techniques for precisely +reasoning about symmetry preservation within LLM architectures. Using this +framework, we introduce a novel variant of self-attention that preserves +program symmetries, demonstrating its effectiveness in generalization and +robustness through detailed experimental evaluations across different binary +and source code analysis tasks. Overall, our code symmetry framework offers +rigorous and powerful reasoning techniques that can guide the future +development of specialized LLMs for code and advance LLM-guided program +reasoning tasks. + +
+
+
+
+
+ + ♻ ☆ On-Demand Communication for Asynchronous Multi-Agent Bandits AISTATS 2023 + + +
+ This paper studies a cooperative multi-agent multi-armed stochastic bandit +problem where agents operate asynchronously -- agent pull times and rates are +unknown, irregular, and heterogeneous -- and face the same instance of a +K-armed bandit problem. Agents can share reward information to speed up the +learning process at additional communication costs. We propose ODC, an +on-demand communication protocol that tailors the communication of each pair of +agents based on their empirical pull times. ODC is efficient when the pull +times of agents are highly heterogeneous, and its communication complexity +depends on the empirical pull times of agents. ODC is a generic protocol that +can be integrated into most cooperative bandit algorithms without degrading +their performance. We then incorporate ODC into the natural extensions of UCB +and AAE algorithms and propose two communication-efficient cooperative +algorithms. Our analysis shows that both algorithms are near-optimal in regret. + +
+
+ comment: Accepted by AISTATS 2023 +
+
+
+
+
+ + ♻ ☆ Visual correspondence-based explanations improve AI robustness and + human-AI team accuracy NeurIPS 2022 + + +
+ Explaining artificial intelligence (AI) predictions is increasingly important +and even imperative in many high-stakes applications where humans are the +ultimate decision-makers. In this work, we propose two novel architectures of +self-interpretable image classifiers that first explain, and then predict (as +opposed to post-hoc explanations) by harnessing the visual correspondences +between a query image and exemplars. Our models consistently improve (by 1 to 4 +points) on out-of-distribution (OOD) datasets while performing marginally worse +(by 1 to 2 points) on in-distribution tests than ResNet-50 and a $k$-nearest +neighbor classifier (kNN). Via a large-scale, human study on ImageNet and CUB, +our correspondence-based explanations are found to be more useful to users than +kNN explanations. Our explanations help users more accurately reject AI's wrong +decisions than all other tested methods. Interestingly, for the first time, we +show that it is possible to achieve complementary human-AI team accuracy (i.e., +that is higher than either AI-alone or human-alone), in ImageNet and CUB image +classification tasks. + +
+
+ comment: NeurIPS 2022 conference paper +
+
+
+
+
+ + ♻ ☆ CongNaMul: A Dataset for Advanced Image Processing of Soybean Sprouts + + +
+ We present 'CongNaMul', a comprehensive dataset designed for various tasks in +soybean sprouts image analysis. The CongNaMul dataset is curated to facilitate +tasks such as image classification, semantic segmentation, decomposition, and +measurement of length and weight. The classification task provides four classes +to determine the quality of soybean sprouts: normal, broken, spotted, and +broken and spotted, for the development of AI-aided automatic quality +inspection technology. For semantic segmentation, images with varying +complexity, from single sprout images to images with multiple sprouts, along +with human-labelled mask images, are included. The label has 4 different +classes: background, head, body, tail. The dataset also provides images and +masks for the image decomposition task, including two separate sprout images +and their combined form. Lastly, 5 physical features of sprouts (head length, +body length, body thickness, tail length, weight) are provided for image-based +measurement tasks. This dataset is expected to be a valuable resource for a +wide range of research and applications in the advanced analysis of images of +soybean sprouts. Also, we hope that this dataset can assist researchers +studying classification, semantic segmentation, decomposition, and physical +feature measurement in other industrial fields, in evaluating their models. The +dataset is available at the authors' repository. (https://bhban.kr/data) + +
+
+ comment: Accepted to International Conference on ICT Convergence 2023 +
+
+
+
+
+ + ♻ ☆ pTSE: A Multi-model Ensemble Method for Probabilistic Time Series + Forecasting IJCAI 2023 + + +
+ Various probabilistic time series forecasting models have sprung up and shown
+remarkably good performance. However, the choice of model depends heavily on
+the characteristics of the input time series and the fixed distribution that
+the model is based on. Because probability distributions cannot be averaged
+over different models straightforwardly, the current time series model
+ensemble methods cannot be directly applied to improve the robustness and
+accuracy of forecasting. To address this issue, we propose pTSE, a multi-model
+distribution ensemble method for probabilistic forecasting based on the Hidden
+Markov Model (HMM). pTSE only takes off-the-shelf outputs from member models
+without requiring further information about each model. Besides, we provide a
+complete theoretical analysis of pTSE to prove that the empirical distribution
+of time series subject to an HMM will converge to the stationary distribution
+almost surely. Experiments on benchmarks show the superiority of pTSE over all
+member models and competitive ensemble methods.
+
+
+ comment: The 32nd International Joint Conference on Artificial Intelligence + (IJCAI 2023) +
+
+
+
+
+ + ♻ ☆ Stochastic Configuration Machines for Industrial Artificial Intelligence + + +
+ Real-time predictive modelling with desired accuracy is in high demand in
+industrial artificial intelligence (IAI), where neural networks play a key
+role. Neural networks in IAI require powerful, high-performance computing
+devices to operate on large amounts of floating-point data. Based on stochastic
+configuration networks (SCNs), this paper proposes a new randomized learner
+model, termed stochastic configuration machines (SCMs), which emphasizes
+effective modelling and data-size saving, both useful and valuable for
+industrial applications. Compared to SCNs and random vector functional-link
+(RVFL) nets with binarized implementation, the model storage of SCMs can be
+significantly compressed while retaining favourable prediction performance.
+Besides the architecture of the SCM learner model and its learning algorithm,
+as an important part of this contribution, we also provide a theoretical basis
+for the learning capacity of SCMs by analysing the model's complexity.
+Experimental studies are carried out on several benchmark datasets and three
+industrial applications. The results demonstrate that SCMs have great potential
+for dealing with industrial data analytics.
+
+
+ comment: 23 pages, 7 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ MGNN: Graph Neural Networks Inspired by Distance Geometry Problem KDD 2023 + + +
+ Graph Neural Networks (GNNs) have emerged as a prominent research topic in +the field of machine learning. Existing GNN models are commonly categorized +into two types: spectral GNNs, which are designed based on polynomial graph +filters, and spatial GNNs, which utilize a message-passing scheme as the +foundation of the model. For the expressive power and universality of spectral +GNNs, a natural approach is to improve the design of basis functions for better +approximation ability. As for spatial GNNs, models like Graph Isomorphism +Networks (GIN) analyze their expressive power based on Graph Isomorphism Tests. +Recently, there have been attempts to establish connections between spatial +GNNs and geometric concepts like curvature and cellular sheaves, as well as +physical phenomena like oscillators. However, despite the recent progress, +there is still a lack of comprehensive analysis regarding the universality of +spatial GNNs from the perspectives of geometry and physics. In this paper, we +propose MetricGNN (MGNN), a spatial GNN model inspired by the +congruent-insensitivity property of classifiers in the classification phase of +GNNs. We demonstrate that a GNN model is universal in the spatial domain if it +can generate embedding matrices that are congruent to any given embedding +matrix. This property is closely related to the Distance Geometry Problem +(DGP). Since DGP is an NP-Hard combinatorial optimization problem, we propose +optimizing an energy function derived from spring networks and the +Multi-Dimensional Scaling (MDS) problem. This approach also allows our model to +handle both homophilic and heterophilic graphs. Finally, we propose employing +the iteration method to optimize our energy function. We extensively evaluate +the effectiveness of our model through experiments conducted on both synthetic +and real-world datasets. Our code is available at: +https://github.com/GuanyuCui/MGNN. + +
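+ A minimal sketch of the spring-network/MDS-style stress energy and its
+iterative minimisation (the exact energy derived in MGNN may differ; this is
+only the standard MDS stress as an illustration):
+
+import numpy as np
+
+def stress_energy(X, D, W):
+    """Spring/MDS stress: 0.5 * sum_ij W_ij (||x_i - x_j|| - D_ij)^2."""
+    dist = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
+    return 0.5 * (W * (dist - D) ** 2).sum()
+
+def optimize_embeddings(D, W, dim=2, lr=0.01, iters=500, seed=0):
+    """Iteratively minimise the stress by gradient descent (D, W symmetric with
+    zero diagonal); a simplified stand-in for MGNN's iterative optimisation."""
+    n = D.shape[0]
+    X = np.random.default_rng(seed).normal(size=(n, dim))
+    for _ in range(iters):
+        diff = X[:, None, :] - X[None, :, :]             # (n, n, dim)
+        dist = np.linalg.norm(diff, axis=-1) + 1e-9
+        coef = W * (dist - D) / dist                      # (n, n)
+        grad = (coef[:, :, None] * diff).sum(axis=1)      # d stress / d X
+        X -= lr * grad
+    return X
+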
+
+ comment: Accepted by KDD 2023 +
+
+
+
+
+ + ♻ ☆ Fair Attribute Completion on Graph with Missing Attributes + + +
+ Tackling unfairness in graph learning models is a challenging task, as the
+unfairness issues on graphs involve both attributes and topological structures.
+Existing work on fair graph learning simply assumes that attributes of all
+nodes are available for model training and then makes fair predictions. In
+practice, however, the attributes of some nodes might not be accessible due to
+missing data or privacy concerns, which makes fair graph learning even more
+challenging. In this paper, we propose FairAC, a fair attribute completion
+method, to complete missing information and learn fair node embeddings for
+graphs with missing attributes. FairAC adopts an attention mechanism to deal
+with the missing-attribute problem and, meanwhile, mitigates two types of
+unfairness, i.e., feature unfairness from attributes and topological unfairness
+due to attribute completion. FairAC can work on various types of homogeneous
+graphs and generate fair embeddings for them and thus can be applied to most
+downstream tasks to improve their fairness performance. To the best of our
+knowledge, FairAC is the first method that jointly addresses the graph
+attribute completion and graph unfairness problems. Experimental results on
+benchmark datasets show that our method achieves better fairness performance
+with less sacrifice in accuracy, compared with the state-of-the-art methods of
+fair graph learning. Code is available at: https://github.com/donglgcn/FairAC.
+
+
+
+
+
+ + ♻ ☆ Backpropagation through Back Substitution with a Backslash + + +
+ We present a linear algebra formulation of backpropagation which allows the +calculation of gradients by using a generically written ``backslash'' or +Gaussian elimination on triangular systems of equations. Generally, the matrix +elements are operators. This paper has three contributions: (i) it is of +intellectual value to replace traditional treatments of automatic +differentiation with a (left acting) operator theoretic, graph-based approach; +(ii) operators can be readily placed in matrices in software in programming +languages such as Julia as an implementation option; (iii) we introduce a novel +notation, ``transpose dot'' operator ``$\{\}^{T_\bullet}$'' that allows for the +reversal of operators. + We further demonstrate the elegance of the operators approach in a suitable +programming language consisting of generic linear algebra operators such as +Julia \cite{bezanson2017julia}, and that it is possible to realize this +abstraction in code. Our implementation shows how generic linear algebra can +allow operators as elements of matrices. In contrast to ``operator +overloading,'' where backslash would normally have to be rewritten to take +advantage of operators, with ``generic programming'' there is no such need. + +
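+ For a purely linear chain the back-substitution view is easy to reproduce in a
+few lines; the numpy/scipy sketch below uses explicit Jacobian matrices rather
+than the operator-valued matrices (and Julia backslash) of the paper:
+
+import numpy as np
+from scipy.linalg import solve_triangular
+
+rng = np.random.default_rng(0)
+W2, W3 = rng.normal(size=(3, 4)), rng.normal(size=(2, 3))
+c = rng.normal(size=2)    # chain: x2 = W2 @ x1, x3 = W3 @ x2, loss = c @ x3
+
+# The adjoints g_k = d loss / d x_k satisfy a block upper-triangular system
+#   [ I  -W2^T   0   ] [g1]   [0]
+#   [ 0    I   -W3^T ] [g2] = [0]
+#   [ 0    0     I   ] [g3]   [c]
+# which "backslash" / solve_triangular resolves by back substitution.
+Z = np.zeros
+A = np.block([
+    [np.eye(4), -W2.T,     Z((4, 2))],
+    [Z((3, 4)), np.eye(3), -W3.T    ],
+    [Z((2, 4)), Z((2, 3)), np.eye(2)],
+])
+rhs = np.concatenate([np.zeros(4), np.zeros(3), c])
+g1 = solve_triangular(A, rhs)[:4]     # upper triangular => back substitution
+
+g1_loop = W2.T @ (W3.T @ c)           # the usual reverse-mode backward loop
+print(np.allclose(g1, g1_loop))       # True
+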
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Neuronal diversity can improve machine learning for physics and beyond + + +
+ Diversity conveys advantages in nature, yet homogeneous neurons typically +comprise the layers of artificial neural networks. Here we construct neural +networks from neurons that learn their own activation functions, quickly +diversify, and subsequently outperform their homogeneous counterparts on image +classification and nonlinear regression tasks. Sub-networks instantiate the +neurons, which meta-learn especially efficient sets of nonlinear responses. +Examples include conventional neural networks classifying digits and +forecasting a van der Pol oscillator and physics-informed Hamiltonian neural +networks learning H\'enon-Heiles stellar orbits and the swing of a video +recorded pendulum clock. Such \textit{learned diversity} provides examples of +dynamical systems selecting diversity over uniformity and elucidates the role +of diversity in natural and artificial systems. + +
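+ A tiny PyTorch sketch of a layer whose activation is itself a learnable
+sub-network (one shared activation per layer here for brevity; the paper
+meta-learns the nonlinear responses per neuron):
+
+import torch
+import torch.nn as nn
+
+class LearnedActivation(nn.Module):
+    """Activation implemented by a small sub-network trained jointly with the
+    rest of the model, so the nonlinearity itself can adapt and diversify."""
+
+    def __init__(self, hidden=8):
+        super().__init__()
+        self.f = nn.Sequential(nn.Linear(1, hidden), nn.Tanh(), nn.Linear(hidden, 1))
+
+    def forward(self, x):
+        return self.f(x.reshape(-1, 1)).reshape(x.shape)
+
+# Drop-in replacement for a fixed nonlinearity:
+model = nn.Sequential(
+    nn.Linear(2, 16), LearnedActivation(),
+    nn.Linear(16, 16), LearnedActivation(),
+    nn.Linear(16, 1),
+)
+print(model(torch.randn(5, 2)).shape)   # torch.Size([5, 1])
+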
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Learning Optimal Strategies for Temporal Tasks in Stochastic Games + + +
+ Synthesis from linear temporal logic (LTL) specifications provides assured
+controllers for systems operating in stochastic and potentially adversarial
+environments. Automatic synthesis tools, however, require a model of the
+environment to construct controllers. In this work, we introduce a model-free
+reinforcement learning (RL) approach to derive controllers from given LTL
+specifications even when the environment is completely unknown. We model the
+problem as a stochastic game (SG) between the controller and the adversarial
+environment; we then learn optimal control strategies that maximize the
+probability of satisfying the LTL specifications against the worst-case
+environment behavior. We first construct a product game using the deterministic
+parity automaton (DPA) translated from the given LTL specification. By deriving
+distinct rewards and discount factors from the acceptance condition of the DPA,
+we reduce the maximization of the worst-case probability of satisfying the LTL
+specification to the maximization of a discounted reward objective in the
+product game; this enables the use of model-free RL algorithms to learn an
+optimal controller strategy. To deal with the common scalability problems when
+the number of sets defining the acceptance condition of the DPA (usually
+referred to as colors) is large, we propose a lazy color generation method
+where distinct rewards and discount factors are utilized only when needed, and
+an approximate method where the controller eventually focuses on only one
+color. In several case studies, we show that our approach is scalable to a wide
+range of LTL formulas, significantly outperforming existing methods for
+learning controllers from LTL specifications in SGs.
+
+
+
+
+
+ + ♻ ☆ Feature Extractor Stacking for Cross-domain Few-shot Meta-learning + + +
+ Cross-domain few-shot meta-learning (CDFSML) addresses learning problems +where knowledge needs to be transferred from several source domains into an +instance-scarce target domain with an explicitly different distribution. +Recently published CDFSML methods generally construct a universal model that +combines knowledge of multiple source domains into one backbone feature +extractor. This enables efficient inference but necessitates re-computation of +the backbone whenever a new source domain is added. Some of these methods are +also incompatible with heterogeneous source domain backbone architectures. We +propose feature extractor stacking (FES), a new CDFSML method for combining +information from a collection of backbones, which can utilise heterogeneous +pretrained backbones out of the box, and does not maintain a universal model +that needs to be re-computed when its backbone collection is updated. We +present the basic FES algorithm, which is inspired by the classic stacking +approach to meta-learning, and also introduce two variants: convolutional FES +(ConFES) and regularised FES (ReFES). Given a target-domain task, these +algorithms fine-tune each backbone independently, use cross-validation to +extract meta training data from the support set, and learn a simple linear +meta-classifier from this data. We evaluate our FES methods on the well-known +Meta-Dataset benchmark, targeting image classification with convolutional +neural networks, and show that they can achieve state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ Model Stitching: Looking For Functional Similarity Between + Representations + + +
+ Model stitching (Lenc & Vedaldi 2015) is a compelling methodology to compare +different neural network representations, because it allows us to measure to +what degree they may be interchanged. We expand on a previous work from Bansal, +Nakkiran & Barak which used model stitching to compare representations of the +same shapes learned by differently seeded and/or trained neural networks of the +same architecture. Our contribution enables us to compare the representations +learned by layers with different shapes from neural networks with different +architectures. We subsequently reveal unexpected behavior of model stitching. +Namely, we find that stitching, based on convolutions, for small ResNets, can +reach high accuracy if those layers come later in the first (sender) network +than in the second (receiver), even if those layers are far apart. + +
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ VDM++: Variational Diffusion Models for High-Quality Synthesis + + +
+ To achieve the highest perceptual quality, state-of-the-art diffusion models +are optimized with objectives that typically look very different from the +maximum likelihood and the Evidence Lower Bound (ELBO) objectives. In this +work, we reveal that diffusion model objectives are actually closely related to +the ELBO. + Specifically, we show that all commonly used diffusion model objectives +equate to a weighted integral of ELBOs over different noise levels, where the +weighting depends on the specific objective used. Under the condition of +monotonic weighting, the connection is even closer: the diffusion objective +then equals the ELBO, combined with simple data augmentation, namely Gaussian +noise perturbation. We show that this condition holds for a number of +state-of-the-art diffusion models. + In experiments, we explore new monotonic weightings and demonstrate their +effectiveness, achieving state-of-the-art FID scores on the high-resolution +ImageNet benchmark. + +
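+ The weighted-integral view can be written as a training loss directly; the
+schematic sketch below uses a toy cosine noise schedule and a sigmoid-shaped
+monotonic weighting, both of which are assumptions for illustration rather than
+the paper's formulation:
+
+import math
+import torch
+
+def weighted_diffusion_loss(denoiser, x0, weighting):
+    """Monte-Carlo estimate of a weighted integral of per-noise-level terms:
+    E_t[ w(t) * ||eps - eps_hat(x_t, t)||^2 ] with t ~ U(0, 1)."""
+    t = torch.rand(x0.shape[0], device=x0.device)        # one noise level per sample
+    alpha = torch.cos(t * math.pi / 2)[:, None]          # toy noise schedule
+    sigma = torch.sin(t * math.pi / 2)[:, None]
+    eps = torch.randn_like(x0)
+    x_t = alpha * x0 + sigma * eps
+    per_level = ((eps - denoiser(x_t, t)) ** 2).mean(dim=-1)
+    return (weighting(t) * per_level).mean()
+
+# A monotonic (non-increasing in t) weighting, e.g. sigmoid-shaped:
+monotonic_w = lambda t: torch.sigmoid(4.0 * (0.5 - t))
+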
+
+
+
+
+ + ♻ ☆ Human Comprehensible Active Learning of Genome-Scale Metabolic Networks AAAI + + +
+ An important application of Synthetic Biology is the engineering of the host +cell system to yield useful products. However, an increase in the scale of the +host system leads to huge design space and requires a large number of +validation trials with high experimental costs. A comprehensible machine +learning approach that efficiently explores the hypothesis space and guides +experimental design is urgently needed for the Design-Build-Test-Learn (DBTL) +cycle of the host cell system. We introduce a novel machine learning framework +ILP-iML1515 based on Inductive Logic Programming (ILP) that performs abductive +logical reasoning and actively learns from training examples. In contrast to +numerical models, ILP-iML1515 is built on comprehensible logical +representations of a genome-scale metabolic model and can update the model by +learning new logical structures from auxotrophic mutant trials. The ILP-iML1515 +framework 1) allows high-throughput simulations and 2) actively selects +experiments that reduce the experimental cost of learning gene functions in +comparison to randomly selected experiments. + +
+
+ comment: Invited presentation for AAAI Spring Symposium Series 2023 on + Computational Scientific Discovery +
+
+
+
+
+ + ♻ ☆ On Reward Structures of Markov Decision Processes + + +
+ A Markov decision process can be parameterized by a transition kernel and a
+reward function. Both play essential roles in the study of reinforcement
+learning as evidenced by their presence in the Bellman equations. In our
+inquiry into various kinds of "costs" associated with reinforcement learning,
+inspired by the demands of robotic applications, rewards are central to
+understanding the structure of a Markov decision process, and reward-centric
+notions can elucidate important concepts in reinforcement learning.
+ Specifically, we study the sample complexity of policy evaluation and develop
+a novel estimator with an instance-specific error bound of
+$\tilde{O}(\sqrt{\frac{\tau_s}{n}})$ for estimating a single state value. Under
+the online regret minimization setting, we refine the transition-based MDP
+constant, diameter, into a reward-based constant, maximum expected hitting
+cost, and with it, provide a theoretical explanation for how a well-known
+technique, potential-based reward shaping, could accelerate learning with
+expert knowledge. In an attempt to study safe reinforcement learning, we model
+hazardous environments with irrecoverability and propose a quantitative notion
+of safe learning via reset efficiency. In this setting, we modify a classic
+algorithm to account for resets, achieving promising preliminary numerical
+results. Lastly, for MDPs with multiple reward functions, we develop a planning
+algorithm that finds Pareto-optimal stochastic policies in a computationally
+efficient manner.
+
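+ Potential-based reward shaping, one of the techniques discussed, has the
+standard form $F(s, a, s') = \gamma\,\phi(s') - \phi(s)$ (Ng et al., 1999); a
+minimal sketch with a hypothetical grid-world potential:
+
+def shaped_reward(reward, phi, s, s_next, gamma=0.99):
+    """Potential-based reward shaping: r' = r + gamma * phi(s') - phi(s).
+    The shaping term preserves the optimal policy while injecting expert
+    knowledge through the potential function phi."""
+    return reward + gamma * phi(s_next) - phi(s)
+
+# Example: in a grid world, use negative Manhattan distance to the goal as the potential.
+goal = (4, 4)
+phi = lambda s: -(abs(s[0] - goal[0]) + abs(s[1] - goal[1]))
+print(shaped_reward(0.0, phi, s=(0, 0), s_next=(0, 1)))   # positive bonus for moving closer
+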
+
+ comment: This PhD thesis draws heavily from arXiv:1907.02114 and + arXiv:2002.06299; minor edits +
+
+
+
+
+ + ♻ ☆ Multi-Label Clinical Time-Series Generation via Conditional GAN + + +
+ In recent years, deep learning has been successfully adopted in a wide range +of applications related to electronic health records (EHRs) such as +representation learning and clinical event prediction. However, due to privacy +constraints, limited access to EHR becomes a bottleneck for deep learning +research. To mitigate these concerns, generative adversarial networks (GANs) +have been successfully used for generating EHR data. However, there are still +challenges in high-quality EHR generation, including generating time-series EHR +data and imbalanced uncommon diseases. In this work, we propose a Multi-label +Time-series GAN (MTGAN) to generate EHR and simultaneously improve the quality +of uncommon disease generation. The generator of MTGAN uses a gated recurrent +unit (GRU) with a smooth conditional matrix to generate sequences and uncommon +diseases. The critic gives scores using Wasserstein distance to recognize real +samples from synthetic samples by considering both data and temporal features. +We also propose a training strategy to calculate temporal features for real +data and stabilize GAN training. Furthermore, we design multiple statistical +metrics and prediction tasks to evaluate the generated data. Experimental +results demonstrate the quality of the synthetic data and the effectiveness of +MTGAN in generating realistic sequential EHR data, especially for uncommon +diseases. + +
+
+ comment: \c{opyright}2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ SGD learning on neural networks: leap complexity and saddle-to-saddle + dynamics + + +
+ We investigate the time complexity of SGD learning on fully-connected neural +networks with isotropic data. We put forward a complexity measure -- the leap +-- which measures how "hierarchical" target functions are. For $d$-dimensional +uniform Boolean or isotropic Gaussian data, our main conjecture states that the +time complexity to learn a function $f$ with low-dimensional support is +$\tilde\Theta (d^{\max(\mathrm{Leap}(f),2)})$. We prove a version of this +conjecture for a class of functions on Gaussian isotropic data and 2-layer +neural networks, under additional technical assumptions on how SGD is run. We +show that the training sequentially learns the function support with a +saddle-to-saddle dynamic. Our result departs from [Abbe et al. 2022] by going +beyond leap 1 (merged-staircase functions), and by going beyond the mean-field +and gradient flow approximations that prohibit the full complexity control +obtained here. Finally, we note that this gives an SGD complexity for the full +training trajectory that matches that of Correlational Statistical Query (CSQ) +lower-bounds. + +
+
+
+
+
+ + ♻ ☆ Fair admission risk prediction with proportional multicalibration + + +
+ Fair calibration is a widely desirable fairness criterion in risk prediction
+contexts. One way to measure and achieve fair calibration is with
+multicalibration. Multicalibration constrains calibration error among
+flexibly-defined subpopulations while maintaining overall calibration. However,
+multicalibrated models can exhibit a higher percent calibration error among
+groups with lower base rates than groups with higher base rates. As a result,
+it is possible for a decision-maker to learn to trust or distrust model
+predictions for specific groups. To alleviate this, we propose
+\emph{proportional multicalibration}, a criterion that constrains the percent
+calibration error among groups and within prediction bins. We prove that
+satisfying proportional multicalibration bounds a model's multicalibration as
+well as its \emph{differential calibration}, a fairness criterion that directly
+measures how closely a model approximates sufficiency. Therefore,
+proportionally calibrated models limit the ability of decision makers to
+distinguish between model performance on different patient groups, which may
+make the models more trustworthy in practice. We provide an efficient algorithm
+for post-processing risk prediction models for proportional multicalibration
+and evaluate it empirically. We conduct simulation studies and investigate a
+real-world application of PMC-postprocessing to the prediction of emergency
+department patient admissions. We observe that proportional multicalibration is
+a promising criterion for controlling simultaneous measures of calibration
+fairness of a model over intersectional groups with virtually no cost in terms
+of classification performance.
+
+
+ comment: Published in the 2023 Conference on Health, Inference, and Learning + (CHIL). Best paper award +
+
+
+
+
+ + ♻ ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Model sparsification in deep learning promotes simpler, more interpretable +models with fewer parameters. This not only reduces the model's memory +footprint and computational needs but also shortens inference time. This work +focuses on creating sparse models optimized for multiple tasks with fewer +parameters. These parsimonious models also possess the potential to match or +outperform dense models in terms of performance. In this work, we introduce +channel-wise l1/l2 group sparsity in the shared convolutional layers parameters +(or weights) of the multi-task learning model. This approach facilitates the +removal of extraneous groups i.e., channels (due to l1 regularization) and also +imposes a penalty on the weights, further enhancing the learning efficiency for +all tasks (due to l2 regularization). We analyzed the results of group sparsity +in both single-task and multi-task settings on two widely-used Multi-Task +Learning (MTL) datasets: NYU-v2 and CelebAMask-HQ. On both datasets, which +consist of three different computer vision tasks each, multi-task models with +approximately 70% sparsity outperform their dense equivalents. We also +investigate how changing the degree of sparsification influences the model's +performance, the overall sparsity percentage, the patterns of sparsity, and the +inference time. + +
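+ A short PyTorch sketch of the channel-wise l1/l2 (group-lasso-style) penalty on
+shared convolutional weights (the per-layer weighting and coefficients are
+assumptions for illustration):
+
+import torch
+import torch.nn as nn
+
+def group_sparsity_penalty(conv_layers, l1_lambda=1e-4, l2_lambda=1e-5):
+    """Channel-wise group sparsity on shared conv weights: an l1 norm over
+    per-output-channel l2 norms (drives whole channels to zero) plus a plain
+    l2 term on all weights."""
+    penalty = 0.0
+    for layer in conv_layers:
+        w = layer.weight                                     # (out_ch, in_ch, kh, kw)
+        channel_norms = w.flatten(start_dim=1).norm(dim=1)   # l2 norm per output channel
+        penalty = penalty + l1_lambda * channel_norms.sum() + l2_lambda * (w ** 2).sum()
+    return penalty
+
+shared = [nn.Conv2d(3, 16, 3), nn.Conv2d(16, 32, 3)]
+# total_loss = sum(task_losses) + group_sparsity_penalty(shared)
+print(group_sparsity_penalty(shared))
+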
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ The Rich Get Richer: Disparate Impact of Semi-Supervised Learning ICLR 2022 + + +
+ Semi-supervised learning (SSL) has demonstrated its potential to improve the +model accuracy for a variety of learning tasks when the high-quality supervised +data is severely limited. Although it is often established that the average +accuracy for the entire population of data is improved, it is unclear how SSL +fares with different sub-populations. Understanding the above question has +substantial fairness implications when different sub-populations are defined by +the demographic groups that we aim to treat fairly. In this paper, we reveal +the disparate impacts of deploying SSL: the sub-population who has a higher +baseline accuracy without using SSL (the "rich" one) tends to benefit more from +SSL; while the sub-population who suffers from a low baseline accuracy (the +"poor" one) might even observe a performance drop after adding the SSL module. +We theoretically and empirically establish the above observation for a broad +family of SSL algorithms, which either explicitly or implicitly use an +auxiliary "pseudo-label". Experiments on a set of image and text classification +tasks confirm our claims. We introduce a new metric, Benefit Ratio, and promote +the evaluation of the fairness of SSL (Equalized Benefit Ratio). We further +discuss how the disparate impact can be mitigated. We hope our paper will alarm +the potential pitfall of using SSL and encourage a multifaceted evaluation of +future SSL algorithms. + +
+
+ comment: Published as a conference paper at ICLR 2022. Revised constants + Theorems 1,2, and Lemma 3 (consider the union bound). Add acknowledgments to + Nautilus +
+
+
+
+
+ + ♻ ☆ Best Practices for Noise-Based Augmentation to Improve the Performance + of Deployable Speech-Based Emotion Recognition Systems + + +
+ Speech emotion recognition is an important component of any human-centered
+system. But speech characteristics produced and perceived by a person can be
+influenced by a multitude of factors, both desirable, such as emotion, and
+undesirable, such as noise. To train robust emotion recognition models, we need
+a large, yet realistic data distribution, but emotion datasets are often small
+and hence are augmented with noise. Noise augmentation often makes one
+important assumption: that the prediction label should remain the same in the
+presence or absence of noise, which is true for automatic speech recognition
+but not necessarily true for perception-based tasks. In this paper we make
+three novel contributions. We validate through crowdsourcing that the presence
+of noise does change the annotation label and hence may alter the original
+ground truth label. We then show how disregarding this knowledge and assuming
+consistency in ground truth labels propagates to downstream evaluation of ML
+models, both for performance evaluation and robustness testing. We end the
+paper with a set of recommendations for noise augmentation in speech emotion
+recognition datasets.
+
+
+
+
+
+ + ♻ ☆ Efficient Benchmarking (of Language Models) + + +
+ The increasing versatility of language models (LMs) has given rise to a new
+class of benchmarks that comprehensively assess a broad range of capabilities.
+Such benchmarks are associated with massive computational costs, reaching
+thousands of GPU hours per model. However, the efficiency aspect of these
+evaluation efforts has received little discussion in the literature. In this
+work we present the problem of Efficient Benchmarking, namely intelligently
+reducing the computation costs of LM evaluation without compromising
+reliability. Using the HELM benchmark as a test case, we investigate how
+different benchmark design choices affect the computation-reliability tradeoff.
+We propose to evaluate the reliability of such decisions using a new measure,
+Decision Impact on Reliability (DIoR for short). We find, for example, that the
+current leader on HELM may change by merely removing a low-ranked model from
+the benchmark, and observe that a handful of examples suffice to obtain the
+correct benchmark ranking. Conversely, a slightly different choice of HELM
+scenarios varies the ranking widely. Based on our findings, we outline a set of
+concrete recommendations for more efficient benchmark design and utilization
+practices, leading to dramatic cost savings with minimal loss of benchmark
+reliability, often reducing computation by x100 or more.
+
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Terrain Diffusion Network: Climatic-Aware Terrain Generation with + Geological Sketch Guidance + + +
+ Sketch-based terrain generation seeks to create realistic landscapes for
+virtual environments in various applications such as computer games, animation
+and virtual reality. Recently, deep learning based terrain generation has
+emerged, notably methods based on generative adversarial networks (GAN).
+However, these methods often struggle to fulfill the requirements of flexible
+user control and maintain generative diversity for realistic terrain.
+Therefore, we propose a novel diffusion-based method, namely the terrain
+diffusion network (TDN), which actively incorporates user guidance for enhanced
+controllability, taking into account terrain features like rivers, ridges,
+basins, and peaks. Instead of adhering to a conventional monolithic denoising
+process, which often compromises the fidelity of terrain details or the
+alignment with user control, a multi-level denoising scheme is proposed to
+generate more realistic terrains by taking into account fine-grained details,
+particularly those related to climatic patterns influenced by erosion and
+tectonic activities. Specifically, three terrain synthesisers are designed for
+structural, intermediate, and fine-grained level denoising purposes, which
+allows each synthesiser to concentrate on a distinct terrain aspect. Moreover,
+to maximise the efficiency of our TDN, we further introduce terrain and sketch
+latent spaces for the synthesisers with pre-trained terrain autoencoders.
+Comprehensive experiments on a new dataset constructed from NASA Topology
+Images clearly demonstrate the effectiveness of our proposed method, achieving
+state-of-the-art performance. Our code and dataset will be publicly available.
+
+
+
+
+
+ + ☆ End-Edge Coordinated Joint Encoding and Neural Enhancement for Low-Light + Video Analytics + + +
+ In this paper, we investigate video analytics in low-light environments, and
+propose an end-edge coordinated system with joint video encoding and
+enhancement. It adaptively transmits low-light videos from cameras and performs
+enhancement and inference tasks at the edge. Firstly, according to our
+observations, both encoding and enhancement for low-light videos have a
+significant impact on inference accuracy, which directly influences bandwidth
+and computation overhead. Secondly, due to the limitation of built-in
+computation resources, cameras perform encoding and transmit frames to the
+edge. The edge executes neural enhancement to process low contrast, detail
+loss, and color distortion on low-light videos before inference. Finally, an
+adaptive controller is designed at the edge to select quantization parameters
+and scales of neural enhancement networks, aiming to improve the inference
+accuracy and meet the latency requirements. Extensive real-world experiments
+demonstrate that the proposed system can achieve a better trade-off between
+communication and computation resources and optimize the inference accuracy.
+
+
+
+
+
+ + ☆ Edge-Assisted Lightweight Region-of-Interest Extraction and Transmission + for Vehicle Perception + + +
+ To enhance on-road environmental perception for autonomous driving, accurate
+and real-time analytics on high-resolution video frames generated from on-board
+cameras becomes crucial. In this paper, we design a lightweight object
+location method based on class activation mapping (CAM) to rapidly capture the
+region of interest (RoI) boxes that contain driving safety related objects from
+on-board cameras, which can not only improve the inference accuracy of vision
+tasks, but also reduce the amount of transmitted data. Considering the limited
+on-board computation resources, the RoI boxes extracted from the raw image are
+offloaded to the edge for further processing. Considering both the dynamics of
+vehicle-to-edge communications and the limited edge resources, we propose an
+adaptive RoI box offloading algorithm to ensure prompt and accurate inference
+by adjusting the down-sampling rate of each box. Extensive experimental results
+on four high-resolution video streams demonstrate that our approach can
+effectively improve the overall accuracy by up to 16% and reduce the
+transmission demand by up to 49%, compared with other benchmarks.
+
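+ A minimal sketch of CAM-based RoI extraction (the normalisation, threshold
+value, and single coarse box are simplifications for illustration, not the
+paper's exact procedure):
+
+import numpy as np
+
+def cam_roi_box(feature_maps, class_weights, threshold=0.5):
+    """Class activation mapping followed by thresholding to get an RoI box.
+
+    feature_maps : (C, H, W) activations of the last conv layer
+    class_weights: (C,)      classifier weights for the class of interest
+    Returns (x_min, y_min, x_max, y_max) around the activated region, or None.
+    """
+    cam = np.tensordot(class_weights, feature_maps, axes=1)     # (H, W)
+    cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-9)    # normalise to [0, 1]
+    ys, xs = np.nonzero(cam >= threshold)
+    if len(xs) == 0:
+        return None
+    return xs.min(), ys.min(), xs.max(), ys.max()
+
+fmap = np.random.default_rng(0).random((8, 14, 14))
+w = np.random.default_rng(1).random(8)
+print(cam_roi_box(fmap, w, threshold=0.8))
+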
+
+
+
+
+ + ☆ Edge-Assisted On-Device Model Update for Video Analytics in Adverse + Environments + + +
+ While large deep neural networks excel at general video analytics tasks, the +significant demand on computing capacity makes them infeasible for real-time +inference on resource-constrained end cameras. In this paper, we propose an +edge-assisted framework that continuously updates the lightweight model +deployed on the end cameras to achieve accurate predictions in adverse +environments. This framework consists of three modules, namely, a key frame +extractor, a trigger controller, and a retraining manager. The low-cost key +frame extractor obtains frames that can best represent the current environment. +Those frames are then transmitted and buffered as the retraining data for model +update at the edge server. Once the trigger controller detects a significant +accuracy drop in the selected frames, the retraining manager outputs the +optimal retraining configuration balancing the accuracy and time cost. We +prototype our system on two end devices of different computing capacities with +one edge server. The results demonstrate that our approach significantly +improves accuracy across all tested adverse environment scenarios (up to 24%) +and reduces the retraining time by more than 50% compared to existing +benchmarks. + +
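A simplified sketch of trigger logic of the kind described above: monitor a proxy accuracy on selected key frames and request retraining when it drops noticeably relative to a running baseline. The window size and drop threshold are assumptions, not the paper's values.

from collections import deque

class RetrainTrigger:
    def __init__(self, window=30, drop_threshold=0.1):
        self.history = deque(maxlen=window)   # recent per-frame accuracies
        self.drop_threshold = drop_threshold
        self.baseline = None

    def update(self, accuracy):
        """Return True when a retraining round should be triggered."""
        self.history.append(accuracy)
        current = sum(self.history) / len(self.history)
        if self.baseline is None:
            self.baseline = current
            return False
        if self.baseline - current > self.drop_threshold:
            self.baseline = current           # reset the baseline after triggering
            return True
        self.baseline = max(self.baseline, current)
        return False

trigger = RetrainTrigger()
needs_update = any(trigger.update(a) for a in [0.90, 0.88, 0.60, 0.55])  # True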
+
+
+
+
+ + ☆ Separate and Locate: Rethink the Text in Text-based Visual Question + Answering ACM MM 2023 + + +
+ Text-based Visual Question Answering (TextVQA) aims at answering questions +about the text in images. Most works in this field focus on designing network +structures or pre-training tasks. All these methods list the OCR texts in +reading order (from left to right and top to bottom) to form a sequence, which +is treated as a natural language ``sentence''. However, they ignore the fact +that most OCR words in the TextVQA task do not have a semantic contextual +relationship. In addition, these approaches use 1-D position embedding to +construct the spatial relation between OCR tokens sequentially, which is +inadequate. The 1-D position embedding can only represent the left-right +sequence relationship between words in a sentence, not their complex spatial +position relationship. To tackle these problems, we propose a novel method +named Separate and Locate (SaL) that explores text contextual cues and designs +spatial position embedding to construct spatial relations between OCR texts. +Specifically, we propose a Text Semantic Separate (TSS) module that helps the +model recognize whether words have semantic contextual relations. Then, we +introduce a Spatial Circle Position (SCP) module that helps the model better +construct and reason about the spatial position relationships between OCR texts. +Our SaL model outperforms the baseline model by 4.44% and 3.96% accuracy on the +TextVQA and ST-VQA datasets. Compared with the state-of-the-art method +pre-trained on 64 million samples, our method, without any +pre-training tasks, still achieves 2.68% and 2.52% accuracy improvements on +TextVQA and ST-VQA. Our code and models will be released at +https://github.com/fangbufang/SaL. + +
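For intuition only: one generic way to encode 2-D spatial relations between OCR boxes (relative distance and direction between box centres) instead of a 1-D reading order. This is a hypothetical sketch of the general idea, not the paper's SCP module.

import math

def relative_spatial_features(boxes):
    """boxes: list of (x1, y1, x2, y2). Returns an NxN table of (distance, angle) pairs."""
    centres = [((x1 + x2) / 2.0, (y1 + y2) / 2.0) for x1, y1, x2, y2 in boxes]
    feats = []
    for cx, cy in centres:
        row = []
        for ox, oy in centres:
            dx, dy = ox - cx, oy - cy
            row.append((math.hypot(dx, dy), math.atan2(dy, dx)))  # distance, direction
        feats.append(row)
    return feats

feats = relative_spatial_features([(0, 0, 10, 5), (20, 0, 30, 5), (0, 20, 10, 25)])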
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Multi-Modal Discussion Transformer: Integrating Text, Images and Graph + Transformers to Detect Hate Speech on Social Media + + +
+ We present the Multi-Modal Discussion Transformer (mDT), a novel multi-modal +graph-based transformer model for detecting hate speech in online social +networks, such as Reddit discussions. In contrast to traditional comment-only +methods, our approach labels a comment as hate speech based on a holistic +analysis of text and images grounded in the discussion context. This is done by +leveraging graph transformers to capture the contextual relationships in the +entire discussion surrounding a comment, with interwoven fusion +layers that combine each comment's text and image embeddings rather than +processing the modalities separately. We compare the performance of our model to +baselines that only process individual comments and conduct extensive ablation +studies. To evaluate our work, we present a new dataset, HatefulDiscussions, +comprising complete multi-modal discussions from multiple online communities on +Reddit. We conclude by discussing future directions for multimodal solutions that deliver social +value in online contexts, arguing that capturing a holistic view of a +conversation significantly advances the effort to detect anti-social behaviour. + +
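A toy sketch of the general idea only (not the mDT architecture): fuse each comment's text and image embeddings into a node feature, then let nodes exchange information along the reply graph with one step of mean message passing. All shapes and the adjacency are invented for illustration.

import numpy as np

def fuse_and_propagate(text_embs, image_embs, adjacency):
    """text_embs, image_embs: (N, D); adjacency: (N, N) 0/1 reply structure."""
    nodes = np.concatenate([text_embs, image_embs], axis=1)   # simple fusion by concatenation
    adj = adjacency + np.eye(len(adjacency))                  # include self-loops
    adj = adj / adj.sum(axis=1, keepdims=True)                # row-normalise
    return adj @ nodes                                        # each comment averaged with its context

N, D = 4, 8
reply_graph = np.eye(N, k=1) + np.eye(N, k=-1)                # a simple reply chain
contextual = fuse_and_propagate(np.random.rand(N, D), np.random.rand(N, D), reply_graph)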
+
+ comment: Under Submission +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Foundation Models utilizing extensive image-text +paired data have demonstrated unprecedented image-text association +capabilities, achieving remarkable results across various downstream tasks. A +critical challenge is how to make use of existing large-scale pre-trained VLMs, +which are trained on common objects, to perform domain-specific transfer +for domain-related downstream tasks. In this paper, we propose a +new framework that includes the Domain Foundation Model (DFM), bridging the gap +between the General Foundation Model (GFM) and domain-specific downstream +tasks. Moreover, we present an image-text paired dataset in the field of remote +sensing (RS), RS5M, which has 5 million RS images with English descriptions. +The dataset is obtained by filtering publicly available image-text paired +datasets and captioning label-only RS datasets with a pre-trained VLM. The result +constitutes the first large-scale RS image-text paired dataset. Additionally, we +tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the +DFM. Experimental results show that our proposed dataset is highly effective +for various tasks, improving upon the baseline by $8 \% \sim 16 \%$ in +zero-shot classification tasks, and obtaining good results in both +Vision-Language Retrieval and Semantic Localization tasks. +\url{https://github.com/om-ai-lab/RS5M} + +
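A generic sketch of CLIP-style zero-shot classification, the kind of downstream evaluation reported above. The embeddings here are random stand-ins and the class names are hypothetical; in practice the embeddings would come from a (possibly PEFT-adapted) pre-trained VLM.

import numpy as np

def zero_shot_classify(image_emb, text_embs):
    """Cosine similarity between one image embedding and per-class text embeddings."""
    image_emb = image_emb / np.linalg.norm(image_emb)
    text_embs = text_embs / np.linalg.norm(text_embs, axis=1, keepdims=True)
    sims = text_embs @ image_emb
    return int(np.argmax(sims)), sims

classes = ["airport", "farmland", "harbor"]          # hypothetical RS categories
rng = np.random.default_rng(0)
pred, sims = zero_shot_classify(rng.normal(size=512), rng.normal(size=(len(classes), 512)))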
+
+ comment: RS5M dataset v4 +
+
+
+
+
+
+
+ + + + +

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`